diff --git a/prebuilt-sdk/x86_64_linux/VERSION b/prebuilt-sdk/x86_64_linux/VERSION
index d8b37f0..79d5c17 100644
--- a/prebuilt-sdk/x86_64_linux/VERSION
+++ b/prebuilt-sdk/x86_64_linux/VERSION
@@ -1 +1 @@
-6.4.12_CL562241A_D561555_A558512_R558399_T558462_Oeb44e5c
+6.4.14_CL650117A_D650117_A648302_R647402_T648811_O646970
\ No newline at end of file
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h
index 8b93beb..d353960 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h
@@ -501,6 +501,8 @@ enum vx_kernel_e {
     VX_KERNEL_STREAM_PROCESSOR = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x32,
 
+    VX_KERNEL_NN_BATCH_GEMM_RELU_POOLING_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x33,
+
     VX_KERNEL_MAX_1_2, /*!< \internal Used for VX1.2 bounds checking in the conformance test. */
 };
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h
index 6cf283c..f3f0191 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h
@@ -173,7 +173,7 @@ VX_DECONV_3D_API_SUPPORT is used to declare that vsi openvx driver can support d
 1: support
 */
 #ifndef VX_DECONV_3D_API_SUPPORT
-#define VX_DECONV_3D_API_SUPPORT 0
+#define VX_DECONV_3D_API_SUPPORT 1
 #endif
 
 /*
@@ -237,4 +237,26 @@ VX_STREAM_PROCESSOR_SUPPORT is used to declare that vsi openvx driver can suppor
 #define VX_SCALE_EXTRA_PARAMETER_SUPPORT 1
 #endif
 
+/*
+ VX_INVALIDATE_HANDLE_SUPPORT is used to declare that the vxSwapTensorHandle API was refined to follow the KHR OpenVX 1.3 spec: tensors do not maintain the handle internally if new_ptr is NULL.
+ [value]
+ 0: not support
+ 1: support
+*/
+#ifndef VX_INVALIDATE_HANDLE_SUPPORT
+#define VX_INVALIDATE_HANDLE_SUPPORT 1
+#endif
+
+/*
+ VX_ACTIVATION_EXT2_SUPPORT is used to declare that ACTIVATION can support sign, hard_sigmoid, neg, clip, exp, sin, cos,
+ log, mish, gelu, hgelu, elu, selu, celu, rcp, softsign, atan, atanh, acosh, inverse sigmoid, round and erf.
+ [value]
+ 0: not support
+ 1: support
+*/
+#ifndef VX_ACTIVATION_EXT2_SUPPORT
+#define VX_ACTIVATION_EXT2_SUPPORT 1
+#endif
+
+
 #endif /* __VX_KHR_COMPATIBLE_H__ */
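Since downstream code is expected to key off these new feature macros, here is a minimal sketch (not part of the patch) of gating on them at compile time. It assumes an existing vx_tensor; vxInvalidateHandleVSI is declared in vx_khr_nn.h below, and vxSwapTensorHandle is the pre-existing API the comment refers to.

#include <VX/vx_khr_compatible.h>
#include <VX/vx_khr_nn.h>

/* Hedged sketch: hand a tensor's handle back to the caller only when the
 * driver advertises the refined OpenVX 1.3 swap/invalidate semantics. */
static vx_status release_handle(vx_tensor tensor)
{
#if defined(VX_INVALIDATE_HANDLE_SUPPORT) && VX_INVALIDATE_HANDLE_SUPPORT
    /* New behavior: the tensor no longer keeps the handle internally. */
    return vxInvalidateHandleVSI((vx_reference)tensor);
#else
    void *old_ptr = NULL;
    return vxSwapTensorHandle(tensor, NULL, &old_ptr);
#endif
}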
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h
index c329f8c..a43a37e 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h
@@ -219,6 +219,28 @@ enum vx_nn_activation_function_e
     VX_NN_ACTIVATION_HSWISH = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x6,
     VX_NN_ACTIVATION_CUSTOM = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x7,
     VX_NN_ACTIVATION_NONE = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x8,
+    VX_NN_ACTIVATION_SIGN_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x9,
+    VX_NN_ACTIVATION_HSIGMOID_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0xa,
+    VX_NN_ACTIVATION_NEG_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0xb,
+    VX_NN_ACTIVATION_CLIP_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0xc,
+    VX_NN_ACTIVATION_EXP_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0xd,
+    VX_NN_ACTIVATION_SIN_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0xe,
+    VX_NN_ACTIVATION_COS_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0xf,
+    VX_NN_ACTIVATION_LOG_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x10,
+    VX_NN_ACTIVATION_MISH_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x11,
+    VX_NN_ACTIVATION_GELU_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x12,
+    VX_NN_ACTIVATION_HGELU_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x13,
+    VX_NN_ACTIVATION_ELU_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x14,
+    VX_NN_ACTIVATION_SELU_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x15,
+    VX_NN_ACTIVATION_CELU_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x16,
+    VX_NN_ACTIVATION_RECIPROCAL_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x17,
+    VX_NN_ACTIVATION_SOFTSIGN_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x18,
+    VX_NN_ACTIVATION_ATAN_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x19,
+    VX_NN_ACTIVATION_ATANH_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x1a,
+    VX_NN_ACTIVATION_ACOSH_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x1b,
+    VX_NN_ACTIVATION_INVERSE_SIGMOID_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x1c,
+    VX_NN_ACTIVATION_ROUND_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x1d,
+    VX_NN_ACTIVATION_ERF_VSI = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x1e,
 };
 
 /*! \brief The Convolutional network type
@@ -623,6 +645,19 @@ VX_API_ENTRY vx_tensor VX_API_CALL vxCreateTensorFromHandle2(
 * \retval VX_ERROR_INVALID_REFERENCE tensor is not a valid \ref vx_tensor or \ref vx_image reference created from a handle.
 */
 VX_API_ENTRY vx_status VX_API_CALL vxFlushHandle(vx_reference ref);
+/*! \brief Same as vxFlushHandle(); also added by VeriSilicon as an extension API.
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxFlushHandleVSI(vx_reference ref);
+
+#if defined(VX_INVALIDATE_HANDLE_SUPPORT) && VX_INVALIDATE_HANDLE_SUPPORT
+/*! \brief Invalidate the memory referenced by the reference's handle when it is ready.
+* Added by VeriSilicon as an extension API.
+* \param [in] ref The reference (image or tensor) which was created from a handle.
+* \return A \ref vx_status_e enumeration.
+* \retval VX_ERROR_INVALID_REFERENCE ref is not a valid \ref vx_tensor or \ref vx_image reference created from a handle.
+*/
+VX_API_ENTRY vx_status VX_API_CALL vxInvalidateHandleVSI(vx_reference ref);
+#endif
 
 #if VX_VA40_EXT_SUPPORT
 /*! \brief Return a new tensor referencing the same memory location but with different shape.
@@ -776,6 +811,14 @@ typedef struct _vx_nn_convolution_params_ext2_t
     vx_int32 depth_multiplier; /*!< \brief depthwise multiplier value, if 0, means convolution, otherwise (>=1), the convolution is depthwise convolution. */
 } vx_nn_convolution_params_ext2_t;
 
+
+typedef struct _vx_nn_convolution_params_ext3_t
+{
+    vx_nn_convolution_params_ext2_t ext2; /*!< \brief Convolution extension structure head */
+
+    vx_bool isPPU; /*!< \brief merge convolution and relu for PPU. */
+} vx_nn_convolution_params_ext3_t;
+
 /*==============================================================================
     NN Nodes
 =============================================================================*/
@@ -2142,7 +2185,8 @@ typedef struct _vx_hardware_caps_params_ext_t
 typedef struct _vx_hardware_caps_params_ext2_t
 {
     vx_hardware_caps_params_ext_t base;
-    vx_uint32 streamProcessorExecCount; /*!< \brief streamprocess execution count. */
+    vx_uint32 streamProcessorExecCount; /*!< \brief stream processor execution count. */
+    vx_uint32 streamProcessorVectorSize; /*!< \brief stream processor vector size. */
 } vx_hardware_caps_params_ext2_t;
 
 /*! \brief Queries hardware caps information.
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h
index df2c517..6570e1d 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h
@@ -236,6 +236,12 @@ typedef struct _vx_nn_convolution_relu_pooling_params_ext6_t
 } vx_nn_convolution_relu_pooling_params_ext6_t, * vx_nn_convolution_relu_pooling_params_ext6;;
 
+typedef struct _vx_nn_convolution_relu_pooling_params_ext7_t
+{
+    vx_nn_convolution_relu_pooling_params_ext6_t ext6; /*!< \brief convolution relu pooling params \ref vx_nn_convolution_relu_pooling_params_ext_t */
+    vx_bool isSub;
+} vx_nn_convolution_relu_pooling_params_ext7_t, * vx_nn_convolution_relu_pooling_params_ext7;
+
 /*! \brief [Graph] Creates a Convolutional Network Convolution and Activation(Relu) and Pooling Layer Node; this function matches the Khronos NN Extension 1.2 version.
 * \details This function implements the Convolutional Network Convolution and Activation(Relu) and Pooling layer.
 * For fixed-point data types, a fixed point calculation is performed with round and saturate according to the number of accumulator bits. The number of accumulator bits is implementation defined,
@@ -1081,6 +1087,48 @@ VX_API_ENTRY vx_node VX_API_CALL vxTensorTableLookupLayer(
     vx_lut OutLut,
     vx_tensor output);
 
+typedef struct _vx_nn_gemm_relu_pooling_params_t
+{
+    vx_bool enable_relu; /*!< \brief Enable Relu layer function or not. */
+    vx_bool enable_leaky_relu; /*!< \brief Enable LeakyRelu layer function or not. */
+    vx_float32 alpha; /*!< \brief Alpha value for Activation */
+    vx_float32 beta; /*!< \brief Beta value for Activation */
+    vx_uint32 node_count; /*!< \brief node count to merge */
+    vx_float32 merged_scale[MERGED_NODE_COUNT_MAX]; /*!< \brief scale of merged node output */
+    vx_int32 merged_zero_point[MERGED_NODE_COUNT_MAX]; /*!< \brief zero point of merged node output */
+    vx_enum merged_data_type[MERGED_NODE_COUNT_MAX]; /*!< \brief data type of merged node output */
+    vx_enum act_func; /*!< \brief nn activation function */
+    vx_lut lut_in; /*!< \brief LUT in */
+    vx_lut lut_out; /*!< \brief LUT out */
+    vx_bool enbale_const_multiplier; /*!< \brief tensor multiply with one of the inputs as a single-pixel const tensor */
+    vx_float32 const_multiplier; /*!< \brief const multiplier */
+} vx_nn_gemm_relu_pooling_params_t, * vx_nn_gemm_relu_pooling_params;
+
+/*! \brief Create a batch gemm node; the calculation formula is output = matrix_a * matrix_b + matrix_c.
+ * \param [in] graph The reference to the graph.
+ * \param [in] matrix_a The first input tensor.
+ * \param [in] matrix_b The second input tensor. Must be in the same data type and batch count as the first input tensor.
+ * \param [in] matrix_c The third input tensor. Must be in the same data type and batch count as the first input tensor. [optional]
+ * \param [in] trans_a If true, matrix_a is transposed before the calculation.
+ * \param [in] trans_b If true, matrix_b is transposed before the calculation.
+ * \param [in] trans_c If true, matrix_c is transposed before the calculation. [optional]
+ * \param [in] merge_param The parameters for gemm + op merging.
+ * \param [out] output The output tensor. Output dimensions must agree with the formula in the description.
+ * \return \ref vx_node.
+ * \retval vx_node A node reference. Any possible errors preventing a successful creation
+ * should be checked using \ref vxGetStatus
+ * \ingroup group_vision_function_gemm
+ */
+VX_API_ENTRY vx_node VX_API_CALL vxBatchGemmReluPoolingLayer(vx_graph graph,
+    vx_tensor matrix_a,
+    vx_tensor matrix_b,
+    vx_tensor matrix_c,
+    vx_scalar trans_a,
+    vx_scalar trans_b,
+    vx_scalar trans_c,
+    const vx_nn_gemm_relu_pooling_params merge_param,
+    vx_tensor output);
+
 #ifdef __cplusplus
 }
 #endif
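To make the intended call pattern of the new node concrete, here is a hedged sketch (not from the SDK) of wiring it up; the tensors a, b, out and their shapes are assumed to be created elsewhere, and the activation enum value is an assumption about what act_func accepts.

#include <string.h>
#include <VX/vx_khr_nn_internal.h>

/* Hedged sketch: batch GEMM with fused ReLU and no optional matrix_c. */
static vx_node make_batch_gemm_relu(vx_context context, vx_graph graph,
                                    vx_tensor a, vx_tensor b, vx_tensor out)
{
    vx_bool no_trans = vx_false_e;
    vx_scalar trans_a = vxCreateScalar(context, VX_TYPE_BOOL, &no_trans);
    vx_scalar trans_b = vxCreateScalar(context, VX_TYPE_BOOL, &no_trans);
    vx_nn_gemm_relu_pooling_params_t params;

    memset(&params, 0, sizeof(params));
    params.enable_relu = vx_true_e;          /* fuse ReLU into the GEMM */
    params.act_func = VX_NN_ACTIVATION_RELU; /* assumed matching activation enum */

    /* matrix_c and trans_c are optional and left NULL here; check the
     * returned node with vxGetStatus() as the doc comment above says. */
    return vxBatchGemmReluPoolingLayer(graph, a, b, NULL,
                                       trans_a, trans_b, NULL,
                                       &params, out);
}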
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h
index 867b8ce..36df374 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h
@@ -165,6 +165,7 @@ typedef enum _vx_sp_attribute_e
     VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL,
     VX_SP_ATTRIBUTE_SUM_ENGINE_NUM_CH_MINUS_ONE,
     VX_SP_ATTRIBUTE_SUM_ENGINE_2D_ACCUM_STORAGE,
+    VX_SP_ATTRIBUTE_SUM_ENGINE_OP_SELECT,
 
     VX_SP_ATTRIBUTE_NUM_OF_ELEMENTS_PER_LOOP_PER_INPUT,
 
@@ -181,6 +182,18 @@ typedef enum _vx_sp_attribute_e
     VX_SP_ATTRIBUTE_CONST3, /* NN clamp max */
     VX_SP_ATTRIBUTE_CONST4, /* NN clamp min */
 
+    VX_SP_ATTRIBUTE_CONST_COUNT,
+
+    VX_SP_ATTRIBUTE_SPLIT_AXIS,
+    VX_SP_ATTRIBUTE_SPLIT_MAX_SIZE,
+    VX_SP_ATTRIBUTE_SPLIT_TILEX_EQUAL_INIMAGEX,
+
+    VX_SP_ATTRIBUTE_NOT_MERGE_CONVSP,
+    VX_SP_ATTRIBUTE_UPDATE_CONST0_TO_PCQ_COEF_TENSOR,
+    VX_SP_ATTRIBUTE_RESHAPE_ARRAY, /* bit layout | output:24-29 | input3:18-23 | input2:12-17 | input1:6-11 | input0:0-5 | */
+    VX_SP_ATTRIBUTE_ALIGN_SP_CORE_AXIS,
+    VX_SP_ATTRIBUTE_KEEP_TILE_SIZE,
+
     VX_SP_ATTRIBUTE_TOTAL_COUNT,
 } vx_sp_attribute_e;
 
@@ -274,9 +287,55 @@ typedef enum _vx_sp_attribute_sum_engine_2d_accum_storage_e
 } vx_sp_attribute_sum_engine_2d_accum_storage_e;
 
+typedef enum _vx_sp_attribute_sum_engine_op_select_e
+{
+    VX_SP_ATTRIBUTE_SUM_ENGINE_SUM_OP,
+    VX_SP_ATTRIBUTE_SUM_ENGINE_MAX_OP
+} vx_sp_attribute_sum_engine_op_select_e;
+
+typedef enum _vx_sp_attribute_reshape_e
+{
+    VX_SP_ATTRIBUTE_RESHAPE_CHW2CHW = 0x00,
+    VX_SP_ATTRIBUTE_RESHAPE_CHW2WHC = 0x06,
+    VX_SP_ATTRIBUTE_RESHAPE_CHW2WCH = 0x09,
+    VX_SP_ATTRIBUTE_RESHAPE_CHW2HWC = 0x12,
+    VX_SP_ATTRIBUTE_RESHAPE_CHW2HCW = 0x18,
+    VX_SP_ATTRIBUTE_RESHAPE_CHW2CWH = 0x21,
+}
+vx_sp_attribute_reshape_e;
+
+typedef enum _vx_sp_attribute_split_axis_e
+{
+    VX_SP_ATTRIBUTE_SPLIT_ON_AXIS_X,
+    VX_SP_ATTRIBUTE_SPLIT_ON_AXIS_Y,
+    VX_SP_ATTRIBUTE_SPLIT_ON_AXIS_Z,
+    VX_SP_ATTRIBUTE_SPLIT_ON_AXIS_XY,
+    VX_SP_ATTRIBUTE_SPLIT_ON_AXIS_YZ,
+    VX_SP_ATTRIBUTE_SPLIT_ON_AXIS_XYZ,
+}
+vx_sp_attribute_split_axis_e;
+
+typedef enum _vx_sp_attribute_tile_align_sp_core_e
+{
+    VX_SP_ATTRIBUTE_TILE_ALIGN_SP_CORE_NONE = 0,
+    VX_SP_ATTRIBUTE_TILE_ALIGN_SP_CORE_WITH_AXIS_X,
+    VX_SP_ATTRIBUTE_TILE_ALIGN_SP_CORE_WITH_AXIS_Y,
+    VX_SP_ATTRIBUTE_TILE_ALIGN_SP_CORE_WITH_AXIS_XY,
+}
+vx_sp_attribute_tile_align_sp_core_e;
+
+typedef enum _vx_sp_attribute_keep_tile_size_e
+{
+    VX_SP_ATTRIBUTE_KEEP_TILE_SIZE_NONE = 0,
+    VX_SP_ATTRIBUTE_KEEP_TILE_SIZE_WITH_AXIS_X,
+    VX_SP_ATTRIBUTE_KEEP_TILE_SIZE_WITH_AXIS_Y,
+    VX_SP_ATTRIBUTE_KEEP_TILE_SIZE_WITH_AXIS_XY,
+}
+vx_sp_attribute_keep_tile_size_e;
+
 /**********************************************************************************************/
 
-/*! \brief Creates an opaque reference to a spinst data.
+/*! \brief Creates an external reference to a spinst data.
 * \param [in] context The reference to the implementation context.
 * \return A spinst data reference.
 * \Any possible errors preventing a successful creation should be checked using \ref vxGetStatus.
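Before the remaining vx_spinst.h hunks, a note on VX_SP_ATTRIBUTE_RESHAPE_ARRAY: its comment above specifies a packed layout of 6 bits per port. A hypothetical helper (not part of the patch) that follows that documented layout:

/* Hypothetical packer for VX_SP_ATTRIBUTE_RESHAPE_ARRAY, following the
 * documented bit layout |output:24-29|input3:18-23|input2:12-17|input1:6-11|input0:0-5|. */
static vx_uint32 pack_reshape_array(vx_uint32 in0, vx_uint32 in1, vx_uint32 in2,
                                    vx_uint32 in3, vx_uint32 out)
{
    return  (in0 & 0x3F)
         | ((in1 & 0x3F) << 6)
         | ((in2 & 0x3F) << 12)
         | ((in3 & 0x3F) << 18)
         | ((out & 0x3F) << 24);
}

/* e.g. transpose input0 from CHW to WHC and leave the output layout unchanged:
 * vxSetAttributeToSPINST(spinst, VX_SP_ATTRIBUTE_RESHAPE_ARRAY,
 *     pack_reshape_array(VX_SP_ATTRIBUTE_RESHAPE_CHW2WHC, 0, 0, 0,
 *                        VX_SP_ATTRIBUTE_RESHAPE_CHW2CHW)); */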
@@ -286,7 +345,17 @@ VX_API_ENTRY vx_spinst VX_API_CALL vxCreateSPINST(
     vx_context context
     );
 
-/*! \brief Releases a reference to a spinst object.
+/*! \brief Creates an internal reference to a spinst data.
+ * \param [in] context The reference to the implementation context.
+ * \return A spinst data reference.
+ * \Any possible errors preventing a successful creation should be checked using \ref vxGetStatus.
+ * \ingroup group_object_spinst
+ */
+VX_API_ENTRY vx_spinst VX_API_CALL vxCreateSPINSTInternal(
+    vx_context context
+    );
+
+/*! \brief Releases a reference to an external spinst object.
 * The object may not be garbage collected until its total reference count is zero.
 * \param [in] spinst_obj The pointer to the spinst data to release.
 * \post After returning from this function the reference is zeroed.
@@ -299,6 +368,19 @@ VX_API_ENTRY vx_status VX_API_CALL vxReleaseSPINST(
     vx_spinst *spinst_obj
     );
 
+/*! \brief Releases a reference to an internal spinst object.
+ * The object may not be garbage collected until its total reference count is zero.
+ * \param [in] spinst_obj The pointer to the spinst data to release.
+ * \post After returning from this function the reference is zeroed.
+ * \return A \ref vx_status_e enumeration.
+ * \retval VX_SUCCESS No errors; all other values indicate failure
+ * \retval * An error occurred. See \ref vx_status_e.
+ * \ingroup group_object_spinst
+ */
+VX_API_ENTRY vx_status VX_API_CALL vxReleaseSPINSTInternal(
+    vx_spinst *spinst_obj
+    );
+
 /*! \brief Add an instruction to a spinst object.
 * \param [in] spinst_obj The reference to the spinst object.
 * \param [in] inst_unit_array The units of one instruction. Use a \ref vx_spinst_unit_param.
@@ -332,6 +414,12 @@ VX_API_ENTRY vx_status VX_API_CALL vxSetAttributeToSPINST(
     vx_uint32 value
 );
 
+VX_API_ENTRY vx_status VX_API_CALL vxGetAttributeToSPINST(
+    vx_spinst spinst_obj,
+    vx_enum attribute,
+    vx_uint32* value
+);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h
index 6ce6f8a..6f75ea9 100644
--- a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h
+++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h
@@ -539,6 +539,15 @@ typedef vx_enum vx_action;
 */
 typedef vx_action (VX_CALLBACK *vx_nodecomplete_f)(vx_node node);
 
+/*! \brief A callback to the client for querying information of a node.
+ * \see vx_action
+ * \see vxAssignNodeCallback
+ * \param [in] node The node to which the callback was attached.
+ * \return A \ref vx_status_e enumeration.
+ * \ingroup group_node_callback
+ */
+typedef vx_status (VX_CALLBACK *vx_nodequery_f)(vx_node node);
+
 /*! \brief Vendor IDs are 2 nibbles in size and are located in the upper byte of
 * the 4 bytes of an enumeration.
 * \ingroup group_basic_features
@@ -1028,6 +1037,11 @@ enum vx_node_attribute_e {
     VX_NODE_ATTRIBUTE_FOR_HW_QUALITY = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0xA,
 
+    VX_NODE_SWTILING_TILE_XY = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x10,
+    VX_NODE_SPINST_INDEX = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x11,
+    VX_NODE_SPCONV_PCQ_REPLACE_SPINST = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x12,
+    VX_NODE_SP_NAME = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x13,
+    VX_NODE_SPINST = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x14,
 };
 
 /*!
\brief The parameter attributes list diff --git a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so index adc7bf9..0e20368 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so and b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libCLC.so b/prebuilt-sdk/x86_64_linux/lib/libCLC.so index 526ed39..9c88390 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libCLC.so and b/prebuilt-sdk/x86_64_linux/lib/libCLC.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so index 7f7cd1e..96a5ab4 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so and b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libGAL.so b/prebuilt-sdk/x86_64_linux/lib/libGAL.so index f699122..06525da 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libGAL.so and b/prebuilt-sdk/x86_64_linux/lib/libGAL.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so index c03c624..1566bab 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so and b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 index f709ed4..71f3384 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 and b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so index 2339562..9b7e0ca 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so and b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libVSC.so b/prebuilt-sdk/x86_64_linux/lib/libVSC.so index ebab842..1bafe16 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libVSC.so and b/prebuilt-sdk/x86_64_linux/lib/libVSC.so differ diff --git a/src/tim/CMakeLists.txt b/src/tim/CMakeLists.txt index 1172297..d0a7eb9 100644 --- a/src/tim/CMakeLists.txt +++ b/src/tim/CMakeLists.txt @@ -31,6 +31,7 @@ if(${TIM_VX_USE_EXTERNAL_OVXLIB}) set(OVXLIB_INCLUDE_DIR ${OVXLIB_INC}) else() set(OVXLIB_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/src/tim/vx/internal/include") + list(APPEND OVXLIB_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/src/tim/vx/internal/src") endif() message(STATUS "OVXLIB include directory: ${OVXLIB_INCLUDE_DIR}") diff --git a/src/tim/vx/internal/BUILD b/src/tim/vx/internal/BUILD index 392f1ec..f41b1cd 100644 --- a/src/tim/vx/internal/BUILD +++ b/src/tim/vx/internal/BUILD @@ -69,7 +69,6 @@ filegroup( "src/custom/ops/*.c", "src/custom/ops/kernel/evis/*.c", "src/custom/ops/kernel/cl/*.c", - "src/custom/ops/kernel/cpu/*.c", ]) ) @@ -84,6 +83,7 @@ cc_library( linkstatic = True, includes = [ "include", + "src", ], hdrs = [ "include/vsi_nn_pub.h", @@ -104,6 +104,7 @@ cc_library( "include/vsi_nn_compatibility.h", "include/vsi_nn_assert.h", "include/vsi_nn_feature.h", + "include/vsi_nn_post.h", "include/vsi_nn_rnn.h", "include/vsi_nn_rnn_helper.h", "include/vsi_nn_rnn_prv.h", @@ -121,13 +122,15 @@ cc_library( "include/utils/vsi_nn_limits.h", "include/utils/vsi_nn_dtype_util.h", "include/utils/vsi_nn_dtype_util_prv.h", - "include/utils/vsi_nn_vdata.h", "include/utils/vsi_nn_tensor_op.h", + "include/utils/vsi_nn_dlfcn.h", "include/utils/vsi_nn_shape_util.h", "include/utils/vsi_nn_constraint_check.h", 
"include/quantization/vsi_nn_asymmetric_affine.h", "include/quantization/vsi_nn_dynamic_fixed_point.h", "include/quantization/vsi_nn_perchannel_symmetric_affine.h", + "include/post/vsi_nn_post_fasterrcnn.h", + "include/post/vsi_nn_post_cmupose.h", "include/interface/ops.def", "include/kernel/vsi_nn_kernel.h", "include/kernel/vsi_nn_gpu.h", @@ -168,6 +171,9 @@ cc_library( "src/vsi_nn_daemon.c", "src/vsi_nn_graph_optimization.c", "src/vsi_nn_pre_post_process.c", + "src/vsi_nn_tensor_util_prv.h", + "src/vsi_nn_types_prv.h", + "src/vsi_nn_kernel_prv.h", "src/utils/vsi_nn_link_list.c", "src/utils/vsi_nn_util.c", "src/utils/vsi_nn_math.c", @@ -177,14 +183,16 @@ cc_library( "src/utils/vsi_nn_hashmap.c", "src/utils/vsi_nn_limits.c", "src/utils/vsi_nn_dtype_util.c", - "src/utils/vsi_nn_vdata.c", "src/utils/vsi_nn_tensor_op.c", + "src/utils/vsi_nn_dlfcn.c", "src/utils/vsi_nn_shape_util.c", "src/utils/vsi_nn_dtype.c", "src/utils/vsi_nn_constraint_check.c", "src/quantization/vsi_nn_asymmetric_affine.c", "src/quantization/vsi_nn_dynamic_fixed_point.c", "src/quantization/vsi_nn_perchannel_symmetric_affine.c", + "src/post/vsi_nn_post_fasterrcnn.c", + "src/post/vsi_nn_post_cmupose.c", "src/kernel/vsi_nn_kernel.c", "src/kernel/vsi_nn_kernel_util.c", "src/kernel/vsi_nn_kernel_backend.c", @@ -202,4 +210,3 @@ cc_library( + [":custom_srcs"], deps = ["//prebuilt-sdk:VIV_SDK_LIB"] ) - diff --git a/src/tim/vx/internal/include/custom/custom_node_type.def b/src/tim/vx/internal/include/custom/custom_node_type.def index 0283c71..90d7727 100644 --- a/src/tim/vx/internal/include/custom/custom_node_type.def +++ b/src/tim/vx/internal/include/custom/custom_node_type.def @@ -5,3 +5,4 @@ DEF_NODE_TYPE(custom_softmax) DEF_NODE_TYPE(custom_ainr_denoise_postprocess) DEF_NODE_TYPE(custom_warp_affine) DEF_NODE_TYPE(custom_warp_perspective) +DEF_NODE_TYPE(custom_sample) diff --git a/src/tim/vx/internal/include/custom/custom_ops.def b/src/tim/vx/internal/include/custom/custom_ops.def index 690b057..0050439 100644 --- a/src/tim/vx/internal/include/custom/custom_ops.def +++ b/src/tim/vx/internal/include/custom/custom_ops.def @@ -5,3 +5,4 @@ DEF_OP(CUSTOM_SOFTMAX) DEF_OP(CUSTOM_AINR_DENOISE_POSTPROCESS) DEF_OP(CUSTOM_WARP_AFFINE) DEF_OP(CUSTOM_WARP_PERSPECTIVE) +DEF_OP(CUSTOM_SAMPLE) diff --git a/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_sample.h b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_sample.h new file mode 100644 index 0000000..d15fa0b --- /dev/null +++ b/src/tim/vx/internal/include/custom/ops/vsi_nn_op_custom_sample.h @@ -0,0 +1,35 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef _VSI_NN_OP_CUSTOM_SAMPLE_H
+#define _VSI_NN_OP_CUSTOM_SAMPLE_H
+
+#include "vsi_nn_platform.h"
+#include "vsi_nn_types.h"
+
+typedef struct _vsi_nn_custom_sample_param
+{
+    int32_t axis;
+} vsi_nn_custom_sample_param;
+
+#endif
diff --git a/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h b/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h
index 1a05c8a..8976be3 100644
--- a/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h
+++ b/src/tim/vx/internal/include/custom/vsi_nn_custom_node_type.h
@@ -30,5 +30,6 @@
 #include "custom/ops/vsi_nn_op_custom_ainr_denoise_postprocess.h"
 #include "custom/ops/vsi_nn_op_custom_warp_affine.h"
 #include "custom/ops/vsi_nn_op_custom_warp_perspective.h"
+#include "custom/ops/vsi_nn_op_custom_sample.h"
 
 #endif
diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def
old mode 100644
new mode 100755
index 045eb95..82d843f
--- a/src/tim/vx/internal/include/interface/ops.def
+++ b/src/tim/vx/internal/include/interface/ops.def
@@ -183,3 +183,13 @@ DEF_OP(LPPOOL)
 DEF_OP(SCATTER_ELEMENTS)
 DEF_OP(PRE_PROCESS_YUV422)
 DEF_OP(BUCKETIZE)
+DEF_OP(GLOBALLPPOOL)
+DEF_OP(AVG_POOL3D)
+DEF_OP(ATAN)
+DEF_OP(ATANH)
+DEF_OP(ACOSH)
+DEF_OP(MAXUNPOOL)
+DEF_OP(REVERSESEQUENCE)
+DEF_OP(INVERSE_SIGMOID)
+DEF_OP(GRID_SAMPLE)
+DEF_OP(LPNORM)
diff --git a/src/tim/vx/internal/include/internal/internal_ops.def b/src/tim/vx/internal/include/internal/internal_ops.def
old mode 100644
new mode 100755
index a47559a..de33327
--- a/src/tim/vx/internal/include/internal/internal_ops.def
+++ b/src/tim/vx/internal/include/internal/internal_ops.def
@@ -20,3 +20,4 @@ DEF_OP(SPACE2DEPTH_INTERNAL)
 DEF_OP(GRUCELL_H_TIMES_ACTIVATION_R)
 DEF_OP(GRUCELL_ACTIVATION_Z_H)
 DEF_OP(REDUCE_MEAN_INTERNAL)
+DEF_OP(BILINEAR_GRID_SAMPLE)
diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_gpu.h b/src/tim/vx/internal/include/kernel/vsi_nn_gpu.h
index 3dc44d5..c943343 100644
--- a/src/tim/vx/internal/include/kernel/vsi_nn_gpu.h
+++ b/src/tim/vx/internal/include/kernel/vsi_nn_gpu.h
@@ -59,7 +59,7 @@ typedef struct
     gpu_dp_type_e type;
 } gpu_dp_inst_t;
 
-typedef struct
+typedef struct VSI_PUBLIC_TYPE
 {
     uint32_t dim;
     size_t global_offset[GPU_MAX_DIMENSION_SIZE];
diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h
index d2c4e58..c118e13 100644
--- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h
+++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h
@@ -51,7 +51,7 @@ typedef enum
     VSI_NN_KERNEL_TYPE_SP,
     VSI_NN_KERNEL_TYPE_NUM,
     VSI_NN_KERNEL_TYPE_NONE = VSI_NN_KERNEL_TYPE_NUM
-} vsi_nn_kernel_type_e;
+} VSI_PUBLIC_TYPE vsi_nn_kernel_type_e;
 
 /** Kernel priority */
 enum
@@ -79,7 +79,7 @@ typedef enum
     BOOL8,
     I4,
     U4,
-} vsi_nn_kernel_dtype_e;
+} VSI_PUBLIC_TYPE vsi_nn_kernel_dtype_e;
 
 typedef enum
 {
@@ -98,7 +98,7 @@ typedef enum
     VSI_NN_GPU_SOURCE_FMT_CODE = 0,
     VSI_NN_GPU_SOURCE_FMT_EXECUTABLE = 1,
     VSI_NN_GPU_SOURCE_FMT_NUM
-} vsi_nn_gpu_source_fmt_e;
+} VSI_PUBLIC_TYPE vsi_nn_gpu_source_fmt_e;
 
 typedef char * vsi_nn_kernel_source_t;
 typedef uint32_t vsi_nn_kernel_unique_id_t;
@@ -125,7 +125,7 @@
vsi_nn_kernel_source_info_t sources[VSI_NN_GPU_SOURCE_FMT_NUM]; vsi_nn_gpu_source_fmt_e active_source_fmt; } gpu; -} vsi_nn_kernel_t; +} VSI_PUBLIC_TYPE vsi_nn_kernel_t; typedef struct { @@ -172,15 +172,15 @@ typedef struct int32_t allow_kernel_num; } vsi_nn_kernel_selector_t; -typedef void * vsi_nn_kernel_node_param_t; +typedef void * VSI_PUBLIC_TYPE vsi_nn_kernel_node_param_t; typedef void * vsi_nn_kernel_tensor_t; -typedef void * vsi_nn_kernel_node_t; +typedef void * VSI_PUBLIC_TYPE vsi_nn_kernel_node_t; typedef void * vsi_nn_kernel_graph_t; -typedef void * vsi_nn_kernel_scalar_t; +typedef void * VSI_PUBLIC_TYPE vsi_nn_kernel_scalar_t; typedef vsi_nn_hashmap_t vsi_nn_kernel_param_t; diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h index f413b81..8b8c055 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h @@ -51,6 +51,10 @@ typedef int32_t vsi_nn_kernel_lut_act_e; enum VSI_NN_KERNEL_LUT_LINEAR_EXP = 17, VSI_NN_KERNEL_LUT_LINEAR_RSQRT = 18, VSI_NN_KERNEL_LUT_LINEAR_SIGMOID = 19, + VSI_NN_KERNEL_LUT_ATAN = 20, + VSI_NN_KERNEL_LUT_ATANH = 21, + VSI_NN_KERNEL_LUT_ACOSH = 22, + VSI_NN_KERNEL_LUT_INVERSE_SIGMOID = 23, }; @@ -67,6 +71,8 @@ typedef struct _vsi_nn_kernel_lut_ typedef struct _vsi_nn_kernel_lut_params { vsi_enum act_type; + vsi_bool pwl_sign_remove_support; + float clamp_min; float params[16]; } vsi_nn_kernel_lut_params; diff --git a/src/tim/vx/internal/include/libnnext/vsi_nn_vxkernel.h b/src/tim/vx/internal/include/libnnext/vsi_nn_vxkernel.h index e486949..9413ede 100644 --- a/src/tim/vx/internal/include/libnnext/vsi_nn_vxkernel.h +++ b/src/tim/vx/internal/include/libnnext/vsi_nn_vxkernel.h @@ -47,7 +47,7 @@ typedef struct vsi_nn_kernel_info vx_kernel_description_t ** kernel; uint8_t kernel_index; uint8_t init_index; -} vsi_nn_kernel_info_t; +} VSI_PUBLIC_TYPE vsi_nn_kernel_info_t; uint8_t * vsi_nn_LoadBinarySource ( diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_argmax.h b/src/tim/vx/internal/include/ops/vsi_nn_op_argmax.h index 2ec4172..cc9f42e 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_argmax.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_argmax.h @@ -112,6 +112,7 @@ typedef struct _vsi_nn_argmax_param /* argmax layer local data structure */ vsi_nn_argmax_lcl_data local; int32_t axis; + vsi_bool keep_dims; } vsi_nn_argmax_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_argmin.h b/src/tim/vx/internal/include/ops/vsi_nn_op_argmin.h index e9d1b70..87ec5ec 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_argmin.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_argmin.h @@ -111,6 +111,7 @@ typedef struct _vsi_nn_argmin_param /* argmin layer local data structure */ vsi_nn_argmin_lcl_data local; int32_t axis; + vsi_bool keep_dims; } vsi_nn_argmin_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_avg_pool3d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_avg_pool3d.h new file mode 100644 index 0000000..c224ef1 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_avg_pool3d.h @@ -0,0 +1,53 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without 
limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_AVG_POOL3D_H +#define _VSI_NN_OP_AVG_POOL3D_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_avg_pool3d_param +{ + /* round_type is used to calculate the output shape */ + vsi_nn_round_type_e round_type; + uint32_t ksize[3]; + uint32_t stride[3]; + /* Pad left, right, top, bottom, front, end */ + uint32_t pad[6]; + /* Pad type default value shall be AUTO */ + vsi_nn_pad_e pad_type; + /* Whether include pad pixels when calculating value for the edges */ + int32_t count_include_pad; +} vsi_nn_avg_pool3d_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/utils/vsi_nn_vdata.h b/src/tim/vx/internal/include/ops/vsi_nn_op_bilinear_grid_sample.h similarity index 74% rename from src/tim/vx/internal/include/utils/vsi_nn_vdata.h rename to src/tim/vx/internal/include/ops/vsi_nn_op_bilinear_grid_sample.h index a0f295f..d04c589 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_vdata.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_bilinear_grid_sample.h @@ -21,36 +21,31 @@ * DEALINGS IN THE SOFTWARE. * *****************************************************************************/ -#ifndef _VSI_NN_VDATA_H -#define _VSI_NN_VDATA_H -#include -#include +#ifndef _VSI_NN_OP_BILINEAR_GRID_SAMPLE_H +#define _VSI_NN_OP_BILINEAR_GRID_SAMPLE_H -#include "vsi_nn_graph.h" -#include "vsi_nn_node.h" -#include "vsi_nn_tensor.h" +#include "vsi_nn_types.h" #ifdef __cplusplus extern "C" { #endif -OVXLIB_API uint8_t * vsi_nn_VdataCreate - ( - vsi_nn_graph_t * graph, - vsi_nn_node_t * node, - uint32_t * p_stream_size - ); -OVXLIB_API vsi_nn_tensor_t * vsi_nn_CreateVDataTensor - ( - vsi_nn_graph_t * graph, - uint8_t * stream, - vsi_nn_tensor_attr_t * attr - ); +typedef struct _vsi_nn_bilinear_grid_sample_param +{ + struct _bilinear_grid_sample_local_data_t* local; + vsi_bool align_corners; + vsi_nn_pad_mode_e padding_mode; + int32_t const_val; +} vsi_nn_bilinear_grid_sample_param; + +_compiler_assert(offsetof(vsi_nn_bilinear_grid_sample_param, local) == 0, \ + vsi_nn_bilinear_grid_sample_h ); #ifdef __cplusplus } #endif #endif + diff --git a/src/tim/vx/internal/src/utils/vsi_nn_vdata.c b/src/tim/vx/internal/include/ops/vsi_nn_op_gather_nd.h similarity index 68% rename from src/tim/vx/internal/src/utils/vsi_nn_vdata.c rename to src/tim/vx/internal/include/ops/vsi_nn_op_gather_nd.h index c3171b6..06e6599 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_vdata.c +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_gather_nd.h @@ -21,34 +21,23 @@ * DEALINGS IN THE SOFTWARE. 
* *****************************************************************************/ -#include -#include -#include +#ifndef _VSI_NN_OP_GATHER_ND_H +#define _VSI_NN_OP_GATHER_ND_H -#include "vsi_nn_graph.h" -#include "vsi_nn_node.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_ops.h" -#include "vsi_nn_log.h" -#include "utils/vsi_nn_util.h" +#include "vsi_nn_types.h" -uint8_t * vsi_nn_VdataCreate - ( - vsi_nn_graph_t * graph, - vsi_nn_node_t * node, - uint32_t * p_stream_size - ) +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_gather_nd_param { - return NULL; -} /* vsi_nn_VdataCreate() */ + int32_t batch_dims; +} vsi_nn_gather_nd_param; -vsi_nn_tensor_t * vsi_nn_CreateVDataTensor - ( - vsi_nn_graph_t * graph, - uint8_t * stream, - vsi_nn_tensor_attr_t * attr - ) -{ - return NULL; -} /* vsi_nn_CreateVDataTensor() */ +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_globallppool.h b/src/tim/vx/internal/include/ops/vsi_nn_op_globallppool.h new file mode 100644 index 0000000..66e0f5b --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_globallppool.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_GLOBALLPPOOL_H +#define _VSI_NN_OP_GLOBALLPPOOL_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_globallppool_param +{ + int32_t p; +} vsi_nn_globallppool_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grid_sample.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grid_sample.h new file mode 100644 index 0000000..0a67e2b --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grid_sample.h @@ -0,0 +1,58 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_GRID_SAMPLE_H +#define _VSI_NN_OP_GRID_SAMPLE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +//typedef uint32_t vsi_nn_grid_sample_mode_t; +//enum { bilinear = 0, nearest }; +// +//typedef uint32_t vsi_nn_grid_sample_padding_mode_t; +//enum { zeros = 0, CONST }; + +typedef struct _grid_sample_local_data_t { + int32_t placeholder; +} grid_sample_local_data_t; + +typedef struct _vsi_nn_grid_sample_param +{ + grid_sample_local_data_t* local; + vsi_enum mode; + vsi_bool align_corners; + vsi_nn_pad_mode_e padding_mode; + int32_t const_val; +} vsi_nn_grid_sample_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_imageprocess.h b/src/tim/vx/internal/include/ops/vsi_nn_op_imageprocess.h index 7b68724..43c219d 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_imageprocess.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_imageprocess.h @@ -67,7 +67,7 @@ typedef struct _vsi_nn_imageprocess_param int32_t mean_value_size; float* mean_value; } mean; -} vsi_nn_imageprocess_param; +} VSI_PUBLIC_TYPE vsi_nn_imageprocess_param; /** * Insert imageprocess op for image pre process diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_inverse_sigmoid.h b/src/tim/vx/internal/include/ops/vsi_nn_op_inverse_sigmoid.h new file mode 100644 index 0000000..fa7093e --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_inverse_sigmoid.h @@ -0,0 +1,45 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_INVERSE_SIGMOID_H +#define _VSI_NN_OP_INVERSE_SIGMOID_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_inverse_sigmoid_param +{ + // Add parameters here + float eps; +} vsi_nn_inverse_sigmoid_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lpnorm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lpnorm.h new file mode 100644 index 0000000..47d1792 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lpnorm.h @@ -0,0 +1,45 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_LPNORM_H +#define _VSI_NN_OP_LPNORM_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_lpnorm_param +{ + int axis; + int p; +} vsi_nn_lpnorm_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_maxunpool.h b/src/tim/vx/internal/include/ops/vsi_nn_op_maxunpool.h new file mode 100644 index 0000000..4943840 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_maxunpool.h @@ -0,0 +1,48 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_MAXUNPOOL_H +#define _VSI_NN_OP_MAXUNPOOL_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_maxunpool_param +{ + // Add parameters here + uint32_t ksize[2]; + uint32_t pad[4]; + uint32_t stride[2]; + const uint32_t *output_size; +} vsi_nn_maxunpool_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h index dddee8d..aa8fc82 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h @@ -68,6 +68,8 @@ typedef struct _vsi_nn_pre_process_nv12_param vsi_bool reverse_channel; vsi_nn_pre_process_nv12_lcl_data* local; + + vsi_nn_nv_type nv_type; } vsi_nn_pre_process_nv12_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reduce_mean_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reduce_mean_internal.h index 20eb56c..8c2fea6 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_reduce_mean_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reduce_mean_internal.h @@ -38,6 +38,7 @@ typedef struct _vsi_nn_reduce_mean_internal_param vx_int32 *axis; vx_uint32 axis_num; float scale; + vsi_enum type; } vsi_nn_reduce_mean_internal_param; _compiler_assert(offsetof(vsi_nn_reduce_mean_internal_param, local) == 0, \ vsi_nn_reduce_mean_internal_h ); diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reversesequence.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reversesequence.h new file mode 100644 index 0000000..0b5a496 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reversesequence.h @@ -0,0 +1,45 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_REVERSESEQUENCE_H +#define _VSI_NN_OP_REVERSESEQUENCE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_reversesequence_param +{ + int32_t batch_axis; + int32_t time_axis; +} vsi_nn_reversesequence_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_roi_align.h b/src/tim/vx/internal/include/ops/vsi_nn_op_roi_align.h index e24f043..20add49 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_roi_align.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_roi_align.h @@ -38,6 +38,7 @@ typedef struct _vsi_nn_roi_align_param float width_ratio; int32_t height_sample_num; int32_t width_sample_num; + vsi_nn_roi_align_type_e platform_type; } vsi_nn_roi_align_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_strided_slice.h b/src/tim/vx/internal/include/ops/vsi_nn_op_strided_slice.h index d7bb3c7..583777d 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_strided_slice.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_strided_slice.h @@ -71,6 +71,7 @@ typedef struct _vsi_nn_strided_slice_lcl_data2 vsi_bool is_dataconvert_op; vsi_bool is_optimized; + vsi_bool is_same_shape; strided_slice_param params; } vsi_nn_strided_slice_lcl_data2; diff --git a/src/tim/vx/internal/include/post/vsi_nn_post_cmupose.h b/src/tim/vx/internal/include/post/vsi_nn_post_cmupose.h new file mode 100644 index 0000000..eb74f09 --- /dev/null +++ b/src/tim/vx/internal/include/post/vsi_nn_post_cmupose.h @@ -0,0 +1,163 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_POST_CMUPOSE_H_ +#define _VSI_NN_POST_CMUPOSE_H_ + +#include "utils/vsi_nn_link_list.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_subset_data_t +{ + float idx[20]; +}vsi_nn_subset_data_t; + +typedef struct _vsi_nn_subset_t +{ + vsi_nn_link_list_t link_list; + vsi_nn_subset_data_t data; +} VSI_PUBLIC_TYPE vsi_nn_subset_t; + +typedef struct _vsi_nn_peaks_data_t +{ + uint32_t location[2]; + float score; + uint32_t id; +} VSI_PUBLIC_TYPE vsi_nn_peaks_data_t; + +typedef struct _vsi_nn_peaks_t +{ + vsi_nn_link_list_t link_list; + vsi_nn_peaks_data_t peak; +} VSI_PUBLIC_TYPE vsi_nn_peaks_t; + +typedef struct _vsi_nn_conncection_data_t +{ + uint32_t x; + uint32_t y; + float score; + uint32_t i; + uint32_t j; +}vsi_nn_connection_data_t; + +typedef struct _vsi_nn_connection_t +{ + vsi_nn_link_list_t link_list; + vsi_nn_connection_data_t data; +}vsi_nn_connection_t; + +typedef struct _vsi_nn_con_candidate_data_t +{ + uint32_t i; + uint32_t j; + float score; + float candAB; +}vsi_nn_con_candidate_data_t; + +typedef struct _vsi_nn_con_candidate_t +{ + vsi_nn_link_list_t link_list; + vsi_nn_con_candidate_data_t data; +}vsi_nn_con_candidate_t; + +typedef struct _vsi_nn_cmupose_multiplier_t +{ + float *size; + uint32_t num; +}vsi_nn_cmupose_multiplier_t; + +typedef struct _vsi_nn_cmupose_image_t +{ + uint32_t width; + uint32_t height; + uint32_t channel; +} VSI_PUBLIC_TYPE vsi_nn_cmupose_image_t; + +typedef struct _vsi_nn_cmupose_scale_search_t +{ + float *size; + uint32_t num; +}vsi_nn_cmupose_scale_search_t; + +typedef struct _vsi_nn_cmupose_model_t +{ + uint32_t boxsize; + uint32_t stride; + uint32_t padValue; +} VSI_PUBLIC_TYPE vsi_nn_cmupose_model_t; + +typedef struct _vsi_nn_cmupose_param_t +{ + float thre1; + float thre2; + float thre3; + uint32_t mid_num; + vsi_nn_cmupose_scale_search_t scale_search; +} VSI_PUBLIC_TYPE vsi_nn_cmupose_param_t; + +typedef struct _vsi_nn_cmupose_inputs_t +{ + vsi_nn_tensor_t *net_out; +} VSI_PUBLIC_TYPE vsi_nn_cmupose_inputs_t; + +typedef struct _vsi_nn_cmupose_config_t +{ + vsi_nn_cmupose_inputs_t inputs; + vsi_nn_cmupose_param_t param; + vsi_nn_cmupose_model_t model; + vsi_nn_cmupose_image_t image; +} VSI_PUBLIC_TYPE vsi_nn_cmupose_config_t; + +OVXLIB_API vsi_status vsi_nn_CMUPose_Post_Process + ( + float *net_out, + vsi_nn_cmupose_config_t *config, + vsi_nn_peaks_t ***all_peaks_out, + uint32_t *all_peaks_num_out, + vsi_nn_subset_t **subset_list_out, + vsi_nn_peaks_data_t **peak_candidate_out, + uint32_t *peak_candidate_num_out + ); + +OVXLIB_API vsi_status vsi_nn_CMUPose_PostProcess + ( + vsi_nn_graph_t *graph, + vsi_nn_cmupose_inputs_t *inputs, + vsi_nn_cmupose_image_t *image, + vsi_nn_cmupose_param_t *param, + vsi_nn_cmupose_model_t *model, + vsi_nn_peaks_t ***all_peaks, + uint32_t *all_peaks_num, + vsi_nn_peaks_data_t **candidate, + uint32_t *candidate_num, + vsi_nn_subset_t **subset + ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/post/vsi_nn_post_fasterrcnn.h b/src/tim/vx/internal/include/post/vsi_nn_post_fasterrcnn.h new file mode 100644 index 0000000..8a7680d --- /dev/null +++ b/src/tim/vx/internal/include/post/vsi_nn_post_fasterrcnn.h @@ -0,0 +1,79 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated 
documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+#ifndef _VSI_NN_POST_FASTERRCNN_H_
+#define _VSI_NN_POST_FASTERRCNN_H_
+
+#include "vsi_nn_types.h"
+#include "vsi_nn_node_type.h"
+#include "vsi_nn_tensor.h"
+#include "utils/vsi_nn_link_list.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _vsi_nn_fasterrcnn_box_t
+{
+    vsi_nn_link_list_t link_list;
+
+    /* upper-left coordinate(x1,y1) */
+    float x1;
+    float y1;
+    /* lower-right coordinate(x2,y2) */
+    float x2;
+    float y2;
+    float score;
+    uint32_t class_id;
+} VSI_PUBLIC_TYPE vsi_nn_fasterrcnn_box_t;
+
+typedef struct _vsi_nn_fasterrcnn_param_t
+{
+    float conf_thresh;
+    float nms_thresh;
+    const char **classes;
+    uint32_t classes_num;
+    uint32_t rois_num;
+    vsi_nn_proposal_im_info iminfo;
+} VSI_PUBLIC_TYPE vsi_nn_fasterrcnn_param_t;
+
+typedef struct _vsi_nn_fasterrcnn_inputs_t
+{
+    vsi_nn_tensor_t *rois;
+    vsi_nn_tensor_t *cls;
+    vsi_nn_tensor_t *bbox;
+} VSI_PUBLIC_TYPE vsi_nn_fasterrcnn_inputs_t;
+
+OVXLIB_API vsi_status vsi_nn_FasterRCNN_PostProcess
+    (
+    vsi_nn_graph_t *graph,
+    vsi_nn_fasterrcnn_inputs_t *inputs,
+    vsi_nn_fasterrcnn_param_t *param,
+    vsi_nn_fasterrcnn_box_t **dets_box
+    );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/tim/vx/internal/include/utils/vsi_nn_binary_tree.h b/src/tim/vx/internal/include/utils/vsi_nn_binary_tree.h
index 186f381..41ba068 100644
--- a/src/tim/vx/internal/include/utils/vsi_nn_binary_tree.h
+++ b/src/tim/vx/internal/include/utils/vsi_nn_binary_tree.h
@@ -29,8 +29,9 @@
 extern "C"{
 #endif
 
 #include
+#include "vsi_nn_feature_config.h"
 
-typedef int64_t vsi_nn_binary_tree_key_t;
+typedef int64_t VSI_PUBLIC_TYPE vsi_nn_binary_tree_key_t;
 
 #define vsi_nn_BinaryTreeInitRoot(n) do{n = NULL;} while (0);
 
@@ -40,7 +41,7 @@ typedef struct _vsi_nn_binary_tree
     struct _vsi_nn_binary_tree * right;
     vsi_nn_binary_tree_key_t key;
     void * data_ptr;
-} vsi_nn_binary_tree_t;
+} VSI_PUBLIC_TYPE vsi_nn_binary_tree_t;
 
 OVXLIB_API void vsi_nn_BinaryTreeRemoveNode
     (
diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dlfcn.h b/src/tim/vx/internal/include/utils/vsi_nn_dlfcn.h
new file mode 100644
index 0000000..82baf12
--- /dev/null
+++ b/src/tim/vx/internal/include/utils/vsi_nn_dlfcn.h
@@ -0,0 +1,65 @@
+#ifndef __VSI_NN_DLFCN_H
+#define __VSI_NN_DLFCN_H
+
+#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
+#define RTLD_LAZY 0
+#define RTLD_NOW 0
+
+#define RTLD_GLOBAL (1 << 1)
+#define RTLD_LOCAL (1 << 2)
+
+#define RTLD_DEFAULT ((void *)0)
+#define RTLD_NEXT ((void *)-1)
+
+#else
+#include
+#endif
+
+/**
+ * Open a shared library
+ *
+ * @param[in] Library path
+ * @param[in] Open mode.
+ *
+ * @return Library handle on success, or NULL otherwise.
+ */
+void* vsi_nn_dlopen
+    (
+    const char *file,
+    int mode
+    );
+
+/**
+ * Close the opened library
+ *
+ * @param[in] Library handle
+ *
+ * @return TRUE on success
+ */
+int vsi_nn_dlclose
+    (
+    void *handle
+    );
+
+/**
+ * Find a symbol in an opened library
+ *
+ * @param[in] Library handle
+ * @param[in] Symbol name to find.
+ *
+ * @return Symbol
+ */
+void* vsi_nn_dlsym
+    (
+    void *handle,
+    const char *name
+    );
+
+/**
+ * Get error info.
+ *
+ * @return Error message.
+ */
+char * vsi_nn_dlerror(void);
+#endif
+
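A minimal usage sketch of this portability wrapper (not part of the patch; the plugin library name and symbol are hypothetical):

#include <stdio.h>
#include "utils/vsi_nn_dlfcn.h"

/* Hedged sketch: load an optional plugin through the wrapper so the same
 * code builds on both POSIX and Windows toolchains. */
static void load_plugin(void)
{
    void *handle = vsi_nn_dlopen("libvsi_custom_plugin.so", RTLD_NOW | RTLD_LOCAL);
    if (handle == NULL)
    {
        printf("dlopen failed: %s\n", vsi_nn_dlerror());
        return;
    }
    {
        typedef int (*plugin_init_f)(void);
        plugin_init_f plugin_init = (plugin_init_f)vsi_nn_dlsym(handle, "plugin_init");
        if (plugin_init != NULL)
        {
            plugin_init();
        }
    }
    vsi_nn_dlclose(handle);
}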
* Open a shared library.
+ *
+ * @param[in] file Library path.
+ * @param[in] mode Open mode.
+ *
+ * @return Library handle on success, or NULL otherwise.
+ */
+void* vsi_nn_dlopen
+    (
+    const char *file,
+    int mode
+    );
+
+/**
+ * Close an opened library.
+ *
+ * @param[in] handle Library handle.
+ *
+ * @return TRUE on success.
+ */
+int vsi_nn_dlclose
+    (
+    void *handle
+    );
+
+/**
+ * Find a symbol in an opened library.
+ *
+ * @param[in] handle Library handle.
+ * @param[in] name Symbol name to look up.
+ *
+ * @return Symbol address on success, or NULL otherwise.
+ */
+void* vsi_nn_dlsym
+    (
+    void *handle,
+    const char *name
+    );
+
+/**
+ * Get the last error info.
+ *
+ * @return Error message.
+ */
+char * vsi_nn_dlerror(void);
+#endif
+
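For reference, a minimal usage sketch of this wrapper (the library path and symbol name below are hypothetical, not part of the patch, and error handling is abbreviated):

    /* Load an optional backend through the portable wrapper. */
    void *lib = vsi_nn_dlopen( "libcustom_backend.so", RTLD_LAZY );
    if ( NULL == lib )
    {
        VSILOGE( "dlopen failed: %s", vsi_nn_dlerror() );
        return VSI_FAILURE;
    }
    typedef vsi_status (*init_fn_t)( void );
    init_fn_t init = (init_fn_t)vsi_nn_dlsym( lib, "backend_init" );
    if ( init )
    {
        init();
    }
    vsi_nn_dlclose( lib );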
diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h
index 7eaec28..ab63a3c 100644
--- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h
+++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h
@@ -464,6 +464,7 @@ static VSI_INLINE_API vsi_status dtype_to_float32
         case VSI_NN_TYPE_BOOL8:
         case VSI_NN_TYPE_UINT8:
         case VSI_NN_TYPE_INT16:
+        case VSI_NN_TYPE_UINT16:
         case VSI_NN_TYPE_INT32:
         {
             int32_t src_value = 0;
@@ -516,6 +517,7 @@ static VSI_INLINE_API vsi_status float32_to_dtype
         case VSI_NN_TYPE_BOOL8:
         case VSI_NN_TYPE_UINT8:
         case VSI_NN_TYPE_INT16:
+        case VSI_NN_TYPE_UINT16:
         case VSI_NN_TYPE_INT32:
         case VSI_NN_TYPE_UINT32:
         {
diff --git a/src/tim/vx/internal/include/utils/vsi_nn_link_list.h b/src/tim/vx/internal/include/utils/vsi_nn_link_list.h
index e16d9e8..7e6afb2 100644
--- a/src/tim/vx/internal/include/utils/vsi_nn_link_list.h
+++ b/src/tim/vx/internal/include/utils/vsi_nn_link_list.h
@@ -36,7 +36,7 @@ typedef struct _vsi_nn_link_list
 {
     struct _vsi_nn_link_list * prev;
     struct _vsi_nn_link_list * next;
-} vsi_nn_link_list_t;
+} VSI_PUBLIC_TYPE vsi_nn_link_list_t;
 
 typedef void ( * vsi_nn_link_list_init_t )
     (
diff --git a/src/tim/vx/internal/include/utils/vsi_nn_map.h b/src/tim/vx/internal/include/utils/vsi_nn_map.h
index 33ac22a..37754c9 100644
--- a/src/tim/vx/internal/include/utils/vsi_nn_map.h
+++ b/src/tim/vx/internal/include/utils/vsi_nn_map.h
@@ -32,7 +32,7 @@ extern "C"{
 #endif
 
-typedef vsi_nn_binary_tree_key_t vsi_nn_map_key_t;
+typedef vsi_nn_binary_tree_key_t VSI_PUBLIC_TYPE vsi_nn_map_key_t;
 
 typedef struct _vsi_nn_map_key_list
 {
@@ -45,7 +45,7 @@ typedef struct _vsi_nn_map
     int size;
     vsi_nn_map_key_list_t * keys;
     vsi_nn_binary_tree_t * values;
-} vsi_nn_map_t;
+} VSI_PUBLIC_TYPE vsi_nn_map_t;
 
 OVXLIB_API void vsi_nn_MapInit
     (
diff --git a/src/tim/vx/internal/include/vsi_nn_compatibility.h b/src/tim/vx/internal/include/vsi_nn_compatibility.h
index bcf2f25..4c28b94 100644
--- a/src/tim/vx/internal/include/vsi_nn_compatibility.h
+++ b/src/tim/vx/internal/include/vsi_nn_compatibility.h
@@ -99,6 +99,30 @@ typedef enum vx_nn_activation_function_e vx_co
 #define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_NONE VX_NN_ACTIVATION_NONE
 #define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SWISH VX_NN_ACTIVATION_SWISH
 #define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_HWISH VX_NN_ACTIVATION_HSWISH
+#if (VX_ACTIVATION_EXT2_SUPPORT)
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SIGN VX_NN_ACTIVATION_SIGN_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_HSIGMOID VX_NN_ACTIVATION_HSIGMOID_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_NEG VX_NN_ACTIVATION_NEG_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_CLIP VX_NN_ACTIVATION_CLIP_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_EXP VX_NN_ACTIVATION_EXP_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SIN VX_NN_ACTIVATION_SIN_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_COS VX_NN_ACTIVATION_COS_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LOG VX_NN_ACTIVATION_LOG_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_MISH VX_NN_ACTIVATION_MISH_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_GELU VX_NN_ACTIVATION_GELU_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_HGELU VX_NN_ACTIVATION_HGELU_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_ELU VX_NN_ACTIVATION_ELU_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SELU VX_NN_ACTIVATION_SELU_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_CELU VX_NN_ACTIVATION_CELU_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RECIPROCAL VX_NN_ACTIVATION_RECIPROCAL_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SOFTSIGN VX_NN_ACTIVATION_SOFTSIGN_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_ATAN VX_NN_ACTIVATION_ATAN_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_ATANH VX_NN_ACTIVATION_ATANH_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_ACOSH VX_NN_ACTIVATION_ACOSH_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_INVERSE_SIGMOID VX_NN_ACTIVATION_INVERSE_SIGMOID_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_ROUND VX_NN_ACTIVATION_ROUND_VSI
+#define VX_CONVOLUTIONAL_NETWORK_ACTIVATION_ERF VX_NN_ACTIVATION_ERF_VSI
+#endif
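Because the legacy 1.1-style names alias the VeriSilicon extension enums only when the driver advertises VX_ACTIVATION_EXT2_SUPPORT, existing call sites keep compiling either way; a minimal sketch (VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU is assumed to be defined earlier in this header):

    #include "vsi_nn_compatibility.h"

    #if (VX_ACTIVATION_EXT2_SUPPORT)
    vx_enum act = VX_CONVOLUTIONAL_NETWORK_ACTIVATION_GELU; /* resolves to VX_NN_ACTIVATION_GELU_VSI */
    #else
    vx_enum act = VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU; /* fall back to a core activation */
    #endif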
 
 /*
   keep the backward compatibility with spec 1.1 for vxCopyTensorPatch_11
diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h
index f5ace92..75e5ab7 100644
--- a/src/tim/vx/internal/include/vsi_nn_context.h
+++ b/src/tim/vx/internal/include/vsi_nn_context.h
@@ -77,6 +77,7 @@ typedef struct _vsi_nn_runtime_option_t
     int32_t enable_concat_optimize;
     int32_t enable_asymi8_to_u8;
     int32_t enable_dataconvert_optimize;
+    int32_t enable_stream_processor;
 } vsi_nn_runtime_option_t;
 
 /**
@@ -87,7 +88,7 @@ typedef struct _vsi_nn_context_t
     vx_context c;
     vsi_nn_hw_config_t config;
     vsi_nn_runtime_option_t options;
-} *vsi_nn_context_t;
+} VSI_PUBLIC_TYPE *vsi_nn_context_t;
 
 /**
  * Create context
diff --git a/src/tim/vx/internal/include/vsi_nn_feature_config.h b/src/tim/vx/internal/include/vsi_nn_feature_config.h
index 8906a96..01ec04c 100644
--- a/src/tim/vx/internal/include/vsi_nn_feature_config.h
+++ b/src/tim/vx/internal/include/vsi_nn_feature_config.h
@@ -1,7 +1,46 @@
+/****************************************************************************
+*
+* Copyright (c) 2019 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
 /*****Auto generated header file, Please DO NOT modify manually!*****/
 #ifndef _VSI_NN_FEATURE_CONFIG_H
 #define _VSI_NN_FEATURE_CONFIG_H
 
+#define VSI_PUBLIC_TYPE
+#include
+#if defined(VX_KHR_COMPATIBILITY) && (0x1==VX_KHR_COMPATIBILITY)
+#include
+#endif
+
+#ifndef VSI_PERCHANNEL_QUANTIZATION_SUPPORT
 #define VSI_PERCHANNEL_QUANTIZATION_SUPPORT
+#endif
+#if defined(VX_INVALIDATE_HANDLE_SUPPORT) && VX_INVALIDATE_HANDLE_SUPPORT
+#define VSI_INVALIDATE_HANDLE_SUPPORT
+#endif
+#ifndef VSI_0_D_TENSOR_SUPPORT
+#define VSI_0_D_TENSOR_SUPPORT
+#endif
+#if defined(VX_TENSORVIEW_ON_ANY_DIM) && VX_TENSORVIEW_ON_ANY_DIM
+#define VSI_CONCAT_ENHANCE_SUPPORT
+#endif
 
 #endif
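Downstream code is expected to key off the derived VSI_* feature macros rather than the raw driver macros; a minimal sketch (the tensor variable is assumed to exist):

    #include "vsi_nn_feature_config.h"

    /* Prefer the explicit cache-invalidate API when the driver provides it. */
    #ifdef VSI_INVALIDATE_HANDLE_SUPPORT
        vsi_nn_InvalidateHandle( tensor ); /* declared in vsi_nn_tensor_util.h below */
    #endif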
diff --git a/src/tim/vx/internal/include/vsi_nn_graph.h b/src/tim/vx/internal/include/vsi_nn_graph.h
index c9c0687..1756870 100644
--- a/src/tim/vx/internal/include/vsi_nn_graph.h
+++ b/src/tim/vx/internal/include/vsi_nn_graph.h
@@ -74,7 +74,7 @@ extern "C" {
 /**
  * Graph structure
  */
-struct _vsi_nn_graph
+struct VSI_PUBLIC_TYPE _vsi_nn_graph
 {
     /** Context */
     vsi_nn_context_t ctx;
@@ -167,6 +167,8 @@ struct _vsi_nn_graph
     } complete_signal;
 
     vsi_bool isAllowFastMode;
+
+    // DO NOT modify this struct.
 };
 
 /**
diff --git a/src/tim/vx/internal/include/vsi_nn_log.h b/src/tim/vx/internal/include/vsi_nn_log.h
index d8b5bad..307f06d 100644
--- a/src/tim/vx/internal/include/vsi_nn_log.h
+++ b/src/tim/vx/internal/include/vsi_nn_log.h
@@ -46,7 +46,7 @@ typedef enum _vsi_nn_log_level_e
     VSI_NN_LOG_WARN,
     VSI_NN_LOG_INFO,
     VSI_NN_LOG_DEBUG
-}vsi_nn_log_level_e;
+} VSI_PUBLIC_TYPE vsi_nn_log_level_e;
 
 #define VSI_NN_MAX_DEBUG_BUFFER_LEN 1024
 #define VSILOGE( fmt, ... ) \
diff --git a/src/tim/vx/internal/include/vsi_nn_node.h b/src/tim/vx/internal/include/vsi_nn_node.h
index b922204..0a69dbd 100644
--- a/src/tim/vx/internal/include/vsi_nn_node.h
+++ b/src/tim/vx/internal/include/vsi_nn_node.h
@@ -58,7 +58,7 @@ typedef struct _vsi_nn_node_attr_t
 } vsi_nn_node_attr_t;
 
 /** Node structure */
-struct _vsi_nn_node
+struct VSI_PUBLIC_TYPE _vsi_nn_node
 {
     /**
      * Graph handle
diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h
index d41e0f0..37032f4 100644
--- a/src/tim/vx/internal/include/vsi_nn_node_type.h
+++ b/src/tim/vx/internal/include/vsi_nn_node_type.h
@@ -200,8 +200,17 @@
 #include "ops/vsi_nn_op_scatter_elements.h"
 #include "ops/vsi_nn_op_pre_process_yuv422.h"
 #include "ops/vsi_nn_op_bucketize.h"
+#include "ops/vsi_nn_op_globallppool.h"
+#include "ops/vsi_nn_op_gather_nd.h"
+#include "ops/vsi_nn_op_avg_pool3d.h"
+#include "ops/vsi_nn_op_maxunpool.h"
+#include "ops/vsi_nn_op_reversesequence.h"
+#include "ops/vsi_nn_op_grid_sample.h"
+#include "ops/vsi_nn_op_bilinear_grid_sample.h"
+#include "ops/vsi_nn_op_lpnorm.h"
 /* custom node header defines */
 #include "custom/vsi_nn_custom_node_type.h"
+#include "ops/vsi_nn_op_inverse_sigmoid.h"
 
 #if defined(__cplusplus)
 extern "C"{
@@ -386,6 +395,15 @@ typedef union _vsi_nn_nn_param
     vsi_nn_scatter_elements_param scatter_elements;
     vsi_nn_pre_process_yuv422_param pre_process_yuv422;
     vsi_nn_bucketize_param bucketize;
+    vsi_nn_globallppool_param globallppool;
+    vsi_nn_gather_nd_param gather_nd;
+    vsi_nn_avg_pool3d_param avg_pool3d;
+    vsi_nn_maxunpool_param maxunpool;
+    vsi_nn_reversesequence_param reversesequence;
+    vsi_nn_inverse_sigmoid_param inverse_sigmoid;
+    vsi_nn_grid_sample_param gridsample;
+    vsi_nn_bilinear_grid_sample_param bilinear_grid_sample;
+    vsi_nn_lpnorm_param lpnorm;
     void* client_param;
 
 /* custom node data struct define */
diff --git a/src/tim/vx/internal/include/vsi_nn_ops.h b/src/tim/vx/internal/include/vsi_nn_ops.h
index 40671e7..de26f0d 100644
--- a/src/tim/vx/internal/include/vsi_nn_ops.h
+++ b/src/tim/vx/internal/include/vsi_nn_ops.h
@@ -48,7 +48,7 @@ extern "C"{
  * @see include/custom/custom_ops.def
  * @see include/internal/internal_ops.def
  */
-typedef int32_t vsi_nn_op_t; enum
+typedef int32_t VSI_PUBLIC_TYPE vsi_nn_op_t; enum
 {
 #define DEF_OP( NAME, ... ) VSI_NN_OP_##NAME,
 #include "interface/ops.def"
@@ -126,7 +126,7 @@ typedef struct _vsi_nn_op_proc
     vsi_nn_op_optimize_t optimize;
     uint32_t input_num;
     uint32_t output_num;
-} vsi_nn_op_proc_t;
+} VSI_PUBLIC_TYPE vsi_nn_op_proc_t;
 
 /*------------------------------------
                 Functions
diff --git a/src/tim/vx/internal/include/vsi_nn_platform.h b/src/tim/vx/internal/include/vsi_nn_platform.h
index fc41e9f..f5548c8 100644
--- a/src/tim/vx/internal/include/vsi_nn_platform.h
+++ b/src/tim/vx/internal/include/vsi_nn_platform.h
@@ -26,13 +26,6 @@
 
 #include "vsi_nn_feature_config.h"
 
-#ifdef VSI_40BIT_VA_SUPPORT
-#ifdef VX_VA40_EXT_SUPPORT
-#undef VX_VA40_EXT_SUPPORT
-#endif
-#define VX_VA40_EXT_SUPPORT 1
-#endif
-
 #include
 #include
 #include
@@ -48,12 +41,4 @@
  */
 #include "vsi_nn_compatibility.h"
 
-#if defined(__cplusplus)
-extern "C"{
-#endif
-
-#if defined(__cplusplus)
-}
-#endif
-
 #endif
diff --git a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h
index 5da4b82..227b17f 100644
--- a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h
+++ b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h
@@ -87,6 +87,7 @@ typedef enum
     VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP,
     VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422,
     VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422,
+    VSI_NN_SOURCE_FORMAT_IMAGE_NV21,
 } vsi_nn_preprocess_source_format_e;
 
 /**
@@ -98,7 +99,7 @@ typedef struct
     vsi_nn_preprocess_type_e type;
     /** Preprocess parameters */
     void* param;
-} vsi_nn_preprocess_base_t;
+} VSI_PUBLIC_TYPE vsi_nn_preprocess_base_t;
 
 /**
  * Postprocess base structure
@@ -109,7 +110,7 @@ typedef struct
     vsi_nn_postprocess_type_e type;
     /** Postprocess parameters */
     void* param;
-} vsi_nn_postprocess_base_t;
+} VSI_PUBLIC_TYPE vsi_nn_postprocess_base_t;
 
 /**
  * Process dtype convert parameter structure
diff --git a/src/tim/vx/internal/include/vsi_nn_pub.h b/src/tim/vx/internal/include/vsi_nn_pub.h
index d36f570..48525a4 100644
--- a/src/tim/vx/internal/include/vsi_nn_pub.h
+++ b/src/tim/vx/internal/include/vsi_nn_pub.h
@@ -44,6 +44,7 @@
 #include "vsi_nn_types.h"
 #include "vsi_nn_version.h"
 #include "vsi_nn_assert.h"
+#include "vsi_nn_post.h"
 #include "vsi_nn_rnn.h"
 #include "vsi_nn_test.h"
 #include "vsi_nn_pre_post_process.h"
diff --git a/src/tim/vx/internal/include/vsi_nn_rnn.h b/src/tim/vx/internal/include/vsi_nn_rnn.h
index 519d783..21d4009 100644
--- a/src/tim/vx/internal/include/vsi_nn_rnn.h
+++ b/src/tim/vx/internal/include/vsi_nn_rnn.h
@@ -44,7 +44,7 @@ typedef struct
 {
     vsi_nn_tensor_id_t output;
     vsi_nn_tensor_id_t inputs[VSI_NN_MAX_RNN_CONNECTION_INPUTS];
-} vsi_nn_rnn_external_connection_t;
+} VSI_PUBLIC_TYPE vsi_nn_rnn_external_connection_t;
 
 /*-------------------------------------------
 Procedure to prepare input data, return FALSE
diff --git a/src/tim/vx/internal/include/vsi_nn_tensor.h b/src/tim/vx/internal/include/vsi_nn_tensor.h
index 7a33586..5b7bdb9 100644
--- a/src/tim/vx/internal/include/vsi_nn_tensor.h
+++ b/src/tim/vx/internal/include/vsi_nn_tensor.h
@@ -63,7 +63,7 @@ typedef enum
     VSI_NN_DIM_FMT_NHWC = 0x01,
     VSI_NN_DIM_FMT_NA = 0xFF,
     VSI_NN_DIM_FMT_AUTO = VSI_NN_DIM_FMT_NA - 1,
-} vsi_nn_dim_fmt_e;
+} VSI_PUBLIC_TYPE vsi_nn_dim_fmt_e;
 
 /**
  * Quantization type.
@@ -125,7 +125,7 @@ typedef struct vsi_nn_dtype
 #endif
         };
     };
-} vsi_nn_dtype_t;
+} VSI_PUBLIC_TYPE vsi_nn_dtype_t;
 
 /**
  * Tensor Attribute
@@ -150,15 +150,13 @@ typedef struct vsi_nn_tensor_attr
 #ifdef VX_CREATE_TENSOR_SUPPORT_PHYSICAL
     vsi_memory_type_e vsi_memory_type;
 #endif
-#if VX_STREAM_PROCESSOR_SUPPORT
-    vsi_bool is_dummy;
-#endif
-} vsi_nn_tensor_attr_t;
+    // DO NOT modify this struct.
+} VSI_PUBLIC_TYPE vsi_nn_tensor_attr_t;
 
 /**
  * Tensor structure
  */
-struct _vsi_nn_tensor
+struct VSI_PUBLIC_TYPE _vsi_nn_tensor
 {
     /** Tensor attributes */
     vsi_nn_tensor_attr_t attr;
@@ -168,6 +166,7 @@ struct _vsi_nn_tensor
     vx_weights_biases_parameter wb;
     /** Mark tensor swapped by vxSwapTensor */
     int8_t is_swapped;
+    // DO NOT modify this struct.
 };
 
 /**
diff --git a/src/tim/vx/internal/include/vsi_nn_tensor_util.h b/src/tim/vx/internal/include/vsi_nn_tensor_util.h
index 1083d21..4b997f3 100644
--- a/src/tim/vx/internal/include/vsi_nn_tensor_util.h
+++ b/src/tim/vx/internal/include/vsi_nn_tensor_util.h
@@ -321,10 +321,38 @@ OVXLIB_API vsi_status vsi_nn_CopyDataToTensor
     );
 
 /**
- * Flush Handle
- * If you swap the handle of the tensor, you should flush it.
+ * Swap a tensor's handle
+ * The current handle is swapped out through old_ptr for read/write access,
+ * and new_ptr is swapped in to become the tensor's handle.
  *
- * @param[in] tensor Tensor handle.
+ * The application SHOULD manage memory only through handles it created
+ * itself, and must never free or write data through a handle allocated by
+ * OVXLIB.
+ *
+ * OVXLIB no longer maintains the original handle if new_ptr == NULL.
+ *
+ * Before freeing data in a handle it allocated, the application should call
+ * vsi_nn_SwapHandle(tensor, NULL, FALSE, &prev_ptr) to take back control of
+ * the handle.
+ *
+ * @param[in] tensor Tensor.
+ * @param[in] new_ptr New handle of the tensor.
+ * @param[in] is_new_ptr_malloc_by_ovxlib Whether new_ptr was allocated by
+ *            OVXLIB (only meaningful when new_ptr is not NULL).
+ * @param[out] old_ptr Old handle of the tensor.
+ *
+ * @return VSI_SUCCESS on success, or error code otherwise.
+ */
+OVXLIB_API vsi_status vsi_nn_SwapHandle
+(
+    vsi_nn_tensor_t* tensor,
+    void* new_ptr,
+    vsi_bool is_new_ptr_malloc_by_ovxlib,
+    void** old_ptr
+);
+
+/**
+ * Flush Handle
+ * Flush newly written data to the handle in hand;
+ * vsi_nn_FlushHandle() should be called last to complete a data write
+ * operation.
+ *
+ * @param[in] tensor Tensor.
  *
  * @return VSI_SUCCESS on success, or error code otherwise.
  */
@@ -333,6 +361,20 @@ OVXLIB_API vsi_status vsi_nn_FlushHandle
     const vsi_nn_tensor_t * tensor
     );
 
+/**
+ * Invalidate Handle
+ * Invalidate the handle before copying data out of it: before reading data
+ * through a handle, vsi_nn_InvalidateHandle() should be called so the
+ * CPU-side cache is invalidated.
+ *
+ * @param[in] tensor Tensor.
+ *
+ * @return VSI_SUCCESS on success, or error code otherwise.
+ */
+OVXLIB_API vsi_status vsi_nn_InvalidateHandle
+(
+    const vsi_nn_tensor_t* tensor
+);
+
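Taken together, the intended handle protocol looks roughly like this; a hedged sketch (the tensor, the src_data/dst_data buffers, and data_size are assumed to exist, and error checking is elided):

    void *ptr = NULL;
    void *prev_ptr = NULL;

    /* Write path: get the live handle, write through it, then flush. */
    vsi_nn_GetTensorHandle( tensor, &ptr );
    memcpy( ptr, src_data, data_size );
    vsi_nn_FlushHandle( tensor );

    /* Read path: invalidate the CPU cache first, then read. */
    vsi_nn_InvalidateHandle( tensor );
    memcpy( dst_data, ptr, data_size );

    /* Ownership: take the handle back before freeing app-owned memory. */
    vsi_nn_SwapHandle( tensor, NULL, FALSE, &prev_ptr );
    free( prev_ptr ); /* only legal if the application allocated it */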
 /**
  * Get Tensor Handle
  * Get the handle of the tensor
  *
@@ -348,6 +390,34 @@ OVXLIB_API vsi_status vsi_nn_GetTensorHandle
     void** ptr
     );
 
+/**
+ * Get Tensor is_scalar
+ * Get the is_scalar flag of the tensor
+ *
+ * @param[in] tensor Tensor.
+ *
+ * @return is_scalar flag of the tensor.
+ */
+OVXLIB_API int8_t vsi_nn_GetTensorIsScalar
+(
+    vsi_nn_tensor_t* tensor
+);
+
+/**
+ * Set Tensor is_scalar
+ * Set the is_scalar flag for the tensor
+ *
+ * @param[in] tensor Tensor.
+ * @param[in] is_scalar New is_scalar value for the tensor.
+ *
+ * @return VSI_SUCCESS on success, or error code otherwise.
+ */
+OVXLIB_API vsi_status vsi_nn_SetTensorIsScalar
+(
+    vsi_nn_tensor_t* tensor,
+    int8_t is_scalar
+);
+
 OVXLIB_API vsi_status vsi_nn_CopyRawDataToTensor
     (
     vsi_nn_graph_t* graph,
@@ -722,13 +792,6 @@ vsi_nn_tensor_t* vsi_nn_ConstTensorAdd_impl
 #define vsi_nn_ConstTensorAdd(_graph, _output_attr, ...) \
     vsi_nn_ConstTensorAdd_impl(_graph, _output_attr, __VA_ARGS__, END_OF_VARIADIC_ARGUMENTS)
 
-vsi_status vsi_nn_SwapHandle
-    (
-    vsi_nn_tensor_t * tensor,
-    void * new_ptr,
-    void ** old_ptr
-    );
-
 vsi_bool vsi_nn_ConvertTensor
     (
     vsi_nn_graph_t* graph,
diff --git a/src/tim/vx/internal/include/vsi_nn_types.h b/src/tim/vx/internal/include/vsi_nn_types.h
index 0a655c1..6238e4f 100644
--- a/src/tim/vx/internal/include/vsi_nn_types.h
+++ b/src/tim/vx/internal/include/vsi_nn_types.h
@@ -27,7 +27,6 @@
 #include
 #include "vsi_nn_platform.h"
-#include "vsi_nn_feature_config.h"
 
 #if defined(__cplusplus)
 extern "C"{
@@ -109,7 +108,7 @@ typedef enum
     VSI_NN_PAD_AUTO,
     VSI_NN_PAD_VALID,
     VSI_NN_PAD_SAME
-} vsi_nn_pad_e;
+} VSI_PUBLIC_TYPE vsi_nn_pad_e;
 
 /** reduce type enum */
 typedef enum
@@ -142,14 +141,14 @@ typedef enum
 {
     VSI_NN_ROUND_CEIL,
     VSI_NN_ROUND_FLOOR
-} vsi_nn_round_type_e;
+} VSI_PUBLIC_TYPE vsi_nn_round_type_e;
 
 /** Optimize direction */
 typedef enum
 {
     VSI_NN_OPTIMIZE_FORWARD,
     VSI_NN_OPTIMIZE_BACKWARD
-} vsi_nn_opt_direction_e;
+} VSI_PUBLIC_TYPE vsi_nn_opt_direction_e;
 
 #ifdef VX_CREATE_TENSOR_SUPPORT_PHYSICAL
 typedef enum
 {
@@ -195,7 +194,7 @@ typedef enum
 #endif
 
     VSI_NN_TYPE_VDATA = VX_TYPE_USER_STRUCT_START + 0x1,
-}vsi_nn_type_e;
+} VSI_PUBLIC_TYPE vsi_nn_type_e;
 
 typedef int32_t vsi_nn_activation_e; enum
 {
@@ -236,7 +235,7 @@ typedef enum
 {
     VSI_NN_GRAPH_PRELOAD_VIPSRAM,
     VSI_NN_GRAPH_PRELOAD_AXISRAM
-} vsi_nn_graph_attr_preload_type_e;
+} VSI_PUBLIC_TYPE vsi_nn_graph_attr_preload_type_e;
 
 typedef enum _vsi_nn_node_attr_preload_type_e
 {
@@ -257,23 +256,35 @@ typedef enum _vsi_nn_yuv_type
     VSI_NN_YUV_TYPE_UYUV422
 }vsi_nn_yuv_type;
 
+typedef enum _vsi_nn_nv_type
+{
+    VSI_NN_YUV_TYPE_NV12,
+    VSI_NN_YUV_TYPE_NV21
+}vsi_nn_nv_type;
+
+typedef enum _vsi_nn_roi_align_type_e
+{
+    VSI_NN_ROI_ALIGN_ANDROID,
+    VSI_NN_ROI_ALIGN
+} vsi_nn_roi_align_type_e;
+
 /** Deprecated */
 typedef uint32_t vsi_nn_size_t;
 
 /** Tensor id type */
-typedef uint32_t vsi_nn_tensor_id_t;
+typedef uint32_t VSI_PUBLIC_TYPE vsi_nn_tensor_id_t;
 /** Node id type */
 typedef uint32_t vsi_nn_node_id_t;
 
 /** @see _vsi_nn_graph */
-typedef struct _vsi_nn_graph vsi_nn_graph_t;
+typedef struct _vsi_nn_graph VSI_PUBLIC_TYPE vsi_nn_graph_t;
 /** @see _vsi_nn_node */
-typedef struct _vsi_nn_node vsi_nn_node_t;
+typedef struct _vsi_nn_node VSI_PUBLIC_TYPE vsi_nn_node_t;
 /** @see _vsi_nn_tensor */
-typedef struct _vsi_nn_tensor vsi_nn_tensor_t;
+typedef struct _vsi_nn_tensor VSI_PUBLIC_TYPE vsi_nn_tensor_t;
 
 #if defined(__cplusplus)
 }
diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h
index 5079bfe..280f0cc 100644
--- a/src/tim/vx/internal/include/vsi_nn_version.h
+++ b/src/tim/vx/internal/include/vsi_nn_version.h
@@ -33,7 +33,7 @@ extern "C"{
 
 #define VSI_NN_VERSION_MAJOR 1
 #define VSI_NN_VERSION_MINOR 1
-#define VSI_NN_VERSION_PATCH 57
+#define VSI_NN_VERSION_PATCH 74
 #define VSI_NN_VERSION \
     (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
 
diff --git a/src/tim/vx/internal/src/Android.mk b/src/tim/vx/internal/src/Android.mk
new file mode 100644
index
0000000..a1b3683 --- /dev/null +++ b/src/tim/vx/internal/src/Android.mk @@ -0,0 +1,144 @@ +# +# Build Vivante chipinfo for android. +# +LOCAL_PATH:= $(call my-dir) +include $(CLEAR_VARS) + +ifeq ($(AQROOT),) +$(error Please set AQROOT env first) +endif + +include $(AQROOT)/Android.mk.def + +ifeq ($(PLATFORM_VENDOR),1) +LOCAL_VENDOR_MODULE := true +endif + +LOCAL_SRC_FILES := \ + vsi_nn_context.c \ + vsi_nn_client_op.c \ + vsi_nn_graph.c \ + vsi_nn_node_attr_template.c \ + vsi_nn_node.c \ + vsi_nn_ops.c \ + vsi_nn_daemon.c \ + vsi_nn_tensor.c \ + vsi_nn_version.c \ + vsi_nn_rnn.c \ + vsi_nn_rnn_helper.c \ + vsi_nn_internal_node.c \ + vsi_nn_log.c \ + vsi_nn_graph_optimization.c \ + vsi_nn_pre_post_process.c + + +LOCAL_SRC_FILES += \ + utils/vsi_nn_code_generator.c \ + utils/vsi_nn_binary_tree.c \ + utils/vsi_nn_map.c \ + utils/vsi_nn_hashmap.c \ + utils/vsi_nn_link_list.c \ + utils/vsi_nn_math.c \ + utils/vsi_nn_dtype.c \ + utils/vsi_nn_dtype_util.c \ + utils/vsi_nn_shape_util.c \ + utils/vsi_nn_limits.c \ + utils/vsi_nn_tensor_op.c \ + utils/vsi_nn_util.c \ + utils/vsi_nn_dlfcn.c \ + utils/vsi_nn_constraint_check.c + + +LOCAL_SRC_FILES += \ + quantization/vsi_nn_dynamic_fixed_point.c \ + quantization/vsi_nn_asymmetric_affine.c \ + quantization/vsi_nn_perchannel_symmetric_affine.c \ + + +LOCAL_SRC_FILES += \ + post/vsi_nn_post_fasterrcnn.c \ + post/vsi_nn_post_cmupose.c + +LOCAL_SRC_FILES += \ + cpu_backend/vsi_nn_cpu_backend.c \ + cpu_backend/vsi_nn_cpu_backend_conv2d.c \ + cpu_backend/vsi_nn_cpu_backend_deconv2d.c \ + cpu_backend/npuref_interface.c + + +LOCAL_SRC_FILES += libnnext/vsi_nn_libnnext_resource.c \ + libnnext/vsi_nn_vxkernel.c + +LOCAL_SRC_FILES += kernel/vsi_nn_kernel.c \ + kernel/vsi_nn_kernel_util.c \ + kernel/vsi_nn_kernel_backend.c \ + kernel/vsi_nn_kernel_eltwise.c \ + kernel/vsi_nn_kernel_selector.c \ + kernel/vsi_nn_kernel_node.c \ + kernel/vsi_nn_kernel_param.c \ + kernel/vsi_nn_kernel_gpu_shape_optimize.c \ + kernel/vsi_nn_kernel_lut.c \ + kernel/vsi_nn_spinst.c \ + kernel/vsi_nn_sp_unit_operation.c \ + kernel/vsi_nn_sp_lut.c \ + kernel/vsi_nn_gpu.c + +LIBNNEXT_KERNEL_SOURCES := $(wildcard $(LOCAL_PATH)/libnnext/ops/kernel/*.c) +LOCAL_SRC_FILES += $(LIBNNEXT_KERNEL_SOURCES:$(LOCAL_PATH)/%=%) + +KERNEL_SOURCES := $(wildcard $(LOCAL_PATH)/kernel/cl/*.c) +KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/kernel/cpu/*.c) +KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/kernel/evis/*.c) +KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/kernel/vx/*.c) +KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/kernel/sp/*.c) +KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/custom/ops/*.c) +KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/custom/ops/kernel/evis/*.c) +KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/custom/ops/kernel/cl/*.c) +KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/custom/ops/kernel/cpu/*.c) +KERNEL_SOURCES += $(wildcard $(LOCAL_PATH)/custom/ops/kernel/sp/*.c) +LOCAL_SRC_FILES += $(KERNEL_SOURCES:$(LOCAL_PATH)/%=%) + +OPERATION_SOURCES := $(wildcard $(LOCAL_PATH)/ops/*.c) +LOCAL_SRC_FILES += $(OPERATION_SOURCES:$(LOCAL_PATH)/%=%) + + +LOCAL_SHARED_LIBRARIES := \ + liblog \ + libjpeg \ + libGAL \ + libOpenVX \ + libVSC \ + libdl + +LOCAL_C_INCLUDES += \ + external/libjpeg-turbo \ + $(AQROOT)/sdk/inc/CL \ + $(AQROOT)/sdk/inc/VX \ + $(AQROOT)/sdk/inc/ \ + $(AQROOT)/sdk/inc/HAL \ + $(LOCAL_PATH)/../include \ + $(LOCAL_PATH)/../include/ops \ + $(LOCAL_PATH)/../include/utils \ + $(LOCAL_PATH)/../include/infernce \ + $(LOCAL_PATH)/../include/client \ + $(LOCAL_PATH)/../include/cpu_backend \ + 
$(LOCAL_PATH)/../include/libnnext \ + $(LOCAL_PATH)/../src + +LOCAL_CFLAGS := \ + -DLINUX \ + -D'OVXLIB_API=__attribute__((visibility("default")))' \ + -DANDROID_SDK_VERSION=$(PLATFORM_SDK_VERSION)\ + -Wno-sign-compare \ + -Wno-implicit-function-declaration \ + -Wno-sometimes-uninitialized \ + -Wno-unused-parameter \ + -Wno-enum-conversion \ + -Wno-missing-field-initializers \ + -Wno-tautological-compare \ + -Wno-missing-braces + +LOCAL_MODULE:= libovxlib +LOCAL_MODULE_TAGS := optional +LOCAL_PRELINK_MODULE := false +include $(BUILD_SHARED_LIBRARY) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_sample_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_sample_cpu.c new file mode 100644 index 0000000..a1e50a4 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_sample_cpu.c @@ -0,0 +1,184 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_test.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vsi_nn_vxkernel.h" + +#define _CPU_ARG_NUM (1) +#define _CPU_INPUT_NUM (2) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME ("com.vivantecorp.extension.CustomSampleVXC") + +#define SCALAR_INPUT_AXIS (3) + +__BEGIN_DECLS + +DEF_KERNEL_EXECUTOR(_softmax_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t* param, + size_t param_size + ) +{ + vsi_status status = VX_SUCCESS; + float *buffer[_CPU_IO_NUM] = {NULL}; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *attr[_CPU_IO_NUM] = {NULL}; + uint32_t i = 0, out_elements = 0; + int32_t axis; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; // input0 + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; // input1 + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; // output + + attr[0] = vsi_nn_kernel_tensor_attr_create(tensors[0]); + attr[1] = vsi_nn_kernel_tensor_attr_create(tensors[1]); + attr[2] = vsi_nn_kernel_tensor_attr_create(tensors[2]); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + /* alloc the float32 data buffer */ + buffer[0] = (float *)vsi_nn_kernel_tensor_create_buffer(tensors[0], attr[0], TRUE); + CHECK_PTR_FAIL_GOTO(buffer[0], "Create input0 buffer fail.", final); + + buffer[1] = (float *)vsi_nn_kernel_tensor_create_buffer(tensors[1], attr[1], TRUE); + CHECK_PTR_FAIL_GOTO(buffer[1], "Create input1 buffer fail.", final); + + out_elements = (uint32_t)vsi_nn_kernel_tensor_attr_get_size(attr[2]); + buffer[2] = (float *)malloc(out_elements * sizeof(float)); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); + memset(buffer[2], 0, out_elements * sizeof(float)); + + /* CPU implement */ + for(i = 0; i < out_elements; i++) + { + buffer[2][i] = buffer[0][i] + buffer[1][0]; + } + + status = vsi_nn_kernel_tensor_write_from_float( + tensors[2], attr[2], buffer[2], out_elements ); +final: + for(i = 0; i < _CPU_IO_NUM; i ++) + { + if(buffer[i]) + { + free(buffer[i]); + } + vsi_nn_kernel_tensor_attr_release(&attr[i]); + } + return status; +} + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} +}; + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _softmax_compute, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + 
vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + status = _query_kernel(inputs, outputs, kernel); + if(status != VSI_SUCCESS) + { + return NULL; + } + + node = vsi_nn_kernel_create_node(graph, kernel); + if(node == NULL) + { + return NULL; + } + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io(backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM); + backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param(node, backend_params, _CPU_PARAM_NUM); + vsi_nn_kernel_scalar_release(&backend_params[SCALAR_INPUT_AXIS]); + + return node; +} + +__END_DECLS + +REGISTER_BACKEND_CPU( custom_sample, _setup ) diff --git a/src/tim/vx/internal/src/custom/ops/op_custom_sample.c b/src/tim/vx/internal/src/custom/ops/op_custom_sample.c new file mode 100644 index 0000000..1459539 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/op_custom_sample.c @@ -0,0 +1,103 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_log.h" +#include "kernel/vsi_nn_kernel.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_kernel_param_t *param = NULL; + vsi_nn_custom_sample_param *p; + p = &self->nn_param.custom_sample; + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32(param, "axis", p->axis); + + self->n = (vx_node)vsi_nn_kernel_selector( + self->graph, + "custom_sample", + inputs, 2, + outputs, 1, + param); + + vsi_nn_kernel_param_release(¶m); + return VSI_SUCCESS; +} + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check params. 
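+      For a real operation this is where the I/O contract would be
+      validated before graph setup continues, e.g. that the two input
+      tensors have matching dim_num and dtypes the kernel supports.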
*/ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + memmove(outputs[0]->attr.size, inputs[0]->attr.size, + inputs[0]->attr.dim_num * sizeof(vsi_size_t)); + } + return TRUE; +} + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CUSTOM_SAMPLE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 2, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/kernel/cl/avg_pool3d_cl.c b/src/tim/vx/internal/src/kernel/cl/avg_pool3d_cl.c new file mode 100644 index 0000000..c0ed53e --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/avg_pool3d_cl.c @@ -0,0 +1,354 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
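+ *
+ * Kernels are registered per dtype pair and looked up through a packed
+ * hash key: AVG_POOL3D_HASH_KEY(IN, OUT) = (IN << 8) | OUT. _query_kernel()
+ * first folds concrete dtypes into the registered classes (e.g. F16 -> F32,
+ * U8 -> U32, I8/I16 -> I32), then scans _avg_pool3d_kernel_map for the key.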
+ */ +typedef enum +{ + INTERNAL_KERNEL_AVG_POOL3D, +} _internal_kernel_e; + +#define _AVG_POOL3D_KERNEL_SOURCE_NAME "avg_pool3d" + +// Add kernel hashtable here +#define AVG_POOL3D_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) +#define AVG_POOL3D_KERNELS( IN_DTYPE, OUT_DTYPE ) \ + { AVG_POOL3D_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("cl.avg_pool3d_"#IN_DTYPE"to"#OUT_DTYPE), \ + _AVG_POOL3D_KERNEL_SOURCE_NAME }, \ + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _avg_pool3d_kernel_map[] = +{ + // Register kernel here + AVG_POOL3D_KERNELS( F32, F32 ) + AVG_POOL3D_KERNELS( F32, U32 ) + AVG_POOL3D_KERNELS( F32, I32 ) + AVG_POOL3D_KERNELS( U32, U32 ) + AVG_POOL3D_KERNELS( U32, F32 ) + AVG_POOL3D_KERNELS( I32, I32 ) + AVG_POOL3D_KERNELS( I32, F32 ) + AVG_POOL3D_KERNELS( BF16, BF16 ) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _avg_pool3d_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _AVG_POOL3D_PARAM_NUM _cnt_of_array( _avg_pool3d_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_avg_pool3d_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vx_tensor output = (vx_tensor)param[1]; + vx_scalar depth_out = (vx_scalar)param[14]; + int32_t depth_out_value; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_size_array_t *output_shape = NULL; + + vxReadScalarValue(depth_out, &depth_out_value); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = depth_out_value; + gpu_param.global_size[0] = (output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0]; + gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = (output_shape->data[2] + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2]; + status = 
vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + + return status; +} /* _avg_pool3d_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _avg_pool3d_kernel_map; + size_t kernel_map_size = _cnt_of_array( _avg_pool3d_kernel_map ); + vx_param_description_t * param_def = _avg_pool3d_kernel_param_def; + vx_kernel_initialize_f initializer = _avg_pool3d_initializer; + + uint32_t key = 0; + uint32_t i = 0; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); +#define _PACK_SELECT_KEY( in_dtype, out_dtype ) \ + (( in_dtype ) | (out_dtype << 8 )) + switch (_PACK_SELECT_KEY(in_dtype, out_dtype)) + { + case _PACK_SELECT_KEY(F32, F32): + case _PACK_SELECT_KEY(F16, F16): + case _PACK_SELECT_KEY(F32, F16): + case _PACK_SELECT_KEY(F16, F32): + key = AVG_POOL3D_HASH_KEY( F32, F32); + break; + case _PACK_SELECT_KEY(F32, U8): + case _PACK_SELECT_KEY(F16, U8): + key = AVG_POOL3D_HASH_KEY( F32, U32); + break; + case _PACK_SELECT_KEY(F32, I8): + case _PACK_SELECT_KEY(F32, I16): + case _PACK_SELECT_KEY(F16, I8): + case _PACK_SELECT_KEY(F16, I16): + key = AVG_POOL3D_HASH_KEY( F32, I32); + break; + case _PACK_SELECT_KEY(U8, U8): + key = AVG_POOL3D_HASH_KEY( U32, U32); + break; + case _PACK_SELECT_KEY(U8, F16): + case _PACK_SELECT_KEY(U8, F32): + key = AVG_POOL3D_HASH_KEY( U32, F32); + break; + case _PACK_SELECT_KEY(I8, I8): + case _PACK_SELECT_KEY(I8, I16): + case _PACK_SELECT_KEY(I16, I8): + case _PACK_SELECT_KEY(I16, I16): + key = AVG_POOL3D_HASH_KEY( I32, I32); + break; + case _PACK_SELECT_KEY(I8, F16): + case _PACK_SELECT_KEY(I8, F32): + case _PACK_SELECT_KEY(I16, F16): + case _PACK_SELECT_KEY(I16, F32): + key = AVG_POOL3D_HASH_KEY( I32, F32); + break; + default: + key = AVG_POOL3D_HASH_KEY( in_dtype, out_dtype); + break; + } +#undef _PACK_SELECT_KEY + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _avg_pool3d_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_AVG_POOL3D_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t width = (int32_t)inputs[0]->attr.size[0]; + int32_t height = (int32_t)inputs[0]->attr.size[1]; + int32_t ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x"); + int32_t 
ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y"); + int32_t ksize_z = vsi_nn_kernel_param_get_int32(params, "ksize_z"); + int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x"); + int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y"); + int32_t stride_z = vsi_nn_kernel_param_get_int32(params, "stride_z"); + int32_t pad_left = vsi_nn_kernel_param_get_int32(params, "pad_left"); + int32_t pad_top = vsi_nn_kernel_param_get_int32(params, "pad_top"); + int32_t pad_front = vsi_nn_kernel_param_get_int32(params, "pad_front"); + int32_t depth_in = vsi_nn_kernel_param_get_int32(params, "depth_in"); + int32_t depth_out = vsi_nn_kernel_param_get_int32(params, "depth_out"); + int32_t count_include_pad = vsi_nn_kernel_param_get_int32(params, "count_include_pad"); + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + + if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num )) + { + return NULL; + } + + outputScale = 1.0f / outputScale; + inputTail = -(inputTail * inputScale); + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + uint32_t index = 2; + vsi_nn_kernel_node_pack_io( node_params, _AVG_POOL3D_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_z ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_z ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_left ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_top ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_front ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &depth_in ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &depth_out ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputTail ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputTail ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &count_include_pad ); + /* Pass parameters to node. 
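+               The scalar parameters are released immediately afterwards:
+               the node keeps its own reference once
+               vsi_nn_kernel_node_pass_param() has run, so the local scalar
+               handles are no longer needed.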
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _AVG_POOL3D_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + vsi_nn_kernel_scalar_release( &node_params[13] ); + vsi_nn_kernel_scalar_release( &node_params[14] ); + vsi_nn_kernel_scalar_release( &node_params[15] ); + vsi_nn_kernel_scalar_release( &node_params[16] ); + vsi_nn_kernel_scalar_release( &node_params[17] ); + vsi_nn_kernel_scalar_release( &node_params[18] ); + vsi_nn_kernel_scalar_release( &node_params[19] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( avg_pool3d, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c b/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c new file mode 100644 index 0000000..bda96ff --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/bilinear_grid_sample_cl.c @@ -0,0 +1,381 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
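+ *
+ * The lookup key packs all three dtypes: (IN1 << 20) | (IN0 << 8) | OUT.
+ * _query_kernel() maps F16 onto the F32 kernels, so only the F32/F32/F32
+ * and U8/U8/U8 variants need to be registered in the kernel map below.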
+ */ +typedef enum +{ + INTERNAL_KERNEL_BILINEAR_GRID_SAMPLE, +} _internal_kernel_e; + +#define _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE() "bilinear_grid_sample" + +#define STR(a) #a + +// Add kernel hashtable here +#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + ((IN1_DTYPE << 20) | (IN0_DTYPE << 8) | (OUT_DTYPE)) + +#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + { \ + BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \ + CVIVANTE_NAMESPACE("cl.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ + _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE() \ + } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _bilinear_grid_sample_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP(F32, F32, F32 ), + PACK_KERNEL_MAP(U8, U8, U8), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _bilinear_grid_sample_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _BILINEAR_GRID_SAMPLE_PARAM_NUM 8 +#define _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM \ + _cnt_of_array(_bilinear_grid_sample_kernel_param_def) + +#define SCALAR_HALF_INPUT0_W (3) +#define SCALAR_HALF_INPUT0_H (4) +#define SCALAR_ADD_VALUE_W (5) +#define SCALAR_ADD_VALUE_H (6) +#define SCALAR_DEPTH (7) +#define SCALAR_INPUT0_SCALE (8) +#define SCALAR_INPUT0_TAIL (9) +#define SCALAR_INPUT1_SCALE (10) +#define SCALAR_INPUT1_TAIL (11) +#define SCALAR_OUTPUT_SCALE (12) +#define SCALAR_OUTPUT_TAIL (13) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = {3, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}; + vsi_nn_kernel_tensor_attr_t* output_attr = NULL; + vsi_size_array_t* out_shape = NULL; + + output_attr = + vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]); + CHECK_PTR_FAIL_GOTO(output_attr, "Create tensor attr buffer fail.", final); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = 2; + gpu_param.global_size[0] = + gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1) / + gpu_param.global_scale[0], + 4); + gpu_param.global_size[1] = + ((out_shape->data[1] + gpu_param.global_scale[1] - 1) / + gpu_param.global_scale[1]); + gpu_param.global_size[2] = 1; + status = vsi_nn_kernel_gpu_config(node, &gpu_param); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) \ + if (_PTR) { \ + vsi_nn_kernel_tensor_attr_release(&_PTR); \ + _PTR = NULL; \ + } + 
SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _bilinear_grid_sample_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool* is_use_u8_kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype, in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _bilinear_grid_sample_kernel_map; + size_t kernel_map_size = _cnt_of_array( _bilinear_grid_sample_kernel_map ); + vx_param_description_t * param_def = _bilinear_grid_sample_kernel_param_def; + size_t param_def_size = _cnt_of_array(_bilinear_grid_sample_kernel_param_def); + vx_kernel_initialize_f initializer = _bilinear_grid_sample_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in0_dtype) { + in0_dtype = F32; + } + if (F16 == in1_dtype) { + in1_dtype = F32; + } + if (F16 == out_dtype) { + out_dtype = F32; + } + if ((U8 == in0_dtype) || (U8 == out_dtype)) { + param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM; + *is_use_u8_kernel = TRUE; + } else { + param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_NUM; + *is_use_u8_kernel = FALSE; + } + + key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_nn_kernel_node_t node = NULL; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM]; + vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + uint32_t final_in1_rank = 0; + vsi_nn_tensor_t* rs_tensors = NULL; + vsi_nn_tensor_t* final_tensors[3] = {NULL}; + vsi_size_t in0_width = inputs[0]->attr.size[0]; + vsi_size_t in0_height = inputs[0]->attr.size[1]; + float input0_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input0_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0_tail = -(input0_zp * input0_scale); + float input1_zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]); + float input1_scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1_tail = -(input1_zp * input1_scale); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + vsi_bool is_use_u8_kernel = FALSE; + int32_t align_corners = + vsi_nn_kernel_param_get_int32(params, "align_corners"); + uint32_t pad_val = 0; + int32_t depth = 0; 
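+    /* Normalized grid coords in [-1, 1] are mapped to input pixels as
+     * x_in = gx * half_input0_w + add_float_value_w (and likewise for y):
+     * with align_corners, half = (W - 1) / 2 and add = half; without it,
+     * half = W / 2 and add = half - 0.5, matching the branches below. */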
+ vsi_nn_kernel_dtype_e in0_dtype; + + float half_input0_w, half_input0_h, add_float_value_w, add_float_value_h; + + // Check if gpu can support the size + if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size, + inputs[0]->attr.dim_num)) { + return NULL; + } + + if (!vsi_nn_kernel_gpu_check_shape(inputs[1]->attr.size, + inputs[1]->attr.dim_num)) { + return NULL; + } + + final_tensors[0] = inputs[0]; + + if (inputs[1]->attr.dim_num >= 3) { + + final_shape[0] = inputs[1]->attr.size[1] * inputs[1]->attr.size[0]; + final_shape[1] = inputs[1]->attr.size[2]; + final_shape[2] = 1; + final_shape[3] = inputs[1]->attr.dim_num > 3 ? inputs[1]->attr.size[3] : 1; + final_in1_rank = + inputs[1]->attr.dim_num == 3 ? 2 : inputs[1]->attr.dim_num; + if (!vsi_nn_kernel_gpu_check_shape(final_shape, final_in1_rank)) { + return NULL; + } + + rs_tensors = vsi_nn_reshape_tensor(graph, inputs[1], final_shape, final_in1_rank); + final_tensors[1] = rs_tensors; + } else { + final_tensors[1] = inputs[1]; + } + final_tensors[2] = outputs[0]; + + if (align_corners) { + half_input0_w = ((float)in0_width - 1.0f) * 0.5f; + half_input0_h = ((float)in0_height - 1.0f) * 0.5f; + add_float_value_w = half_input0_w; + add_float_value_h = half_input0_h; + } else { + half_input0_w = (float)in0_width * 0.5f; + half_input0_h = (float)in0_height * 0.5f; + add_float_value_w = half_input0_w - 0.5f; + add_float_value_h = half_input0_h - 0.5f; + } + + depth = (int32_t)inputs[0]->attr.size[2]; + in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type); + if (U8 == in0_dtype) { + pad_val = inputs[0]->attr.dtype.zero_point; + } + status = _query_kernel(kernel, inputs, outputs, &is_use_u8_kernel); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + size_t node_params_num = _BILINEAR_GRID_SAMPLE_PARAM_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM, + final_tensors, input_num, &final_tensors[2], output_num ); + node_params[SCALAR_HALF_INPUT0_W] = vsi_nn_kernel_scalar_create( graph, F32, &half_input0_w ); + node_params[SCALAR_HALF_INPUT0_H] = vsi_nn_kernel_scalar_create( graph, F32, &half_input0_h ); + node_params[SCALAR_ADD_VALUE_W] = vsi_nn_kernel_scalar_create( graph, F32, &add_float_value_w ); + node_params[SCALAR_ADD_VALUE_H] = vsi_nn_kernel_scalar_create( graph, F32, &add_float_value_h ); + node_params[SCALAR_DEPTH] = vsi_nn_kernel_scalar_create( graph, I32, &depth ); + if (is_use_u8_kernel) + { + node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0_scale ); + node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input0_tail ); + node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input1_scale ); + node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input1_tail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); + node_params_num = _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM; + } + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT(status == VSI_SUCCESS); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_HALF_INPUT0_W]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_HALF_INPUT0_H]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_W]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_H]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_DEPTH]); + if (is_use_u8_kernel) { + vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_SCALE]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_TAIL]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_SCALE]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_TAIL]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_SCALE]); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_TAIL]); + } + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = pad_val; + status = vxSetNodeAttribute( + (vx_node)node, VX_NODE_BORDER, &border, sizeof(border)); + CHECK_STATUS(status); + } + } + } + + vsi_safe_release_tensor(rs_tensors); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( bilinear_grid_sample, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/clip_cl.c b/src/tim/vx/internal/src/kernel/cl/clip_cl.c index 38defcc..4b518b2 100644 --- a/src/tim/vx/internal/src/kernel/cl/clip_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/clip_cl.c @@ -35,6 +35,7 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" __BEGIN_DECLS @@ -258,19 +259,36 @@ static vsi_nn_kernel_node_t _setup float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); float min_value = vsi_nn_kernel_param_get_float32( params, "min_value" ); float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" ); + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t new_rank = 0; + vsi_bool ret = TRUE; - outputScale = 1.0f / outputScale; - inputTail = -(inputTail * inputScale); + ret = vsi_nn_kernel_optimize_element_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank); - if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, - inputs[0]->attr.dim_num ) ) + if ( !ret ) { return NULL; } - image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shape, new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shape, new_rank ); - status = _query_kernel( kernel, inputs, outputs, image_2d); + outputScale = 1.0f / outputScale; + inputTail = -(inputTail * inputScale); + + if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size, + reshape_tensors[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1); + + status = _query_kernel( kernel, reshape_tensors, &reshape_tensors[1], image_2d); if ( VSI_SUCCESS == status ) { @@ -279,7 +297,7 @@ static vsi_nn_kernel_node_t _setup { /* Set inputs and outputs */ vsi_nn_kernel_node_pack_io( node_params, _CLIP_PARAM_NUM, - inputs, input_num, outputs, output_num ); + reshape_tensors, input_num, &reshape_tensors[1], output_num ); node_params[SCALAR_MIN_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &min_value ); node_params[SCALAR_MAX_VALUE] =
vsi_nn_kernel_scalar_create( graph, F32, &max_value ); node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale ); @@ -297,6 +315,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] ); } } + + vsi_safe_release_tensor( reshape_tensors[0] ); + vsi_safe_release_tensor( reshape_tensors[1] ); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c index 4be70d9..8fec39b 100644 --- a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c @@ -34,6 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" __BEGIN_DECLS @@ -287,7 +288,7 @@ static vsi_status _query_kernel int i; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); - input1_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && output_dtype == I8) @@ -335,31 +336,85 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - int32_t operation = 0; + int32_t operation = vsi_nn_kernel_param_get_int32( params, "operation" ); + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t new_rank = 0; + vsi_bool ret = FALSE; float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale; float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; - if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + ret = vsi_nn_kernel_optimize_eltwise_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + if ( ret ) { - return NULL; + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], new_rank ); + +#define _swap_tensor(a, b, tmp) \ + do { \ + tmp = a; \ + a = b; \ + b = tmp; \ + } while(0) + + if (shapes[1][3] > shapes[0][3] && new_rank == 4) + { + vsi_nn_tensor_t* reshape_tmp; + _swap_tensor(reshape_tensors[0], reshape_tensors[1], reshape_tmp); + + if (VSI_NN_RELATIONAL_OPS_GREAT == operation) + { + operation = VSI_NN_RELATIONAL_OPS_LESS; + } + else if (VSI_NN_RELATIONAL_OPS_LESS == operation) + { + operation = VSI_NN_RELATIONAL_OPS_GREAT; + } + else if (VSI_NN_RELATIONAL_OPS_GREAT_EQUAL == operation) + { + operation = VSI_NN_RELATIONAL_OPS_LESS_EQUAL; + } + else if (VSI_NN_RELATIONAL_OPS_LESS_EQUAL == operation) + { + operation = VSI_NN_RELATIONAL_OPS_GREAT_EQUAL; + } + } + +#undef _swap_tensor + } + else + { + goto final; } - operation = vsi_nn_kernel_param_get_int32( params, "operation" ); + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size, + reshape_tensors[2]->attr.dim_num ) ) + { + 
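/* A minimal sketch of why the operand swap above must also mirror the
 * relational op: after the swap the kernel evaluates b OP a instead of
 * a OP b, and only the ordered comparisons are sensitive to that, e.g.
 *
 *     a >  b   is equivalent to   b <  a      // GREAT      -> LESS
 *     a <= b   is equivalent to   b >= a      // LESS_EQUAL -> GREAT_EQUAL
 *
 * EQUAL and NOT_EQUAL are symmetric, so they are left untouched.
 */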
goto final; + } - image_2d = (outputs[0]->attr.dim_num == 2); - status = _query_kernel( inputs, outputs, operation, image_2d, kernel ); - if( VSI_SUCCESS == status) + image_2d = (reshape_tensors[2]->attr.dim_num == 2 || reshape_tensors[2]->attr.size[2] == 1); + status = _query_kernel( reshape_tensors, &reshape_tensors[2], operation, image_2d, kernel ); + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, - inputs, 2, outputs, 1 ); + reshape_tensors, 2, &reshape_tensors[2], 1 ); node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale ); node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create( @@ -379,6 +434,12 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_TAIL] ); } } + +final: + vsi_safe_release_tensor( reshape_tensors[0] ); + vsi_safe_release_tensor( reshape_tensors[1] ); + vsi_safe_release_tensor( reshape_tensors[2] ); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c index 91746ab..0aac099 100644 --- a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c @@ -69,14 +69,19 @@ static const struct { { HASH_CUMSUM_KERNELS(0, U8, U8) HASH_CUMSUM_KERNELS(0, F32, F32) + HASH_CUMSUM_KERNELS(0, F32, U8) HASH_CUMSUM_KERNELS(1, U8, U8) HASH_CUMSUM_KERNELS(1, F32, F32) + HASH_CUMSUM_KERNELS(1, F32, U8) HASH_CUMSUM_KERNELS(2, U8, U8) HASH_CUMSUM_KERNELS(2, F32, F32) + HASH_CUMSUM_KERNELS(2, F32, U8) HASH_CUMSUM_KERNELS_2D(0, U8, U8) HASH_CUMSUM_KERNELS_2D(0, F32, F32) + HASH_CUMSUM_KERNELS_2D(0, F32, U8) HASH_CUMSUM_KERNELS_2D(1, U8, U8) HASH_CUMSUM_KERNELS_2D(1, F32, F32) + HASH_CUMSUM_KERNELS_2D(1, F32, U8) }; /* diff --git a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c index 7e1d681..5d29c67 100644 --- a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c @@ -56,6 +56,10 @@ typedef enum UNARY_RCP, UNARY_SIGN, UNARY_SOFTSIGN, + UNARY_ATAN, + UNARY_ATANH, + UNARY_ACOSH, + UNARY_INVERSE_SIGMOID, } unary_type_e; /* @@ -100,10 +104,18 @@ typedef enum #define RCP_OPERATION rcp #define SIGN_OPERATION sign #define SOFTSIGN_OPERATION softsign +#define ATAN_OPERATION atan +#define ATANH_OPERATION atanh +#define ACOSH_OPERATION acosh +#define INVERSE_SIGMOID_OPERATION inverse_sigmoid -#define ADD_UNARY_SH_KERNELS(name, src_type, dst_type) \ - TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, src_type, dst_type) \ - TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, src_type, dst_type) +#define ADD_UNARY_SH_KERNELS(name) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F32, F32) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F32, F32) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, U8, U8) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, U8, U8) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, U8, F32) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, U8, F32) static const struct { uint32_t key; @@ -111,39 +123,28 @@ static const struct { const char* source_name; } kernel_map[] = { - ADD_UNARY_SH_KERNELS(SIN, F32, F32) - ADD_UNARY_SH_KERNELS(COS, F32, F32) - ADD_UNARY_SH_KERNELS(EXP, F32, F32) - ADD_UNARY_SH_KERNELS(LOG, F32, F32) - ADD_UNARY_SH_KERNELS(NEG, F32, F32) - 
ADD_UNARY_SH_KERNELS(HSIGMOID, F32, F32) - ADD_UNARY_SH_KERNELS(MISH, F32, F32) - ADD_UNARY_SH_KERNELS(ROUND, F32, F32) - ADD_UNARY_SH_KERNELS(GELU, F32, F32) - ADD_UNARY_SH_KERNELS(HGELU, F32, F32) - ADD_UNARY_SH_KERNELS(SELU, F32, F32) - ADD_UNARY_SH_KERNELS(CELU, F32, F32) - ADD_UNARY_SH_KERNELS(RCP, F32, F32) - ADD_UNARY_SH_KERNELS(SIGN, F32, F32) - ADD_UNARY_SH_KERNELS(SOFTSIGN, F32, F32) + ADD_UNARY_SH_KERNELS(SIN) + ADD_UNARY_SH_KERNELS(COS) + ADD_UNARY_SH_KERNELS(EXP) + ADD_UNARY_SH_KERNELS(LOG) + ADD_UNARY_SH_KERNELS(NEG) + ADD_UNARY_SH_KERNELS(HSIGMOID) + ADD_UNARY_SH_KERNELS(MISH) + ADD_UNARY_SH_KERNELS(ROUND) + ADD_UNARY_SH_KERNELS(GELU) + ADD_UNARY_SH_KERNELS(HGELU) + ADD_UNARY_SH_KERNELS(SELU) + ADD_UNARY_SH_KERNELS(CELU) + ADD_UNARY_SH_KERNELS(RCP) + ADD_UNARY_SH_KERNELS(SIGN) + ADD_UNARY_SH_KERNELS(SOFTSIGN) + ADD_UNARY_SH_KERNELS(ATAN) + ADD_UNARY_SH_KERNELS(ATANH) + ADD_UNARY_SH_KERNELS(ACOSH) + ADD_UNARY_SH_KERNELS(INVERSE_SIGMOID) - ADD_UNARY_SH_KERNELS(SIN, U8, U8) - ADD_UNARY_SH_KERNELS(COS, U8, U8) - ADD_UNARY_SH_KERNELS(EXP, U8, U8) - ADD_UNARY_SH_KERNELS(LOG, U8, U8) - ADD_UNARY_SH_KERNELS(NEG, U8, U8) - ADD_UNARY_SH_KERNELS(HSIGMOID, U8, U8) - ADD_UNARY_SH_KERNELS(MISH, U8, U8) - ADD_UNARY_SH_KERNELS(ROUND, U8, U8) - ADD_UNARY_SH_KERNELS(GELU, U8, U8) - ADD_UNARY_SH_KERNELS(HGELU, U8, U8) - ADD_UNARY_SH_KERNELS(SELU, U8, U8) - ADD_UNARY_SH_KERNELS(CELU, U8, U8) - ADD_UNARY_SH_KERNELS(RCP, U8, U8) - ADD_UNARY_SH_KERNELS(SIGN, U8, U8) - ADD_UNARY_SH_KERNELS(SOFTSIGN, U8, U8) - - ADD_UNARY_SH_KERNELS(NEG, I32, I32) + TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I32, I32) + TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I32, I32) }; #undef SIN_OPERATION @@ -161,6 +162,10 @@ static const struct { #undef RCP_OPERATION #undef SIGN_OPERATION #undef SOFTSIGN_OPERATION +#undef ATAN_OPERATION +#undef ATANH_OPERATION +#undef ACOSH_OPERATION +#undef INVERSE_SIGMOID_OPERATION /* * Kernel params */ @@ -262,6 +267,10 @@ static vsi_status _query_kernel case _PACK_SELECT_KEY(F16, F16): key = HASH_UNARY_KEY( type, F32, F32, image_2d ); break; + case _PACK_SELECT_KEY(U8, F32): + case _PACK_SELECT_KEY(U8, F16): + key = HASH_UNARY_KEY( type, U8, F32, image_2d ); + break; default: key = HASH_UNARY_KEY( type, input_dtype, output_dtype, image_2d ); break; @@ -330,7 +339,7 @@ static vsi_nn_kernel_node_t _setup ret = vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank ); - if( ret ) + if ( ret ) { rs_tensors[0] = vsi_nn_reshape_tensor( graph, inputs[0], shape, new_rank ); @@ -338,7 +347,7 @@ static vsi_nn_kernel_node_t _setup outputs[0], shape, new_rank ); } - if( !vsi_nn_kernel_gpu_check_shape( rs_tensors[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( rs_tensors[0]->attr.size, rs_tensors[0]->attr.dim_num ) ) { return NULL; @@ -348,11 +357,11 @@ static vsi_nn_kernel_node_t _setup image_2d = (rs_tensors[0]->attr.dim_num == 2 || rs_tensors[0]->attr.size[2] == 1); status = _query_kernel( rs_tensors, &rs_tensors[1], unary_type, image_2d, kernel ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, rs_tensors, 1, &rs_tensors[1], 1 ); @@ -452,5 +461,9 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( celu, UNARY_CELU ) REGISTER_ELTWISE_UNARY_BACKEND_CL( rcp, UNARY_RCP ) REGISTER_ELTWISE_UNARY_BACKEND_CL( sign, UNARY_SIGN ) REGISTER_ELTWISE_UNARY_BACKEND_CL( softsign, UNARY_SOFTSIGN ) 
+REGISTER_ELTWISE_UNARY_BACKEND_CL( atan, UNARY_ATAN ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( atanh, UNARY_ATANH ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( acosh, UNARY_ACOSH ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( inverse_sigmoid, UNARY_INVERSE_SIGMOID ) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/cl/gather_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_cl.c index 66eb842..bafe86c 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_cl.c @@ -49,6 +49,7 @@ typedef enum #define _GATHER_KERNEL_SOURCE "gather" #define _GATHER_BATCH_KERNEL_SOURCE "gather_batch" +#define _GATHER_ARRAY_KERNEL_SOURCE "gather_array" // Add kernel hashtable here #define VX_KERNEL_NAME_GATHER_U8TOU8 CVIVANTE_NAMESPACE("cl.gather_U8toU8") @@ -61,9 +62,14 @@ typedef enum #define VX_KERNEL_NAME_GATHER_BATCH_I32TOI32 CVIVANTE_NAMESPACE("cl.gather_batch_I32toI32") #define VX_KERNEL_NAME_GATHER_BATCH_F32TOF32 CVIVANTE_NAMESPACE("cl.gather_batch_F32toF32") +#define VX_KERNEL_NAME_GATHER_ARRAY_U8TOU8 CVIVANTE_NAMESPACE("cl.gather_array_U8toU8") +#define VX_KERNEL_NAME_GATHER_ARRAY_F16TOF16 CVIVANTE_NAMESPACE("cl.gather_array_F16toF16") +#define VX_KERNEL_NAME_GATHER_ARRAY_I32TOI32 CVIVANTE_NAMESPACE("cl.gather_array_I32toI32") +#define VX_KERNEL_NAME_GATHER_ARRAY_F32TOF32 CVIVANTE_NAMESPACE("cl.gather_array_F32toF32") + // Add kernel hashtable here -#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _image_2d, _batch) \ - ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d << 4) | (_batch)) +#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _is_array, _batch) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_is_array << 4) | (_batch)) #define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0), \ @@ -75,6 +81,11 @@ typedef enum VX_KERNEL_NAME_GATHER_BATCH_##IN0_TYPE##TO##OUT_TYPE, \ SOURCE }, +#define TENSOR_GATHER_ARRAY_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 0), \ + VX_KERNEL_NAME_GATHER_ARRAY_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -89,6 +100,10 @@ static const struct { TENSOR_GATHER_BATCH_KERNELS(F16, I32, F16, _GATHER_BATCH_KERNEL_SOURCE) TENSOR_GATHER_BATCH_KERNELS(I32, I32, I32, _GATHER_BATCH_KERNEL_SOURCE) TENSOR_GATHER_BATCH_KERNELS(F32, I32, F32, _GATHER_BATCH_KERNEL_SOURCE) + TENSOR_GATHER_ARRAY_KERNELS(U8, I32, U8, _GATHER_ARRAY_KERNEL_SOURCE) + TENSOR_GATHER_ARRAY_KERNELS(F16, I32, F16, _GATHER_ARRAY_KERNEL_SOURCE) + TENSOR_GATHER_ARRAY_KERNELS(I32, I32, I32, _GATHER_ARRAY_KERNEL_SOURCE) + TENSOR_GATHER_ARRAY_KERNELS(F32, I32, F32, _GATHER_ARRAY_KERNEL_SOURCE) }; /* @@ -114,7 +129,8 @@ static vsi_status cal_gather_tensor_reshape_size vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], uint32_t block_size, vsi_size_t batch_dims, - uint32_t idxFlg + uint32_t idxFlg, + int32_t* arrayFlg ) { vsi_status status = VSI_FAILURE; @@ -148,18 +164,19 @@ static vsi_status cal_gather_tensor_reshape_size } else { - if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + sizes[2] = outerCnt; + if ((elementCnt / block_size) >= VSI_NN_MAX_IMAGE_WIDTH) { - sizes[0] = block_size; - sizes[1] = elementCnt / block_size; - sizes[2] = outerCnt; - status = VSI_SUCCESS; + arrayFlg[0] |= 1; } + status = VSI_SUCCESS; } #undef VSI_NN_MAX_IMAGE_WIDTH return status; -} /* 
_get_EltOP_tensor_reshape_size */ +} /* cal_gather_tensor_reshape_size */ /* * Kernel initializer @@ -209,8 +226,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; - gpu_param.global_size[0] = gpu_align_p2((block_size + gpu_param.global_scale[0] - 1) - / gpu_param.global_scale[0], 4); + gpu_param.global_size[0] = block_size; gpu_param.global_size[1] = indices_num; gpu_param.global_size[2] = block_num; @@ -239,7 +255,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, - int32_t is_batch + int32_t is_batch, + int32_t is_array /* Add extra params */ ) { @@ -262,7 +279,7 @@ static vsi_status _query_kernel output_dtype = I32; } - key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, 0, is_batch ); + key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, is_array, is_batch ); for ( i = 0; i < _cnt_of_array(gather_map); i ++ ) { @@ -314,11 +331,12 @@ static vsi_nn_kernel_node_t _setup int32_t indices_num = vsi_nn_kernel_param_get_int32( params, "indices_num" ); int32_t is_batch = batch_dims > 0 ? 1 : 0; vsi_size_t rs_dim = batch_dims == 0 ? 2 : 3; + int32_t is_array = block_size >= GPU_TENSOR_MAX_WIDTH ? 1 : 0; int32_t i = 0; - status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, batch_dims, 0); - status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1); - status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, batch_dims, 0); + status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, batch_dims, 0, &is_array); + status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1, &is_array); + status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, batch_dims, 0, &is_array); if (status != VSI_SUCCESS) { return NULL; @@ -337,7 +355,7 @@ static vsi_nn_kernel_node_t _setup return NULL; } - status = _query_kernel( kernel, inputs, outputs, is_batch ); + status = _query_kernel( kernel, inputs, outputs, is_batch, is_array ); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); diff --git a/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c index 74dd993..a41e7ac 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c @@ -43,6 +43,7 @@ __BEGIN_DECLS */ #define KERNEL_SOURCE_1 "gather_nd" #define KERNEL_SOURCE_2 "gather_nd_3d" +#define KERNEL_SOURCE_3 "gather_nd_batch" typedef enum { @@ -52,17 +53,25 @@ __BEGIN_DECLS _3D } vsi_nn_kernel_coord_type_e; -#define HASH_GATHER_ND_KEY(_input0_type, _input1_type, _output_type, _coord_dim) \ - ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_coord_dim)) +#define HASH_GATHER_ND_KEY(_input0_type, _input1_type, _output_type, _coord_dim, _batch_dims) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_coord_dim << 4) | (_batch_dims)) #define HASH_GATHER_ND_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \ CVIVANTE_NAMESPACE("cl.gather_nd_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE) #define TENSOR_GATHER_ND_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \ - { HASH_GATHER_ND_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE), \ + { HASH_GATHER_ND_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, 0), \ HASH_GATHER_ND_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \ SOURCE }, +#define 
HASH_GATHER_ND_BATCH_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \ + CVIVANTE_NAMESPACE("cl.gather_nd_batch_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE) + +#define TENSOR_GATHER_ND_BATCH_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \ + { HASH_GATHER_ND_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, 1), \ + HASH_GATHER_ND_BATCH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -81,6 +90,12 @@ static const struct { TENSOR_GATHER_ND_KERNELS(F16, I32, F16, _3D, KERNEL_SOURCE_2) TENSOR_GATHER_ND_KERNELS(I32, I32, I32, _3D, KERNEL_SOURCE_2) TENSOR_GATHER_ND_KERNELS(F32, I32, F32, _3D, KERNEL_SOURCE_2) + TENSOR_GATHER_ND_BATCH_KERNELS(U8, I32, U8, _1D, KERNEL_SOURCE_3) + TENSOR_GATHER_ND_BATCH_KERNELS(I8, I32, I8, _1D, KERNEL_SOURCE_3) + TENSOR_GATHER_ND_BATCH_KERNELS(F16, I32, F16, _1D, KERNEL_SOURCE_3) + TENSOR_GATHER_ND_BATCH_KERNELS(U8, I32, U8, _2D, KERNEL_SOURCE_3) + TENSOR_GATHER_ND_BATCH_KERNELS(I8, I32, I8, _2D, KERNEL_SOURCE_3) + TENSOR_GATHER_ND_BATCH_KERNELS(F16, I32, F16, _2D, KERNEL_SOURCE_3) }; /* @@ -103,7 +118,8 @@ static vsi_status cal_gather_nd_tensor_reshape_size vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], uint32_t block_size, uint32_t coordDim, - int32_t* newDim + int32_t* newDim, + int32_t batch_dims ) { vsi_status status = VSI_FAILURE; @@ -114,45 +130,63 @@ static vsi_status cal_gather_nd_tensor_reshape_size #define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; - for(i = 0; i < dims_num; ++i) + for (i = 0; i < dims_num; ++i) { elementCnt *= input_size[i]; } - for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) { sizes[i] = 1; } - if(coordDim) // input reshape + if (coordDim) // input reshape { - uint32_t offset = dims_num - coordDim + 1; - for(i = coordDim-1; i > 0; i--) - { - sizes[i] = input_size[i + offset - 1]; - } - for(i = 0; i < offset; i++) - { - sizes[0] *= input_size[i]; - } + uint32_t offset = dims_num - coordDim + 1 - batch_dims; - newDim[0] = coordDim; - if(coordDim == 1) + if (batch_dims) { - newDim[0] = 2; - sizes[0] = block_size; - sizes[1] = elementCnt / block_size; + for (i = 0; i < offset; i++) + { + sizes[0] *= input_size[i]; + } + + for (i = 0; i < coordDim; i++) + { + sizes[i + 1] = input_size[i + offset]; + } + + newDim[0] = coordDim == 1 ? 
2 : 3; } - else if(coordDim == 4) + else { - newDim[0] = 3; + for (i = coordDim-1; i > 0; i--) + { + sizes[i] = input_size[i + offset - 1]; + } + for (i = 0; i < offset; i++) + { + sizes[0] *= input_size[i]; + } + + newDim[0] = coordDim; + if (coordDim == 1) + { + newDim[0] = 2; + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + } + else if (coordDim == 4) + { + newDim[0] = 3; + } } status = VSI_SUCCESS; } else // indices&output reshape { - if((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) { sizes[0] = block_size; sizes[1] = elementCnt / block_size; @@ -222,7 +256,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, - int32_t coord_dim + int32_t coord_dim, + int32_t batch_dims ) { vsi_status status = VSI_FAILURE; @@ -234,30 +269,49 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if(coord_dim == 1) + + if (input0_dtype == F32) + { + input0_dtype = F16; + } + else if (input0_dtype == I32 || input0_dtype == I16) + { + input0_dtype = I8; + } + + if (output_dtype == F32) + { + output_dtype = F16; + } + else if (output_dtype == I32 || output_dtype == I16) + { + output_dtype = I8; + } + + if (coord_dim == 1) { coord_type = _1D; } - else if(coord_dim == 2) + else if (coord_dim == 2) { coord_type = _2D; } - else if(coord_dim == 3 || coord_dim == 4) + else if (coord_dim == 3 || coord_dim == 4) { coord_type = _3D; } - key = HASH_GATHER_ND_KEY( input0_dtype, I32, output_dtype, coord_type ); + key = HASH_GATHER_ND_KEY( input0_dtype, I32, output_dtype, coord_type, batch_dims ); - for( i = 0; i < _cnt_of_array(gather_nd_map); i ++ ) + for ( i = 0; i < _cnt_of_array(gather_nd_map); i ++ ) { - if( gather_nd_map[i].key == key ) + if ( gather_nd_map[i].key == key ) { break; } } - if( i < _cnt_of_array(gather_nd_map) ) + if ( i < _cnt_of_array(gather_nd_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", gather_nd_map[i].function_name ); kernel->info.parameters = _gather_nd_kernel_param_def; @@ -289,29 +343,30 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_GATHER_ND_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" ); int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; - status = cal_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim); - status |= cal_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim); - status |= cal_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim); - if(status != VSI_SUCCESS) + status = cal_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim, batch_dims); + status |= cal_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim, batch_dims); + status |= cal_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim, batch_dims); + if (status != VSI_SUCCESS) { return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, 
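/* A worked example (illustrative sizes only) of the batch_dims branch
 * above: with dims_num = 4, coordDim = 2 and batch_dims = 1 we get
 * offset = 4 - 2 + 1 - 1 = 2, so input_size = {8, 10, 6, 4} is reshaped to
 *
 *     sizes[0] = 8 * 10;   // leading dims collapsed into one block
 *     sizes[1] = 6;        // first coordinate-indexed dim
 *     sizes[2] = 4;        // second coordinate-indexed dim
 *
 * and newDim = 3 (it would be 2 for coordDim == 1).
 */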
outputs[0]->attr.dim_num ) ) { return NULL; } - status = _query_kernel( kernel, inputs, outputs, coord_dim ); - if( VSI_SUCCESS == status) + status = _query_kernel( kernel, inputs, outputs, coord_dim, batch_dims ); + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 0; /* Pass parameters to node. */ diff --git a/src/tim/vx/internal/src/kernel/cl/globallppool_cl.c b/src/tim/vx/internal/src/kernel/cl/globallppool_cl.c new file mode 100644 index 0000000..1e51bd7 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/globallppool_cl.c @@ -0,0 +1,292 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +#define _GLOBALLPPOOL_KERNEL_SOURCE_NAME "globallppool" + +// Add kernel hashtable here +#define GLOBALLPPOOL_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) +#define GLOBALLPPOOL_KERNELS( IN_DTYPE, OUT_DTYPE ) \ + { GLOBALLPPOOL_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("cl.globallppool_"#IN_DTYPE"to"#OUT_DTYPE), \ + _GLOBALLPPOOL_KERNEL_SOURCE_NAME }, \ + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _globallppool_kernel_map[] = +{ + // Register kernel here + GLOBALLPPOOL_KERNELS( F32, F32 ) + GLOBALLPPOOL_KERNELS( F32, U32 ) + GLOBALLPPOOL_KERNELS( F32, I32 ) + GLOBALLPPOOL_KERNELS( U32, U32 ) + GLOBALLPPOOL_KERNELS( U32, F32 ) + GLOBALLPPOOL_KERNELS( I32, I32 ) + GLOBALLPPOOL_KERNELS( I32, F32 ) + GLOBALLPPOOL_KERNELS( BF16, BF16 ) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _globallppool_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _GLOBALLPPOOL_PARAM_NUM _cnt_of_array( _globallppool_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_globallppool_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 1, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vx_tensor output = (vx_tensor)param[1]; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_size_array_t *output_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + output_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_size[0] = (output_shape->data[2] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0]; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + + return status; +} /* _globallppool_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _globallppool_kernel_map; + size_t kernel_map_size = _cnt_of_array( _globallppool_kernel_map ); + vx_param_description_t * param_def = _globallppool_kernel_param_def; + vx_kernel_initialize_f 
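/* A minimal sketch of the reduction these kernels implement, assuming the
 * ONNX GlobalLpPool definition (in(x, y, c) is illustrative pseudo-array
 * notation; p, width and height are the scalars passed in _setup below):
 *
 *     float acc = 0.0f;
 *     for (y = 0; y < height; y++)
 *         for (x = 0; x < width; x++)
 *             acc += powf(fabsf(in(x, y, c)), (float)p);
 *     out(c) = powf(acc, 1.0f / (float)p);
 *
 * which is why the initializer above launches one work-item per channel:
 * global_size[0] is taken from output_shape->data[2].
 */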
initializer = _globallppool_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + +#define _PACK_SELECT_KEY( in_dtype, out_dtype ) \ + (( in_dtype ) | (out_dtype << 8 )) + switch (_PACK_SELECT_KEY(in_dtype, out_dtype)) + { + case _PACK_SELECT_KEY(F32, F32): + case _PACK_SELECT_KEY(F16, F16): + case _PACK_SELECT_KEY(F32, F16): + case _PACK_SELECT_KEY(F16, F32): + key = GLOBALLPPOOL_HASH_KEY( F32, F32); + break; + case _PACK_SELECT_KEY(F32, U8): + case _PACK_SELECT_KEY(F16, U8): + key = GLOBALLPPOOL_HASH_KEY( F32, U32); + break; + case _PACK_SELECT_KEY(F32, I8): + case _PACK_SELECT_KEY(F32, I16): + case _PACK_SELECT_KEY(F16, I8): + case _PACK_SELECT_KEY(F16, I16): + key = GLOBALLPPOOL_HASH_KEY( F32, I32); + break; + case _PACK_SELECT_KEY(U8, U8): + key = GLOBALLPPOOL_HASH_KEY( U32, U32); + break; + case _PACK_SELECT_KEY(U8, F16): + case _PACK_SELECT_KEY(U8, F32): + key = GLOBALLPPOOL_HASH_KEY( U32, F32); + break; + case _PACK_SELECT_KEY(I8, I8): + case _PACK_SELECT_KEY(I8, I16): + case _PACK_SELECT_KEY(I16, I8): + case _PACK_SELECT_KEY(I16, I16): + key = GLOBALLPPOOL_HASH_KEY( I32, I32); + break; + case _PACK_SELECT_KEY(I8, F16): + case _PACK_SELECT_KEY(I8, F32): + case _PACK_SELECT_KEY(I16, F16): + case _PACK_SELECT_KEY(I16, F32): + key = GLOBALLPPOOL_HASH_KEY( I32, F32); + break; + default: + key = GLOBALLPPOOL_HASH_KEY( in_dtype, out_dtype); + break; + } +#undef _PACK_SELECT_KEY + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _globallppool_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GLOBALLPPOOL_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t p = vsi_nn_kernel_param_get_int32(params, "p"); + int32_t width = (int32_t)inputs[0]->attr.size[0]; + int32_t height = (int32_t)inputs[0]->attr.size[1]; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + + if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num )) + { + return NULL; + } + + outputScale = 1.0f / outputScale; + inputTail = -(inputTail * inputScale); + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and 
outputs */ + uint32_t index = 2; + vsi_nn_kernel_node_pack_io( node_params, _GLOBALLPPOOL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &p ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputTail ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputTail ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _GLOBALLPPOOL_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( globallppool, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/l1norm_cl.c b/src/tim/vx/internal/src/kernel/cl/l1norm_cl.c new file mode 100644 index 0000000..2626bfe --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/l1norm_cl.c @@ -0,0 +1,365 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta.
+ */ + +#define _L1NORM_KERNEL_SOURCE_NAME "l1norm" + +// Add kernel hashtable here +#define L1NORM_HASH_KEY( IN_DTYPE, OUT_DTYPE, _image_2d, AXIS) \ + (( IN_DTYPE << 24 ) | ( OUT_DTYPE << 16) | (_image_2d << 8) | (AXIS)) +#define L1NORM_KERNELS( IN_DTYPE, OUT_DTYPE, AXIS ) \ + { L1NORM_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0 , AXIS), \ + CVIVANTE_NAMESPACE("cl.l1norm_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ + _L1NORM_KERNEL_SOURCE_NAME } + +#define L1NORM_KERNELS_2D( IN_DTYPE, OUT_DTYPE, AXIS ) \ + { L1NORM_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, AXIS), \ + CVIVANTE_NAMESPACE("cl.l1norm_"#IN_DTYPE"to"#OUT_DTYPE"_2D_axis"#AXIS), \ + _L1NORM_KERNEL_SOURCE_NAME } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _l1norm_kernel_map[] = +{ + // Register kernel here + L1NORM_KERNELS( U32, U32, 0 ), + L1NORM_KERNELS( U32, I32, 0 ), + L1NORM_KERNELS( U32, F32, 0 ), + L1NORM_KERNELS( I32, I32, 0 ), + L1NORM_KERNELS( I32, U32, 0 ), + L1NORM_KERNELS( I32, F32, 0 ), + L1NORM_KERNELS( F32, F32, 0 ), + L1NORM_KERNELS( F32, U32, 0 ), + L1NORM_KERNELS( F32, I32, 0 ), + + L1NORM_KERNELS( U32, U32, 1 ), + L1NORM_KERNELS( U32, I32, 1 ), + L1NORM_KERNELS( U32, F32, 1 ), + L1NORM_KERNELS( I32, I32, 1 ), + L1NORM_KERNELS( I32, U32, 1 ), + L1NORM_KERNELS( I32, F32, 1 ), + L1NORM_KERNELS( F32, F32, 1 ), + L1NORM_KERNELS( F32, U32, 1 ), + L1NORM_KERNELS( F32, I32, 1 ), + + L1NORM_KERNELS( U32, U32, 2 ), + L1NORM_KERNELS( U32, I32, 2 ), + L1NORM_KERNELS( U32, F32, 2 ), + L1NORM_KERNELS( I32, I32, 2 ), + L1NORM_KERNELS( I32, U32, 2 ), + L1NORM_KERNELS( I32, F32, 2 ), + L1NORM_KERNELS( F32, F32, 2 ), + L1NORM_KERNELS( F32, U32, 2 ), + L1NORM_KERNELS( F32, I32, 2 ), + + L1NORM_KERNELS_2D( U32, U32, 0 ), + L1NORM_KERNELS_2D( U32, I32, 0 ), + L1NORM_KERNELS_2D( U32, F32, 0 ), + L1NORM_KERNELS_2D( I32, I32, 0 ), + L1NORM_KERNELS_2D( I32, U32, 0 ), + L1NORM_KERNELS_2D( I32, F32, 0 ), + L1NORM_KERNELS_2D( F32, F32, 0 ), + L1NORM_KERNELS_2D( F32, U32, 0 ), + L1NORM_KERNELS_2D( F32, I32, 0 ), + + L1NORM_KERNELS_2D( U32, U32, 1 ), + L1NORM_KERNELS_2D( U32, I32, 1 ), + L1NORM_KERNELS_2D( U32, F32, 1 ), + L1NORM_KERNELS_2D( I32, I32, 1 ), + L1NORM_KERNELS_2D( I32, U32, 1 ), + L1NORM_KERNELS_2D( I32, F32, 1 ), + L1NORM_KERNELS_2D( F32, F32, 1 ), + L1NORM_KERNELS_2D( F32, U32, 1 ), + L1NORM_KERNELS_2D( F32, I32, 1 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _l1norm_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} + + // Add kernel parameters here +}; +#define _L1NORM_PARAM_NUM _cnt_of_array( _l1norm_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_l1norm_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vx_tensor output = (vx_tensor)param[1]; + vx_int32 axis = 0; + vx_int32 dim = 0; + vx_int32 width = 0; + vx_int32 height = 0; + vx_int32 depth = 0; + + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_size_array_t 
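/* The axis branches below pin a 16-lane work-group along whichever axis is
 * being reduced; a sketch of the resulting geometry for axis == 0:
 *
 *     local_size  = {16, 1, 1};           // 16 lanes share one row
 *     global_size = {16, height, depth};  // one work-group per output row
 *
 * so each work-group accumulates sum(|x|) over the axis cooperatively and
 * then scales that row, assuming the usual L1 normalization.
 */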
*output_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_shape = output_attr->shape; + + dim = output_shape->size < 3 ? 2 : 3; + width = (vx_int32)output_shape->data[0]; + height = (vx_int32)output_shape->data[1]; + depth = dim < 3 ? 1 : (vx_int32)output_shape->data[2]; + + gpu_param.dim = dim; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + if (axis == 0) + { + gpu_param.local_size[0] = 16; + gpu_param.local_size[1] = 1; + gpu_param.local_size[2] = 1; + gpu_param.global_size[0] = 16; + gpu_param.global_size[1] = height; + gpu_param.global_size[2] = depth; + } + else if (axis == 1) + { + gpu_param.local_size[0] = 1; + gpu_param.local_size[1] = 16; + gpu_param.local_size[2] = 1; + gpu_param.global_size[0] = width; + gpu_param.global_size[1] = 16; + gpu_param.global_size[2] = depth; + } + else + { + gpu_param.local_size[0] = 1; + gpu_param.local_size[1] = 1; + gpu_param.local_size[2] = 16; + + gpu_param.global_size[0] = width; + gpu_param.global_size[1] = height; + gpu_param.global_size[2] = 16; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + + return status; +} /* _l1norm_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d, + int32_t axis + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _l1norm_kernel_map; + size_t kernel_map_size = _cnt_of_array( _l1norm_kernel_map ); + vx_param_description_t * param_def = _l1norm_kernel_param_def; + vx_kernel_initialize_f initializer = _l1norm_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + else if (U8 == in_dtype) + { + in_dtype = U32; + } + else if (I16 == in_dtype || I8 == in_dtype) + { + in_dtype = I32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + else if (U8 == out_dtype) + { + out_dtype = U32; + } + else if (I16 == out_dtype || I8 == out_dtype) + { + out_dtype = I32; + } + + key = L1NORM_HASH_KEY( in_dtype, out_dtype, image_2d, axis); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _l1norm_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t 
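/* Note the dtype folding in _query_kernel above: 8- and 16-bit types are
 * remapped to the 32-bit kernel variants before the key is packed, so one
 * CL program covers several storage formats. An illustrative example:
 *
 *     in = I8, out = F16  ->  key = L1NORM_HASH_KEY(I32, F32, image_2d, axis);
 *
 * matrixmul_cl.c and globallppool_cl.c in this change follow the same
 * pattern.
 */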
** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_L1NORM_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis"); + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputZp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + int32_t axis_size = (int32_t)outputs[0]->attr.size[axis]; + outputScale = 1.0f / outputScale; + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (outputs[0]->attr.dim_num == 2); + + status = _query_kernel( kernel, inputs, outputs, image_2d, axis ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + uint32_t index = 2; + vsi_nn_kernel_node_pack_io( node_params, _L1NORM_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputZp ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputTail ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_size ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _L1NORM_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( l1norm, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c b/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c index bf63043..bcf4d7a 100644 --- a/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c @@ -35,6 +35,7 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" __BEGIN_DECLS @@ -212,27 +213,52 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_LOGICAL_NOT_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t new_rank = 0; + vsi_bool ret = FALSE; - if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + ret = vsi_nn_kernel_optimize_element_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + shape, &new_rank ); + + if ( ret ) { - return NULL; + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shape, new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shape, new_rank ); + } + else + { + goto final; } - image_2d = (outputs[0]->attr.dim_num == 2 || outputs[0]->attr.size[2] == 1); - status = _query_kernel( kernel, inputs, outputs, image_2d); - if( VSI_SUCCESS == status) + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[1]->attr.size, + reshape_tensors[1]->attr.dim_num ) ) + { + goto final; + } + + image_2d = 
(reshape_tensors[1]->attr.dim_num == 2 || reshape_tensors[1]->attr.size[2] == 1); + status = _query_kernel( kernel, &reshape_tensors[0], &reshape_tensors[1], image_2d); + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { /* Set inputs and outputs */ vsi_nn_kernel_node_pack_io( node_params, _LOGICAL_NOT_PARAM_NUM, - inputs, input_num, outputs, output_num ); + &reshape_tensors[0], input_num, &reshape_tensors[1], output_num ); /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, _LOGICAL_NOT_PARAM_NUM ); } } + +final: + vsi_safe_release_tensor( reshape_tensors[0] ); + vsi_safe_release_tensor( reshape_tensors[1] ); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c b/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c index d21317c..7121aa9 100644 --- a/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c @@ -35,7 +35,7 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_eltwise.h" __BEGIN_DECLS @@ -228,30 +228,75 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_LOGICAL_OPS_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t new_rank = 0; + vsi_bool ret = FALSE; uint32_t ops_type = vsi_nn_kernel_param_get_int32( params, "ops_type" ); - if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + ret = vsi_nn_kernel_optimize_eltwise_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + if ( ret ) { - return NULL; + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], new_rank ); + +#define _swap_tensor(a, b, tmp) \ + do { \ + tmp = a; \ + a = b; \ + b = tmp; \ + } while(0) + + if (shapes[1][3] > shapes[0][3] && new_rank == 4) + { + vsi_nn_tensor_t* reshape_tmp; + _swap_tensor(reshape_tensors[0], reshape_tensors[1], reshape_tmp); + } + +#undef _swap_tensor + } + else + { + goto final; + } + + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size, + reshape_tensors[2]->attr.dim_num ) ) + { + goto final; } image_2d = (outputs[0]->attr.dim_num == 2); - status = _query_kernel( kernel, inputs, outputs, image_2d, (vsi_nn_logical_ops_type_t)ops_type); + status = _query_kernel( kernel, reshape_tensors, &reshape_tensors[2], + image_2d, (vsi_nn_logical_ops_type_t)ops_type); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { /* Pass parameters to node. 
*/ vsi_nn_kernel_node_pack_io( node_params, _LOGICAL_OPS_PARAM_NUM, - inputs, input_num, outputs, output_num ); + reshape_tensors, input_num, &reshape_tensors[2], output_num ); status = vsi_nn_kernel_node_pass_param( node, node_params, _LOGICAL_OPS_PARAM_NUM ); } } +final: + vsi_safe_release_tensor( reshape_tensors[0] ); + vsi_safe_release_tensor( reshape_tensors[1] ); + vsi_safe_release_tensor( reshape_tensors[2] ); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c index 35eb757..5ff2a93 100644 --- a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c @@ -64,12 +64,12 @@ __BEGIN_DECLS #define TENSOR_MATRIXMUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0), \ - HASH_MATRIXMUL_SH_KERNEL_NAME(F32, F32, F32, IMAGE_DIM), \ + HASH_MATRIXMUL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ SOURCE }, #define TENSOR_MATRIXMUL_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 1), \ - HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(F32, F32, F32, IMAGE_DIM), \ + HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ SOURCE }, #define TENSOR_MATRIXMUL_TRANSB_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ @@ -83,18 +83,32 @@ static const struct { const char* source_name; } matrixmul_map[] = { - TENSOR_MATRIXMUL_KERNELS(F16, F16, F16, _2D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_KERNELS(F16, F16, F16, _3D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_TRANSA_KERNELS(F16, F16, F16, _2D, KERNEL_SOURCE_2) - TENSOR_MATRIXMUL_TRANSA_KERNELS(F16, F16, F16, _3D, KERNEL_SOURCE_2) TENSOR_MATRIXMUL_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_1) TENSOR_MATRIXMUL_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1) TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_2) TENSOR_MATRIXMUL_TRANSA_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_2) TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_1) TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _2D, KERNEL_SOURCE_1) - TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSB_KERNELS(F32, I8, F32, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_KERNELS(I8, I8, I8, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSA_KERNELS(I8, I8, I8, _2D, KERNEL_SOURCE_2) + TENSOR_MATRIXMUL_TRANSA_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_2) + TENSOR_MATRIXMUL_TRANSB_KERNELS(I8, I8, I8, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSB_KERNELS(I8, I8, I8, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_KERNELS(U8, U8, U8, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, U8, _2D, KERNEL_SOURCE_2) + TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_2) + TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, U8, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, U8, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_2) + TENSOR_MATRIXMUL_TRANSA_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_2) + 
TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, F32, _2D, KERNEL_SOURCE_1) + TENSOR_MATRIXMUL_TRANSB_KERNELS(U8, U8, F32, _3D, KERNEL_SOURCE_1) }; /* @@ -198,10 +212,44 @@ static vsi_status _query_kernel dim_type = _3D; } + if (input0_dtype == I16 || input0_dtype == I32) + { + input0_dtype = I8; + } + else if (input0_dtype == F16) + { + input0_dtype = F32; + } + else if (input0_dtype == U32) + { + input0_dtype = U8; + } + if (input1_dtype == I16 || input1_dtype == I32) { input1_dtype = I8; } + else if (input1_dtype == F16) + { + input1_dtype = F32; + } + else if (input1_dtype == U32) + { + input1_dtype = U8; + } + + if (output_dtype == I16 || output_dtype == I32) + { + output_dtype = I8; + } + else if (output_dtype == F16) + { + output_dtype = F32; + } + else if (output_dtype == U32) + { + output_dtype = U8; + } key = HASH_MATRIXMUL_KEY( input0_dtype, input1_dtype, output_dtype, dim_type, transa ); @@ -260,6 +308,8 @@ static vsi_nn_kernel_node_t _setup float scale_out = vsi_nn_get_tensor_scale(outputs[0]); float zp_out = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + scale_out = 1.0f / scale_out; + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/cl/maxunpool_cl.c b/src/tim/vx/internal/src/kernel/cl/maxunpool_cl.c new file mode 100644 index 0000000..408164b --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/maxunpool_cl.c @@ -0,0 +1,330 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + + +__BEGIN_DECLS + +/* + * Define kernel meta.
+ */ + +#define _MAXUNPOOL_KERNEL_SOURCE_NAME "maxunpool" + +// Add kernel hashtable here +#define MAXUNPOOL_HASH_KEY( IN_DTYPE0, IN_DTYPE1, OUT_DTYPE ) \ + (( IN_DTYPE0 << 16 ) | ( IN_DTYPE1 << 8 ) | ( OUT_DTYPE )) +#define MAXUNPOOL_KERNELS( IN_DTYPE0, IN_DTYPE1, OUT_DTYPE ) \ + { MAXUNPOOL_HASH_KEY( IN_DTYPE0, I32, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("cl.maxunpool_"#IN_DTYPE0"to"#OUT_DTYPE), \ + _MAXUNPOOL_KERNEL_SOURCE_NAME }, + + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _maxunpool_kernel_map[] = +{ + // Register kernel here + MAXUNPOOL_KERNELS( F32, I32, F32) + MAXUNPOOL_KERNELS( F32, I32, U32) + MAXUNPOOL_KERNELS( F32, I32, I32) + MAXUNPOOL_KERNELS( U32, I32, U32) + MAXUNPOOL_KERNELS( U32, I32, F32) + MAXUNPOOL_KERNELS( I32, I32, I32) + MAXUNPOOL_KERNELS( I32, I32, F32) + MAXUNPOOL_KERNELS( BF16, I32, BF16) +}; + + +/* + * Kernel params + */ + +static vx_param_description_t _maxunpool_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _MAXUNPOOL_PARAM_NUM _cnt_of_array( _maxunpool_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_maxunpool_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vx_tensor output = (vx_tensor)param[2]; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_size_array_t *output_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = (output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0]; + gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = (output_shape->data[2] + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2]; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + + return status; +} /* _maxunpool_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type 
* kernel_map = _maxunpool_kernel_map; + vx_kernel_initialize_f initializer = _maxunpool_initializer; + vx_param_description_t * param_def = _maxunpool_kernel_param_def; + size_t kernel_map_size = _cnt_of_array( _maxunpool_kernel_map ); + size_t param_size = _cnt_of_array( _maxunpool_kernel_param_def ); + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + +#define _PACK_SELECT_KEY( in_dtype, out_dtype ) \ + (( in_dtype ) | (out_dtype << 8 )) + switch (_PACK_SELECT_KEY(in_dtype, out_dtype)) + { + case _PACK_SELECT_KEY(F32, F32): + case _PACK_SELECT_KEY(F16, F16): + case _PACK_SELECT_KEY(F32, F16): + case _PACK_SELECT_KEY(F16, F32): + key = MAXUNPOOL_HASH_KEY( F32, I32, F32); + break; + case _PACK_SELECT_KEY(F32, U8): + case _PACK_SELECT_KEY(F16, U8): + key = MAXUNPOOL_HASH_KEY( F32, I32, U32); + break; + case _PACK_SELECT_KEY(F32, I8): + case _PACK_SELECT_KEY(F32, I16): + case _PACK_SELECT_KEY(F16, I8): + case _PACK_SELECT_KEY(F16, I16): + key = MAXUNPOOL_HASH_KEY( F32, I32, I32); + break; + case _PACK_SELECT_KEY(U8, U8): + key = MAXUNPOOL_HASH_KEY( U32, I32, U32); + break; + case _PACK_SELECT_KEY(U8, F16): + case _PACK_SELECT_KEY(U8, F32): + key = MAXUNPOOL_HASH_KEY( U32, I32, F32); + break; + case _PACK_SELECT_KEY(I8, I8): + case _PACK_SELECT_KEY(I8, I16): + case _PACK_SELECT_KEY(I16, I8): + case _PACK_SELECT_KEY(I16, I16): + key = MAXUNPOOL_HASH_KEY( I32, I32, I32); + break; + case _PACK_SELECT_KEY(I8, F16): + case _PACK_SELECT_KEY(I8, F32): + case _PACK_SELECT_KEY(I16, F16): + case _PACK_SELECT_KEY(I16, F32): + key = MAXUNPOOL_HASH_KEY( I32, I32, F32); + break; + default: + key = MAXUNPOOL_HASH_KEY( in_dtype, I32, out_dtype); + break; + } +#undef _PACK_SELECT_KEY + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_MAXUNPOOL_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t pad_left = vsi_nn_kernel_param_get_int32(params, "pad_left"); + int32_t pad_right = vsi_nn_kernel_param_get_int32(params, "pad_right"); + int32_t pad_top = vsi_nn_kernel_param_get_int32(params, "pad_top"); + int32_t pad_bottom = vsi_nn_kernel_param_get_int32(params, "pad_bottom"); + int32_t width_in = (int32_t)inputs[0]->attr.size[0]; + int32_t height_in = (int32_t)inputs[0]->attr.size[1]; + int32_t width = (int32_t)outputs[0]->attr.size[0]; + int32_t height = (int32_t)outputs[0]->attr.size[1]; + int32_t batch = (int32_t)outputs[0]->attr.size[2]; + int32_t width_nopad = width - pad_left - pad_right; + 
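+    /* width_nopad/height_nopad: the output extent with the padding stripped,
+       presumably the region the unpooled values are scattered into, while
+       pad_left/pad_top shift those writes inside the padded output. */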
int32_t height_nopad = height - pad_top - pad_bottom; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + + if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num )) + { + return NULL; + } + + outputScale = 1.0f / outputScale; + inputTail = -(inputTail * inputScale); + + status = _query_kernel( kernel, inputs, outputs ); + + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + uint32_t index = 3; + vsi_nn_kernel_node_pack_io( node_params, _MAXUNPOOL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width_nopad ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height_nopad ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width_in ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height_in ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &batch ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_left ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_top ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputTail ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputTail ); + /* Pass parameters to node. 
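+               The quant params are pre-folded above so the shader needs only
+               a multiply-add per element:
+                 dequant: x_f = x_q * inputScale + inputTail,
+                          inputTail   = -(zp_in * scale_in)
+                 requant: y_q = y_f * outputScale + outputTail,
+                          outputScale = 1 / scale_out, outputTail = zp_out
+               e.g. scale_in = 0.5, zp_in = 10 gives inputTail = -5, so
+               x_q = 14 dequantizes to 2.0; scale_out = 0.25, zp_out = 3
+               requantizes 2.0 back to 11.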
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _MAXUNPOOL_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + vsi_nn_kernel_scalar_release( &node_params[13] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( maxunpool, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/pow_cl.c b/src/tim/vx/internal/src/kernel/cl/pow_cl.c index 56c0097..1d1020d 100644 --- a/src/tim/vx/internal/src/kernel/cl/pow_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/pow_cl.c @@ -81,9 +81,11 @@ static const struct { { TENSOR_POW_KERNELS_FLOAT(F32, F32, F32, KERNEL_SOURCE_1) TENSOR_POW_KERNELS_FLOAT(F16, F16, F16, KERNEL_SOURCE_1) + TENSOR_POW_KERNELS(U32, F32, U32, KERNEL_SOURCE_1) TENSOR_POW_KERNELS_2D_FLOAT(F32, F32, F32, KERNEL_SOURCE_1) TENSOR_POW_KERNELS_2D_FLOAT(F16, F16, F16, KERNEL_SOURCE_1) + TENSOR_POW_KERNELS_2D(U32, F32, U32, KERNEL_SOURCE_1) }; /* @@ -94,6 +96,10 @@ static vx_param_description_t kernel_param_def[] = {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) @@ -179,7 +185,25 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_POW_KEY( input0_dtype, input1_dtype, output_dtype, image_2d ); + +#define _PACK_SELECT_KEY( input0_dtype, input1_dtype, output_dtype) \ + ((input0_dtype) | (input1_dtype << 8) | (output_dtype << 16)) + switch(_PACK_SELECT_KEY(input0_dtype, input1_dtype, output_dtype)) + { + case _PACK_SELECT_KEY(F16, F16, F16): + case _PACK_SELECT_KEY(F32, F32, F32): + key = HASH_POW_KEY( F32, F32, F32, image_2d ); + break; + case _PACK_SELECT_KEY(U8, F16, U8): + case _PACK_SELECT_KEY(U8, F32, U8): + case _PACK_SELECT_KEY(U32, F16, U32): + case _PACK_SELECT_KEY(U32, F32, U32): + key = HASH_POW_KEY( U32, F32, U32, image_2d ); + break; + default: + key = HASH_POW_KEY( input0_dtype, input1_dtype, output_dtype, image_2d ); + break; + } for( i = 0; i < _cnt_of_array(pow_map); i ++ ) { @@ -219,6 +243,13 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + + outputScale = 1.0f / outputScale; + inputTail = -(inputTail * inputScale); if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, 
outputs[0]->attr.dim_num ) ) @@ -234,11 +265,20 @@ static vsi_nn_kernel_node_t _setup if( node ) { + uint32_t index = 3; vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, inputs, 2, outputs, 1 ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputTail ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputTail ); /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); VSI_ASSERT( status == VSI_SUCCESS ); } } diff --git a/src/tim/vx/internal/src/kernel/cl/reversesequence_cl.c b/src/tim/vx/internal/src/kernel/cl/reversesequence_cl.c new file mode 100644 index 0000000..cb9cdcd --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/reversesequence_cl.c @@ -0,0 +1,307 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
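+ *
+ * Lookup keys pack three dtypes plus the batch-axis variant one byte apiece:
+ *   key = (IN_DTYPE0 << 24) | (IN_DTYPE1 << 16) | (OUT_DTYPE << 8) | batch_axis
+ * and each entry resolves to "cl.reversesequence_<IN0>to<OUT><axis>", e.g.
+ * REVERSESEQUENCE_KERNELS(U32, I32, F32, _axis2) registers
+ * "cl.reversesequence_U32toF32_axis2". Since reversal only moves elements,
+ * _setup folds both tensors' quant params into one affine pair for the
+ * shader, y_q = x_q * inoutScale + inoutTail, with
+ *   inoutScale = scale_in / scale_out,
+ *   inoutTail  = zp_out - zp_in * inoutScale.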
+ */ + +#define _REVERSESEQUENCE_KERNEL_SOURCE_NAME "reversesequence" + +// Add kernel hashtable here +#define REVERSESEQUENCE_HASH_KEY( IN_DTYPE0, IN_DTYPE1, OUT_DTYPE, batch_axis ) \ + (( IN_DTYPE0 << 24 ) | ( IN_DTYPE1 << 16 ) | ( OUT_DTYPE << 8) | (batch_axis) ) +#define REVERSESEQUENCE_KERNELS( IN_DTYPE0, IN_DTYPE1, OUT_DTYPE, batch_axis ) \ + { REVERSESEQUENCE_HASH_KEY( IN_DTYPE0, IN_DTYPE1, OUT_DTYPE, batch_axis ), \ + CVIVANTE_NAMESPACE("cl.reversesequence_"#IN_DTYPE0"to"#OUT_DTYPE#batch_axis), \ + _REVERSESEQUENCE_KERNEL_SOURCE_NAME }, + +typedef enum +{ + _axis1 = 0, + _axis2 +} vsi_nn_kernel_batch_axis_type_e; + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _reversesequence_kernel_map[] = +{ + // Register kernel here + REVERSESEQUENCE_KERNELS( F32, I32, F32, _axis1) + REVERSESEQUENCE_KERNELS( F32, I32, U32, _axis1) + REVERSESEQUENCE_KERNELS( F32, I32, I32, _axis1) + REVERSESEQUENCE_KERNELS( U32, I32, U32, _axis1) + REVERSESEQUENCE_KERNELS( U32, I32, F32, _axis1) + REVERSESEQUENCE_KERNELS( I32, I32, I32, _axis1) + REVERSESEQUENCE_KERNELS( I32, I32, F32, _axis1) + REVERSESEQUENCE_KERNELS( BF16, I32, BF16, _axis1) + + REVERSESEQUENCE_KERNELS( F32, I32, F32, _axis2) + REVERSESEQUENCE_KERNELS( F32, I32, U32, _axis2) + REVERSESEQUENCE_KERNELS( F32, I32, I32, _axis2) + REVERSESEQUENCE_KERNELS( U32, I32, U32, _axis2) + REVERSESEQUENCE_KERNELS( U32, I32, F32, _axis2) + REVERSESEQUENCE_KERNELS( I32, I32, I32, _axis2) + REVERSESEQUENCE_KERNELS( I32, I32, F32, _axis2) + REVERSESEQUENCE_KERNELS( BF16, I32, BF16, _axis2) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _reversesequence_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _REVERSESEQUENCE_PARAM_NUM _cnt_of_array( _reversesequence_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_reversesequence_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vx_tensor input = (vx_tensor)param[0]; + vsi_nn_kernel_tensor_attr_t *input_attr = NULL; + vsi_size_array_t *input_shape = NULL; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input ); + CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + input_shape = input_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = (input_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0]; + gpu_param.global_size[1] = (input_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = (input_shape->data[2] + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2]; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (input_attr) + { + vsi_nn_kernel_tensor_attr_release(&input_attr); + } + + return status; +} /* _reversesequence_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * 
const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t batch_axis + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _reversesequence_kernel_map; + size_t kernel_map_size = _cnt_of_array( _reversesequence_kernel_map ); + vx_param_description_t * param_def = _reversesequence_kernel_param_def; + vx_kernel_initialize_f initializer = _reversesequence_initializer; + vsi_nn_kernel_batch_axis_type_e axis_type = _axis1; + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (batch_axis == 2) + { + axis_type = _axis2; + } + +#define _PACK_SELECT_KEY( in_dtype, out_dtype ) \ + (( in_dtype ) | (out_dtype << 8 )) + switch(_PACK_SELECT_KEY( in_dtype, out_dtype )) + { + case _PACK_SELECT_KEY(F16, F16): + case _PACK_SELECT_KEY(F32, F32): + key = REVERSESEQUENCE_HASH_KEY( F32, I32, F32, axis_type); + break; + case _PACK_SELECT_KEY(F16, U8): + case _PACK_SELECT_KEY(F32, U8): + key = REVERSESEQUENCE_HASH_KEY( F32, I32, U32, axis_type); + break; + case _PACK_SELECT_KEY(F16, I8): + case _PACK_SELECT_KEY(F32, I8): + case _PACK_SELECT_KEY(F16, I16): + case _PACK_SELECT_KEY(F32, I16): + key = REVERSESEQUENCE_HASH_KEY( F32, I32, I32, axis_type); + break; + case _PACK_SELECT_KEY(U8, U8): + key = REVERSESEQUENCE_HASH_KEY( U32, I32, U32, axis_type); + break; + case _PACK_SELECT_KEY(U8, F16): + case _PACK_SELECT_KEY(U8, F32): + key = REVERSESEQUENCE_HASH_KEY( U32, I32, F32, axis_type); + break; + case _PACK_SELECT_KEY(I8, I8): + case _PACK_SELECT_KEY(I16, I16): + key = REVERSESEQUENCE_HASH_KEY( I32, I32, I32, axis_type); + break; + case _PACK_SELECT_KEY(I8, F16): + case _PACK_SELECT_KEY(I8, F32): + case _PACK_SELECT_KEY(I16, F16): + case _PACK_SELECT_KEY(I16, F32): + key = REVERSESEQUENCE_HASH_KEY( I32, I32, F32, axis_type); + break; + case _PACK_SELECT_KEY(BF16, BF16): + key = REVERSESEQUENCE_HASH_KEY( BF16, I32, BF16, axis_type); + break; + default: + key = REVERSESEQUENCE_HASH_KEY( in_dtype, I32, out_dtype, axis_type); + break; + } + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _reversesequence_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_REVERSESEQUENCE_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t batch_axis = vsi_nn_kernel_param_get_int32(params, "batch_axis"); + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float 
inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float inoutScale = inputScale / outputScale; + float inoutTail = outputTail - inputTail * inoutScale; + + if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num )) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, batch_axis ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + uint32_t index = 3; + vsi_nn_kernel_node_pack_io( node_params, _REVERSESEQUENCE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inoutScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inoutTail ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _REVERSESEQUENCE_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( reversesequence, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c index d82816c..e897d0f 100644 --- a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c @@ -88,6 +88,7 @@ static vx_param_description_t _roi_align_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _ROI_ALIGN_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def ) @@ -105,8 +106,9 @@ static vx_param_description_t _roi_align_kernel_param_def[] = #define SCALAR_SAMPLING_Y_RATIO (15) #define SCALAR_DEPTH (16) #define SCALAR_FORMAT (17) +#define PLATFORM_TYPE (18) -#define ROI_ALIGN_PARAM_NUM 18 +#define ROI_ALIGN_PARAM_NUM 19 #define ROI_ALIGN_QUANT_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def ) /* @@ -250,6 +252,7 @@ static vsi_nn_kernel_node_t _setup float height_ratio = vsi_nn_kernel_param_get_float32( params, "height_ratio" ); int32_t width_sample_num = vsi_nn_kernel_param_get_int32( params, "width_sample_num" ); int32_t height_sample_num = vsi_nn_kernel_param_get_int32( params, "height_sample_num" ); + int32_t platform_type = vsi_nn_kernel_param_get_int32( params, "platform_type" ); float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); float input_scale = vsi_nn_get_tensor_scale(inputs[0]); float input_tail = -(input_zp * input_scale); @@ -318,6 +321,7 @@ static vsi_nn_kernel_node_t _setup node_params[SCALAR_SAMPLING_Y_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &sampling_y_ratio ); node_params[SCALAR_DEPTH] = vsi_nn_kernel_scalar_create( graph, I32, &depth ); node_params[SCALAR_FORMAT] = vsi_nn_kernel_scalar_create( graph, I32, &dtype ); + node_params[PLATFORM_TYPE] = vsi_nn_kernel_scalar_create( graph, I32, &platform_type ); /* Pass parameters to node. 
*/ status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); @@ -336,6 +340,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_DEPTH] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_DEPTH] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_FORMAT] ); + vsi_nn_kernel_scalar_release( &node_params[PLATFORM_TYPE] ); } } diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c index 5ec59b1..d409c4c 100644 --- a/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c @@ -110,7 +110,7 @@ static vsi_status cal_scatter_nd_tensor_reshape_size uint32_t i = 0; vsi_size_t elementCnt = 1; - if(coordDim != 0 && (width == NULL || area == NULL)) + if (coordDim != 0 && (width == NULL || area == NULL)) { return status; } @@ -118,17 +118,17 @@ static vsi_status cal_scatter_nd_tensor_reshape_size #define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; - for(i = 0; i < dims_num; ++i) + for (i = 0; i < dims_num; ++i) { elementCnt *= input_size[i]; } - for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) { sizes[i] = 1; } - if((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) { sizes[0] = block_size; sizes[1] = elementCnt / block_size; @@ -140,17 +140,17 @@ static vsi_status cal_scatter_nd_tensor_reshape_size return status; } - if(coordDim == 1) // index shape + if (coordDim == 1) // index shape { *width = 0; *area = 0; } - else if(coordDim == 2) + else if (coordDim == 2) { *width = input_size[dims_num - 2]; *area = 0; } - else if(coordDim == 3) + else if (coordDim == 3) { *width = input_size[dims_num - 3]; *area = input_size[dims_num - 3] * input_size[dims_num - 2]; @@ -226,30 +226,33 @@ static vsi_status _query_kernel input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if(coord_dim == 1) + if (coord_dim == 1) { coord_type = _1D; } - else if(coord_dim == 2) + else if (coord_dim == 2) { coord_type = _2D; } - else if(coord_dim == 3) + else if (coord_dim == 3) { coord_type = _3D; } + input1_dtype = input1_dtype == F16 ? F32 : input1_dtype; + output_dtype = output_dtype == F16 ? 
F32 : output_dtype; + key = HASH_SCATTER_ND_KEY( I32, input1_dtype, output_dtype, coord_type ); - for( i = 0; i < _cnt_of_array(scatter_nd_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_map); i ++ ) { - if( scatter_nd_map[i].key == key ) + if ( scatter_nd_map[i].key == key ) { break; } } - if( i < _cnt_of_array(scatter_nd_map) ) + if ( i < _cnt_of_array(scatter_nd_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_map[i].function_name ); kernel->info.parameters = _scatter_nd_kernel_param_def; @@ -287,26 +290,31 @@ static vsi_nn_kernel_node_t _setup int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; vsi_size_t width = 0, area = 0; - status = cal_scatter_nd_tensor_reshape_size(&inputs[0], shapes[0], coord_dim, 0, NULL, NULL, &rs_in_dim); - status |= cal_scatter_nd_tensor_reshape_size(&inputs[1], shapes[1], block_size, 0, NULL, NULL, &rs_idx_dim); - status |= cal_scatter_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim, - &width, &area, &rs_out_dim); - if(status != VSI_SUCCESS) + if (coord_dim > 3) { return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + status = cal_scatter_nd_tensor_reshape_size(&inputs[0], shapes[0], coord_dim, 0, NULL, NULL, &rs_in_dim); + status |= cal_scatter_nd_tensor_reshape_size(&inputs[1], shapes[1], block_size, 0, NULL, NULL, &rs_idx_dim); + status |= cal_scatter_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim, + &width, &area, &rs_out_dim); + if (status != VSI_SUCCESS) + { + return NULL; + } + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } status = _query_kernel( kernel, inputs, outputs, coord_dim ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 0; /* Pass parameters to node. 
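          width/area computed by cal_scatter_nd_tensor_reshape_size are the
          sizes of the leading index dimensions of the reshaped output
          (coordDim == 2: width = size[dims-2]; coordDim == 3:
          width = size[dims-3], area = size[dims-3] * size[dims-2]),
          presumably so the shader can linearize a 2- or 3-component
          coordinate into a single offset along the flattened axis.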
*/ diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c index fd72a9d..d5f2867 100644 --- a/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c @@ -111,12 +111,12 @@ static vsi_status cal_scatter_nd_update_tensor_reshape_size #define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; - for(i = 0; i < dims_num; ++i) + for (i = 0; i < dims_num; ++i) { elementCnt *= input_size[i]; } - for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) { sizes[i] = 1; } @@ -235,7 +235,7 @@ static vsi_status _query_kernel key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 0 ); - for( i = 0; i < _cnt_of_array(scatter_nd_update_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_update_map); i ++ ) { if ( scatter_nd_update_map[i].key == key ) { @@ -281,6 +281,13 @@ static vsi_nn_kernel_node_t _setup int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; vsi_size_t width = 0, area = 0, vol = 0; int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0; + vsi_size_t *input_size = inputs[2]->attr.size; + uint32_t dims_num = inputs[2]->attr.dim_num; + + if (coord_dim > 4 && input_size[dims_num - 1] > 1) + { + return NULL; + } status = cal_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0], coord_dim, 0, NULL, NULL, NULL, &rs_in_dim); diff --git a/src/tim/vx/internal/src/kernel/cl/swish_cl.c b/src/tim/vx/internal/src/kernel/cl/swish_cl.c index 4c3f206..b616a84 100644 --- a/src/tim/vx/internal/src/kernel/cl/swish_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/swish_cl.c @@ -113,6 +113,8 @@ static const _kernel_map_type _swish_kernel_map[] = SWISH_PACK_KERNEL_MAP_2D(U8, U8), SWISH_PACK_KERNEL_MAP(I32, I32), SWISH_PACK_KERNEL_MAP_2D(I32, I32), + SWISH_PACK_KERNEL_MAP(F32, U8), + SWISH_PACK_KERNEL_MAP_2D(F32, U8), HSWISH_PACK_KERNEL_FLOAT_MAP(F32, F32), HSWISH_PACK_KERNEL_FLOAT_MAP_2D(F32, F32), HSWISH_PACK_KERNEL_FLOAT_MAP(F16, F16), @@ -222,6 +224,11 @@ static vsi_status _query_kernel in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (in_dtype == F16) + in_dtype = F32; + if (out_dtype == F16) + out_dtype = F32; + key = SWISH_HASH_KEY(swish_type, in_dtype, out_dtype, image_2d); for( i = 0; i < kernel_map_size; i ++ ) diff --git a/src/tim/vx/internal/src/kernel/cl/tile_cl.c b/src/tim/vx/internal/src/kernel/cl/tile_cl.c index dab13f7..6381694 100644 --- a/src/tim/vx/internal/src/kernel/cl/tile_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/tile_cl.c @@ -279,7 +279,7 @@ static vsi_nn_kernel_node_t _setup vsi_size_t new_rank = 0; vsi_bool ret = FALSE; uint32_t dim = inputs[0]->attr.dim_num; - vsi_size_t multiples[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t multiples[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 }; for ( i = 0; i < dim; i++) { diff --git a/src/tim/vx/internal/src/kernel/cl/topk_cl.c b/src/tim/vx/internal/src/kernel/cl/topk_cl.c index a3d5428..0354a1e 100644 --- a/src/tim/vx/internal/src/kernel/cl/topk_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c @@ -55,6 +55,13 @@ __BEGIN_DECLS CVIVANTE_NAMESPACE("cl.topk_odd_even_sort_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \ "topk_odd_even_sort" } +#define TOPK_ODD_EVEN_SORT_HASH_KEY2( IN_DTYPE, OUT_DTYPE ) \ + ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) ) +#define PACK_ODD_EVEN_SORT_KERNEL_MAP2( IN_DTYPE, OUT_DTYPE ) \ + { TOPK_ODD_EVEN_SORT_HASH_KEY2( 
IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("cl.topk_odd_even_sort_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \ + "topk_odd_even_sort2" } + typedef struct { uint32_t key; @@ -88,6 +95,22 @@ static const _kernel_map_type _topk_kernel_map[] = PACK_KERNEL_MAP( I32, I32, 4 ), PACK_KERNEL_MAP( I32, I32, 5 ), PACK_KERNEL_MAP( I32, I32, 6 ), + + PACK_KERNEL_MAP( F32, U32, 0 ), + PACK_KERNEL_MAP( F32, U32, 1 ), + PACK_KERNEL_MAP( F32, U32, 2 ), + PACK_KERNEL_MAP( F32, U32, 3 ), + PACK_KERNEL_MAP( F32, U32, 4 ), + PACK_KERNEL_MAP( F32, U32, 5 ), + PACK_KERNEL_MAP( F32, U32, 6 ), + + PACK_KERNEL_MAP( F32, I32, 0 ), + PACK_KERNEL_MAP( F32, I32, 1 ), + PACK_KERNEL_MAP( F32, I32, 2 ), + PACK_KERNEL_MAP( F32, I32, 3 ), + PACK_KERNEL_MAP( F32, I32, 4 ), + PACK_KERNEL_MAP( F32, I32, 5 ), + PACK_KERNEL_MAP( F32, I32, 6 ), }; static const _kernel_map_type _topk_odd_even_sort_kernel_map[] = @@ -96,6 +119,8 @@ static const _kernel_map_type _topk_odd_even_sort_kernel_map[] = PACK_ODD_EVEN_SORT_KERNEL_MAP( F32, F32 ), PACK_ODD_EVEN_SORT_KERNEL_MAP( U32, U32 ), PACK_ODD_EVEN_SORT_KERNEL_MAP( I32, I32 ), + PACK_ODD_EVEN_SORT_KERNEL_MAP2( F32, U32 ), + PACK_ODD_EVEN_SORT_KERNEL_MAP2( F32, I32 ), }; /* @@ -108,11 +133,15 @@ static vx_param_description_t _topk_kernel_param_def[] = {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; #define _TOPK_PARAM_NUM _cnt_of_array( _topk_kernel_param_def ) -#define SCALAR_INPUT_NUM_STAGES (3) -#define SCALAR_INPUT_WIDTH (4) +#define SCALAR_INPUT_NUM_STAGES (7) +#define SCALAR_INPUT_WIDTH (8) static vx_param_description_t _topk_odd_even_sort_kernel_param_def[] = { @@ -122,10 +151,14 @@ static vx_param_description_t _topk_odd_even_sort_kernel_param_def[] = {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; #define _TOPK_ODD_EVEN_SORT_PARAM_NUM _cnt_of_array( _topk_odd_even_sort_kernel_param_def ) -#define SCALAR_INPUT_SIZE (5) +#define SCALAR_INPUT_SIZE (9) /* * Kernel initializer */ @@ -251,6 +284,22 @@ static vsi_status _query_kernel case _PACK_SELECT_KEY(I8, I8): key = TOPK_HASH_KEY( I32, I32, num_stages ); break; + case _PACK_SELECT_KEY(F32, U32): + case _PACK_SELECT_KEY(F16, U32): + case _PACK_SELECT_KEY(F32, U16): + case _PACK_SELECT_KEY(F16, U16): + case _PACK_SELECT_KEY(F32, U8): + case _PACK_SELECT_KEY(F16, U8): + key = TOPK_HASH_KEY( F32, U32, num_stages ); + break; + case _PACK_SELECT_KEY(F32, I32): + case _PACK_SELECT_KEY(F16, I32): + case _PACK_SELECT_KEY(F32, I16): + case _PACK_SELECT_KEY(F16, I16): + case _PACK_SELECT_KEY(F32, I8): + case _PACK_SELECT_KEY(F16, I8): + key = TOPK_HASH_KEY( F32, I32, num_stages ); + break; default: break; } @@ -318,6 +367,22 @@ static vsi_status _query_odd_even_sort_kernel case _PACK_SELECT_KEY(I8, I8): key = TOPK_ODD_EVEN_SORT_HASH_KEY( I32, I32 ); break; + case 
_PACK_SELECT_KEY(F32, U32): + case _PACK_SELECT_KEY(F16, U32): + case _PACK_SELECT_KEY(F32, U16): + case _PACK_SELECT_KEY(F16, U16): + case _PACK_SELECT_KEY(F32, U8): + case _PACK_SELECT_KEY(F16, U8): + key = TOPK_ODD_EVEN_SORT_HASH_KEY2( F32, U32 ); + break; + case _PACK_SELECT_KEY(F32, I32): + case _PACK_SELECT_KEY(F16, I32): + case _PACK_SELECT_KEY(F32, I16): + case _PACK_SELECT_KEY(F16, I16): + case _PACK_SELECT_KEY(F32, I8): + case _PACK_SELECT_KEY(F16, I8): + key = TOPK_ODD_EVEN_SORT_HASH_KEY2( F32, I32 ); + break; default: break; } @@ -372,14 +437,24 @@ static vsi_nn_kernel_node_t _setup int32_t num_stages = (int32_t)ceil(log10(block_size / 2.0f) / log10(2.0f)); vsi_bool is_odd_even_sort = FALSE; size_t param_num = _TOPK_PARAM_NUM; + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + + outputScale = 1.0f / outputScale; + inputTail = -(inputTail * inputScale); for (i = 1; i < inputs[0]->attr.dim_num; i ++) { block_num = block_num * inputs[0]->attr.size[i]; } - if( vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE || - outputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_INT32 ) + if ((vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE || + outputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_INT32 ) && + !(inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 && + (outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 || + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16))) { return NULL; } @@ -425,10 +500,15 @@ static vsi_nn_kernel_node_t _setup node = vsi_nn_kernel_create_node( graph, kernel ); if ( node ) { - /* Set inputs and outputs */ + uint32_t index = (uint32_t)(input_num + output_num); + /* Set inputs and outputs */ vsi_nn_kernel_node_pack_io( node_params, param_num, rs_tensors, input_num, &rs_tensors[input_num], output_num ); /* Pass parameters to node. 
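           The trailing four scalars carry the folded quant params:
           inputScale, inputTail = -(zp_in * scale_in),
           outputScale = 1 / scale_out and outputTail = zp_out. They are
           float values, so they ride in F32 scalars just as in the sibling
           pow/maxunpool CL paths.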
*/ + node_params[index++] = vsi_nn_kernel_scalar_create(graph, F32, &inputScale ); + node_params[index++] = vsi_nn_kernel_scalar_create(graph, F32, &inputTail ); + node_params[index++] = vsi_nn_kernel_scalar_create(graph, F32, &outputScale ); + node_params[index++] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail ); if (is_odd_even_sort) { node_params[SCALAR_INPUT_SIZE] = vsi_nn_kernel_scalar_create( @@ -452,8 +532,25 @@ final: vsi_safe_release_tensor(rs_tensors[2]); vsi_safe_release_tensor(rs_tensors[3]); vsi_safe_release_tensor(rs_tensors[4]); + if (is_odd_even_sort) { + if (node_params[5]) + { + vsi_nn_kernel_scalar_release( &node_params[5] ); + } + if (node_params[6]) + { + vsi_nn_kernel_scalar_release( &node_params[6] ); + } + if (node_params[7]) + { + vsi_nn_kernel_scalar_release( &node_params[7] ); + } + if (node_params[8]) + { + vsi_nn_kernel_scalar_release( &node_params[8] ); + } if (node_params[SCALAR_INPUT_SIZE]) { vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SIZE] ); @@ -461,6 +558,22 @@ final: } else { + if (node_params[3]) + { + vsi_nn_kernel_scalar_release( &node_params[3] ); + } + if (node_params[4]) + { + vsi_nn_kernel_scalar_release( &node_params[4] ); + } + if (node_params[5]) + { + vsi_nn_kernel_scalar_release( &node_params[5] ); + } + if (node_params[6]) + { + vsi_nn_kernel_scalar_release( &node_params[6] ); + } if (node_params[SCALAR_INPUT_NUM_STAGES]) { vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_NUM_STAGES] ); diff --git a/src/tim/vx/internal/src/kernel/cpu/add_mean_std_norm_cpu.c b/src/tim/vx/internal/src/kernel/cpu/add_mean_std_norm_cpu.c deleted file mode 100644 index f4b6eee..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/add_mean_std_norm_cpu.c +++ /dev/null @@ -1,243 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta.
- */ -#define _INPUT_NUM (2) -#define _OUTPUT_NUM (1) -#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.add_mean_std_norm") - - -/* - * Kernel params - */ -static vx_param_description_t _add_mean_std_norm_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _ADD_MEAN_STD_NORM_PARAM_NUM _cnt_of_array( _add_mean_std_norm_kernel_param_def ) - - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - int32_t i; - float mean = .0f, stddev_inv = .0f, variance = .0f, input_d = .0f, data = .0f, eps = .0f; - vsi_ssize_t v_size, n_batch, batch; - /* prepare data */ - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[_CPU_IO_NUM], &(eps)); - v_size = in_attr[0]->shape->data[0]; - n_batch = in_attr[0]->shape->data[1]; - - for (batch = 0; batch < n_batch; ++batch) - { - float sum = 0.0f; - float sum_sq = 0.0f; - vsi_ssize_t index_base = batch * v_size; - for (i = 0; i < v_size; ++i) - { - vsi_ssize_t index = i + index_base; - input_d = f32_in_buffer[0][index] + f32_in_buffer[1][index]; - sum += input_d; - sum_sq += input_d * input_d; - } - - mean = sum / v_size; - stddev_inv = 0.0f; - variance = sum_sq / v_size - mean * mean; - - if (variance == 0) - { - stddev_inv = (float)(1.0f / sqrt(eps)); - } - else - { - stddev_inv = (float)(1.0f / sqrt(variance)); - } - - for (i = 0; i < v_size; ++i) - { - vsi_ssize_t index = i + index_base; - input_d = f32_in_buffer[0][index] + f32_in_buffer[1][index]; - data = (input_d - mean) * stddev_inv; - f32_out_buffer[0][index] = data; - } - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - 
f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _add_mean_std_norm_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _add_mean_std_norm_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_ADD_MEAN_STD_NORM_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); - - status = _query_kernel( kernel, inputs, outputs ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _ADD_MEAN_STD_NORM_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[_CPU_IO_NUM] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _ADD_MEAN_STD_NORM_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[_CPU_IO_NUM] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( add_mean_std_norm, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c b/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c deleted file mode 100644 index 6bb8eeb..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c +++ /dev/null @@ -1,201 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "kernel/vsi_nn_kernel.h" -#include "kernel/vsi_nn_kernel_eltwise.h" -__BEGIN_DECLS - -#define _CPU_ARG_NUM (1) -#define _CPU_INPUT_NUM (1) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("argmax_sw") - -DEF_KERNEL_EXECUTOR(_argmax_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - int32_t i; - int32_t axis = 0; - vsi_ssize_t outerSize = 1; - vsi_ssize_t axisSize = 1; - vsi_ssize_t innerSize = 1; - vsi_ssize_t inner = 0; - vsi_ssize_t outer = 0; - - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); - buffer[1] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); - memset( buffer[1], 0, out_elements * sizeof(float) ); - - for (i = 0; i < axis; i++) - { - innerSize *= attr[0]->shape->data[i]; - } - - axisSize = attr[0]->shape->data[axis]; - - for (i = axis + 1; i < (int32_t)attr[0]->shape->size; i++) - { - outerSize *= attr[0]->shape->data[i]; - } - - for ( outer = 0; outer < outerSize; ++outer) - { - for ( inner = 0; inner < innerSize; ++inner) - { - float minMaxValue = buffer[0][outer * axisSize * innerSize + inner]; - int32_t minMaxIndex = 0; - for (i = 1; i < axisSize; ++i) - { - float value = buffer[0][(outer * axisSize + i) * innerSize + inner]; - if (value > minMaxValue) - { - minMaxValue = value; - minMaxIndex = i; - } - } - buffer[1][outer * innerSize + inner] = (float)minMaxIndex; - } - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - buffer[1], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - vsi_nn_kernel_tensor_attr_release( &attr[i] ); - } - return status; -} /* _minimum_exec() */ - -static vx_param_description_t kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, 
VX_PARAMETER_STATE_REQUIRED} -}; - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _argmax_exec; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -#define SCALAR_INPUT_AXIS (2) - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t axis = 0; - - axis = vsi_nn_kernel_param_get_int32(params, "axis"); - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( - graph, I32, &axis ); - - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_AXIS] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( argmax, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c b/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c deleted file mode 100644 index 3c9d6b9..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c +++ /dev/null @@ -1,202 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "kernel/vsi_nn_kernel.h" -#include "kernel/vsi_nn_kernel_eltwise.h" - -__BEGIN_DECLS - -#define _CPU_ARG_NUM (1) -#define _CPU_INPUT_NUM (1) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("argmin_sw") - -DEF_KERNEL_EXECUTOR(_argmin_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - int32_t i; - int32_t axis = 0; - vsi_ssize_t outerSize = 1; - vsi_ssize_t axisSize = 1; - vsi_ssize_t innerSize = 1; - vsi_ssize_t inner = 0; - vsi_ssize_t outer = 0; - - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); - buffer[1] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); - memset( buffer[1], 0, out_elements * sizeof(float) ); - - for (i = 0; i < axis; i++) - { - innerSize *= attr[0]->shape->data[i]; - } - - axisSize = attr[0]->shape->data[axis]; - - for (i = axis + 1; i < (int32_t)attr[0]->shape->size; i++) - { - outerSize *= attr[0]->shape->data[i]; - } - - for ( outer = 0; outer < outerSize; ++outer) - { - for ( inner = 0; inner < innerSize; ++inner) - { - float minMaxValue = buffer[0][outer * axisSize * innerSize + inner]; - int32_t minMaxIndex = 0; - for (i = 1; i < axisSize; ++i) - { - float value = buffer[0][(outer * axisSize + i) * innerSize + inner]; - if (value < minMaxValue) - { - minMaxValue = value; - minMaxIndex = i; - } - } - buffer[1][outer * innerSize + inner] = (float)minMaxIndex; - } - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - buffer[1], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - vsi_nn_kernel_tensor_attr_release( &attr[i] ); - } - return status; -} /* _minimum_exec() */ - -static vx_param_description_t kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} -}; - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = 
_argmin_exec; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -#define SCALAR_INPUT_AXIS (2) - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t axis = 0; - - axis = vsi_nn_kernel_param_get_int32(params, "axis"); - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( - graph, I32, &axis ); - - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_AXIS] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( argmin, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/axis_aligned_bbox_transform_cpu.c b/src/tim/vx/internal/src/kernel/cpu/axis_aligned_bbox_transform_cpu.c deleted file mode 100644 index 9d39e21..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/axis_aligned_bbox_transform_cpu.c +++ /dev/null @@ -1,277 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (4) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.axis_aligned_bbox_transform") - -typedef struct vsi_nn_box_encoding_corner_t -{ - float x1, y1, x2, y2; -}vsi_nn_box_encoding_corner; - -typedef struct vsi_nn_box_encoding_center_t -{ - float w, h, x, y; -}vsi_nn_box_encoding_center; - -/* - * Kernel params - */ -static vx_param_description_t _axis_aligned_bbox_transform_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM _cnt_of_array( _axis_aligned_bbox_transform_kernel_param_def ) - - -static void _to_box_encoding_corner - ( - vsi_nn_box_encoding_center* ctr, - vsi_nn_box_encoding_corner* cnr - ) -{ - cnr->x1 = ctr->x - ctr->w / 2; - cnr->y1 = ctr->y - ctr->h / 2; - cnr->x2 = ctr->x + ctr->w / 2; - cnr->y2 = ctr->y + ctr->h / 2; -} - -static void _to_box_encoding_center - ( - vsi_nn_box_encoding_corner* cnr, - vsi_nn_box_encoding_center* ctr - ) -{ - ctr->w = cnr->x2 - cnr->x1; - ctr->h = cnr->y2 - cnr->y1; - ctr->x = (cnr->x1 + cnr->x2) / 2; - ctr->y = (cnr->y1 + cnr->y2) / 2; -} - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - const uint32_t roiLength = 4; - const uint32_t imageLength = 2; - vsi_size_t numClasses = 0; - vsi_size_t numRois = 0; - vsi_size_t j; - vsi_size_t roiIndex; - - /* prepare data */ - for (i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for (i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - numClasses = in_attr[1]->shape->data[0] / roiLength; - numRois = in_attr[0]->shape->data[1]; - - for (roiIndex = 0; roiIndex < numRois; roiIndex++) - { - uint32_t batchIndex = (uint32_t)f32_in_buffer[2][roiIndex]; - float imageHeight = f32_in_buffer[3][batchIndex * imageLength]; - float imageWidth = f32_in_buffer[3][batchIndex * imageLength + 1]; - vsi_nn_box_encoding_corner roi_cnr; - vsi_nn_box_encoding_center 
roiBefore; - roi_cnr.x1 = f32_in_buffer[0][roiIndex * roiLength]; - roi_cnr.y1 = f32_in_buffer[0][roiIndex * roiLength + 1]; - roi_cnr.x2 = f32_in_buffer[0][roiIndex * roiLength + 2]; - roi_cnr.y2 = f32_in_buffer[0][roiIndex * roiLength + 3]; - _to_box_encoding_center(&roi_cnr, &roiBefore); - - for (j = 0; j < numClasses; j++) - { - vsi_nn_box_encoding_center roi_ctr; - vsi_nn_box_encoding_corner roiAfter; - vsi_nn_box_encoding_corner cliped; - vsi_size_t index = (roiIndex * numClasses + j) * roiLength; - - roi_ctr.w = (float)(exp(f32_in_buffer[1][index + 2]) * roiBefore.w); - roi_ctr.h = (float)(exp(f32_in_buffer[1][index + 3]) * roiBefore.h); - roi_ctr.x = roiBefore.x + f32_in_buffer[1][index] * roiBefore.w; - roi_ctr.y = roiBefore.y + f32_in_buffer[1][index + 1] * roiBefore.h; - _to_box_encoding_corner(&roi_ctr, &roiAfter); - - cliped.x1 = vsi_nn_min(vsi_nn_max(roiAfter.x1, 0.0f), imageWidth); - cliped.y1 = vsi_nn_min(vsi_nn_max(roiAfter.y1, 0.0f), imageHeight); - cliped.x2 = vsi_nn_min(vsi_nn_max(roiAfter.x2, 0.0f), imageWidth); - cliped.y2 = vsi_nn_min(vsi_nn_max(roiAfter.y2, 0.0f), imageHeight); - f32_out_buffer[0][index] = cliped.x1; - f32_out_buffer[0][index + 1] = cliped.y1; - f32_out_buffer[0][index + 2] = cliped.x2; - f32_out_buffer[0][index + 3] = cliped.y2; - } - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for (i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _axis_aligned_bbox_transform_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _axis_aligned_bbox_transform_kernel_param_def ); - status = VSI_SUCCESS; - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM, - inputs, input_num, outputs, output_num ); - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( axis_aligned_bbox_transform, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c b/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c deleted file mode 100644 index dcf7940..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c +++ /dev/null @@ -1,222 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_tensor_util.h" -#include "kernel/vsi_nn_kernel.h" -#include "kernel/vsi_nn_kernel_eltwise.h" - -__BEGIN_DECLS - -#define _CPU_ARG_NUM (1) -#define _CPU_INPUT_NUM (5) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("batch_norm_sw") - -static vsi_ssize_t _expand_offset - ( - vsi_ssize_t index, - vsi_size_t * shape, vsi_size_t rank, - vsi_size_t * strides, vsi_size_t * out_shape - ) -{ - vsi_size_t i; - vsi_ssize_t offset = 0; - - for( i = 0; i < rank && index; i ++ ) - { - if( shape[i] == out_shape[i] ) - { - offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); - } - index /= out_shape[i]; - } - return offset; -} - -DEF_KERNEL_EXECUTOR(_batch_norm_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VX_SUCCESS; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - vsi_size_t out_elements = 0; - vsi_size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i = 0; - float eps = 0.f; - - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[param_size - 1], &eps); - CHECK_STATUS_FAIL_GOTO(status, final ); - - for ( i = 0; i < _CPU_INPUT_NUM; i++) - { - tensors[i] = (vsi_nn_kernel_tensor_t)param[i]; - attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] ); 
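/* Editorial sketch, not part of the diff: the _expand_offset helper defined
 * above is what lets this kernel broadcast the per-channel mean, variance,
 * gamma and beta tensors over the full input shape. A dimension contributes
 * to the input offset only when its extent matches the output's, so size-1
 * dimensions are re-read for every output element along them. The shapes,
 * strides and standalone function below are illustrative only. */
#include <stdio.h>

static long expand_offset(long index, const long *shape, long rank,
                          const long *strides, const long *out_shape)
{
    long offset = 0;
    long i;
    for (i = 0; i < rank && index; i++)
    {
        /* broadcast (size-1) dimensions add nothing to the offset */
        if (shape[i] == out_shape[i])
        {
            offset += strides[i] * (index % out_shape[i]);
        }
        index /= out_shape[i];
    }
    return offset;
}

int main(void)
{
    long shape[2]     = {3, 1};   /* e.g. a per-channel parameter */
    long strides[2]   = {1, 3};
    long out_shape[2] = {3, 4};   /* the full input shape         */
    long i;
    for (i = 0; i < 12; i++)
    {
        /* prints input offsets 0,1,2 repeating across all four columns */
        printf("out %2ld <- in %ld\n", i, expand_offset(i, shape, 2, strides, out_shape));
    }
    return 0;
}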
- - vsi_nn_kernel_tensor_attr_get_stride( attr[i], stride_size[i] ); - buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[i], "Create input buffer fail.", final ); - } - - tensors[5] = (vsi_nn_kernel_tensor_t)param[5]; - attr[5] = vsi_nn_kernel_tensor_attr_create( tensors[5] ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[5] ); - - buffer[5] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[5], "Create output buffer fail.", final ); - memset( buffer[5], 0, out_elements * sizeof(float) ); - - for( i = 0; i < out_elements; i ++ ) - { - vsi_ssize_t in_offset[5] = {0}; - int32_t j = 0; - float src = 0.f; - float mean = 0.f; - float variance = 0.f; - float beta = 0.f; - float gamma = 0.f; - - for ( j = 0; j < 5; j++) - { - in_offset[j] = _expand_offset( i, attr[j]->shape->data, (vsi_size_t)attr[j]->shape->size, - stride_size[j], attr[5]->shape->data ); - } - - src = buffer[0][in_offset[0]]; - mean = buffer[1][in_offset[1]]; - variance = buffer[2][in_offset[2]]; - gamma = buffer[3][in_offset[3]]; - beta = buffer[4][in_offset[4]]; - - - buffer[5][i] = (src - mean) * gamma/ sqrtf(variance + eps) + beta; - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[5], attr[5], - buffer[5], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - vsi_nn_kernel_tensor_attr_release( &attr[i] ); - } - return status; -} /* _batch_norm_exec() */ - -static vx_param_description_t kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -#define SCALAR_INPUT_EPS (6) - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _batch_norm_exec; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - float eps = 0; - - eps = vsi_nn_kernel_param_get_float32(params, "eps"); - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - /* Pass parameters to node. 
*/ - backend_params[SCALAR_INPUT_EPS] = vsi_nn_kernel_scalar_create( - graph, F32, &eps ); - - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - - vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_EPS] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( batchnorm_single, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/box_with_nms_limit_cpu.c b/src/tim/vx/internal/src/kernel/cpu/box_with_nms_limit_cpu.c deleted file mode 100644 index 28a5763..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/box_with_nms_limit_cpu.c +++ /dev/null @@ -1,534 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (3) -#define _OUTPUT_NUM (4) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.box_with_nms_limit") - -/* - * Kernel params - */ -static vx_param_description_t _box_with_nms_limit_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _BOX_WITH_NMS_LIMIT_PARAM_NUM _cnt_of_array( _box_with_nms_limit_kernel_param_def ) -#define SCORE_THRESHOLD (7) -#define MAX_NUM_DETECTIONS (8) -#define NMS_KERNEL_METHOD (9) -#define IOU_THRESHOLD (10) -#define SIGMA (11) -#define NMS_SCORE_THRESHOLD (12) - -static float hard_nms_kernel - ( - float iou, - float iouThreshold - ) -{ - return iou < iouThreshold ? 1.0f : 0.0f; -} - -static float linear_nms_kernel - ( - float iou, - float iouThreshold - ) -{ - return iou < iouThreshold ? 1.0f : 1.0f - iou; -} - -static float gaussian_nms_kernel - ( - float iou, - float sigma - ) -{ - return (float)(exp(-1.0f * iou * iou / sigma)); -} - -void swap_element - ( - uint32_t* list, - uint32_t first, - uint32_t second - ) -{ - uint32_t temp = list[first]; - list[first] = list[second]; - list[second] = temp; -} - -uint32_t max_element - ( - float* data, - uint32_t* index_list, - uint32_t len - ) -{ - uint32_t i; - uint32_t max_index = 0; - float max_val = data[index_list[0]]; - for(i = 1; i < len; i++) - { - float val = data[index_list[i]]; - if (max_val < val) - { - max_val = val; - max_index = i; - } - } - return max_index; -} - -static uint32_t max_comp_func - ( - void* data, - int32_t left, - int32_t right - ) -{ - float* fdata = (float*)data; - return fdata[left] >= fdata[right]; -} - -void sort_element_by_score - ( - float* data, - uint32_t* index_list, - uint32_t len - ) -{ - vsi_nn_partition(data, 0, len - 1, max_comp_func, TRUE, index_list); -} - -typedef struct -{ - float* fdata; - uint32_t numClasses; -} class_comp_param; - -static uint32_t class_comp_func - ( - void* data, - int32_t left, - int32_t right - ) -{ - class_comp_param *p = (class_comp_param*)data; - float* fdata = p->fdata; - uint32_t numClasses = p->numClasses; - uint32_t lhsClass = left % numClasses, rhsClass = right % numClasses; - return lhsClass == rhsClass ? fdata[left] > fdata[right] - : lhsClass < rhsClass; -} - -static void sort_element_by_class - ( - float* data, - uint32_t* index_list, - uint32_t len, - uint32_t numClasses - ) -{ - class_comp_param class_comp; - class_comp.fdata = data; - class_comp.numClasses = numClasses; - vsi_nn_partition(&class_comp, 0, len - 1, class_comp_func, TRUE, index_list); -} - -// Taking two indices of bounding boxes, return the intersection-of-union. 
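/* Editorial aside, not part of the diff: a worked example of the three
 * suppression kernels defined above. With iou = 0.6, iou_threshold = 0.5
 * and sigma = 0.5, a lower-scored overlapping candidate has its score
 * rescaled by:
 *
 *   hard_nms_kernel:     iou < thr ? 1.0 : 0.0       -> 0.0     (classic NMS: drop)
 *   linear_nms_kernel:   iou < thr ? 1.0 : 1.0 - iou -> 0.4     (soft-NMS, linear)
 *   gaussian_nms_kernel: exp(-iou * iou / sigma)     -> ~0.4868 (soft-NMS, gaussian)
 *
 * Rescaled scores that later fall below nms_score_threshold are swapped to
 * the tail of the selection list and discarded. */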
-float getIoUAxisAligned - ( - const float* roi1, - const float* roi2 - ) -{ - const float area1 = (roi1[2] - roi1[0]) * (roi1[3] - roi1[1]); - const float area2 = (roi2[2] - roi2[0]) * (roi2[3] - roi2[1]); - const float x1 = vsi_nn_max(roi1[0], roi2[0]); - const float x2 = vsi_nn_min(roi1[2], roi2[2]); - const float y1 = vsi_nn_max(roi1[1], roi2[1]); - const float y2 = vsi_nn_min(roi1[3], roi2[3]); - const float w = vsi_nn_max(x2 - x1, 0.0f); - const float h = vsi_nn_max(y2 - y1, 0.0f); - const float areaIntersect = w * h; - const float areaUnion = area1 + area2 - areaIntersect; - return areaIntersect / areaUnion; -} -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - int32_t* int32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - int32_t* int32_out_buffer[_OUTPUT_NUM] = {0}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i = 0; - float score_threshold = 0; - int32_t max_num_detections = 0; - int32_t nms_kernel_method = 0; - float iou_threshold = 0; - float sigma = 0; - float nms_score_threshold = 0; - uint32_t j = 0, n = 0, b = 0, c = 0; - const uint32_t kRoiDim = 4; - uint32_t numRois = 0; - uint32_t numClasses = 0; - int32_t ind = 0; - uint32_t * batch_data = NULL; - int32_t numBatch = 0; - uint32_t * select = NULL; - uint32_t select_size = 0; - uint32_t scores_index = 0; - uint32_t roi_index = 0; - uint32_t roi_out_index = 0; - - /* prepare data */ - for (i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - if (i == 2) - { - int32_in_buffer[i] = (int32_t*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( int32_in_buffer[i], "Create input buffer fail.", final ); - } - else - { - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final ); - } - } - - for (i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - if (i < 2) - { - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - else - { - int32_out_buffer[i] = (int32_t *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( int32_out_buffer[i], "Create output buffer fail.", final ); - memset( int32_out_buffer[i], 0, out_bytes[i] ); - } - } - -#define VSI_NN_KERNEL_READ_SCALAR(type, idx, pointer) \ - vsi_nn_kernel_scalar_read_##type((vsi_nn_kernel_scalar_t)param[idx], pointer) - - status = VSI_NN_KERNEL_READ_SCALAR(float32, SCORE_THRESHOLD, &score_threshold); - status |= VSI_NN_KERNEL_READ_SCALAR(int32, 
MAX_NUM_DETECTIONS, &max_num_detections); - status |= VSI_NN_KERNEL_READ_SCALAR(int32, NMS_KERNEL_METHOD, &nms_kernel_method); - status |= VSI_NN_KERNEL_READ_SCALAR(float32, IOU_THRESHOLD, &iou_threshold); - status |= VSI_NN_KERNEL_READ_SCALAR(float32, SIGMA, &sigma); - status |= VSI_NN_KERNEL_READ_SCALAR(float32, NMS_SCORE_THRESHOLD, &nms_score_threshold); - CHECK_STATUS_FAIL_GOTO(status, final ); -#undef VSI_NN_KERNEL_READ_SCALAR - - numRois = (uint32_t)in_attr[0]->shape->data[1]; - numClasses = (uint32_t)in_attr[0]->shape->data[0]; - - batch_data = (uint32_t*)malloc(numRois * sizeof(uint32_t)); - CHECK_PTR_FAIL_GOTO( batch_data, "Create batch_data fail.", final ); - memset(batch_data, 0, numRois * sizeof(uint32_t)); - - for (i = 0, ind = -1; i < numRois; i++) - { - if (int32_in_buffer[2][i] != ind) - { - ind = int32_in_buffer[2][i]; - numBatch++; - } - batch_data[numBatch - 1]++; - } - select = (uint32_t*)malloc(numBatch * numRois - * numClasses * sizeof(uint32_t)); - CHECK_PTR_FAIL_GOTO( select, "Create select fail.", final ); - memset(select, 0, numBatch * numRois * numClasses * sizeof(uint32_t)); - for (n = 0; n < (uint32_t)numBatch; n++) - { - int32_t numDetections_batch = 0; - uint32_t select_start_batch = select_size; - uint32_t select_len = 0; - // Exclude class 0 (background) - for (c = 1; c < numClasses; c++) - { - uint32_t select_start = select_size; - int32_t maxNumDetections0 = max_num_detections; - uint32_t numDetections = 0; - for (b = 0; b < batch_data[n]; b++) - { - uint32_t index = b * numClasses + c; - float score = f32_in_buffer[0][scores_index + index]; - if (score > score_threshold) { - select[select_size] = index; - select_size++; - } - } - select_len = select_size - select_start; - - if (maxNumDetections0 < 0) - { - maxNumDetections0 = select_len; - } - - for (j = 0; (j < select_len && numDetections < (uint32_t)maxNumDetections0); j++) - { - // find max score and swap to the front. - int32_t max_index = max_element(&(f32_in_buffer[0][scores_index]), - &(select[select_start + j]), select_len - j) + j; - - swap_element(&(select[select_start]), max_index, j); - - // Calculate IoU of the rest, swap to the end (disgard) if needed. - for (i = j + 1; i < select_len; i++) - { - int32_t roiBase0 = roi_index + select[select_start + i] * kRoiDim; - int32_t roiBase1 = roi_index + select[select_start + j] * kRoiDim; - float iou = getIoUAxisAligned(&(f32_in_buffer[1][roiBase0]), - &(f32_in_buffer[1][roiBase1])); - float kernel_iou; - if (nms_kernel_method == 0) - { - kernel_iou = hard_nms_kernel(iou, iou_threshold); - } - else if (nms_kernel_method == 1) - { - kernel_iou = linear_nms_kernel(iou, iou_threshold); - } - else - { - kernel_iou = gaussian_nms_kernel(iou, sigma); - } - f32_in_buffer[0][scores_index + select[select_start + i]] *= kernel_iou; - if (f32_in_buffer[0][scores_index + select[select_start + i]] < nms_score_threshold) - { - swap_element(&(select[select_start]), i, select_len - 1); - i--; - select_len--; - } - } - numDetections++; - } - select_size = select_start + select_len; - numDetections_batch += numDetections; - } - - // Take top max_num_detections. - sort_element_by_score(&(f32_in_buffer[0][scores_index]), &(select[select_start_batch]), - numDetections_batch); - - if (numDetections_batch > max_num_detections && max_num_detections >= 0) - { - select_size = select_start_batch + max_num_detections; - } - select_len = select_size - select_start_batch; - // Sort again by class. 
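/* Editorial aside, not part of the diff: a worked IoU example for the
 * suppression loop above. For roi1 = {0, 0, 2, 2} and roi2 = {1, 1, 3, 3}
 * (x1, y1, x2, y2), getIoUAxisAligned computes:
 *   area1 = area2 = 2 * 2 = 4
 *   overlap: w = min(2, 3) - max(0, 1) = 1, h = 1, areaIntersect = 1
 *   areaUnion = 4 + 4 - 1 = 7
 *   IoU = 1 / 7 ~= 0.143
 * which is below a typical iou_threshold of 0.5, so with the hard kernel
 * neither box suppresses the other. */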
- sort_element_by_class(&(f32_in_buffer[0][scores_index]), &(select[select_start_batch]), - select_len, numClasses); - - for (i = 0; i < select_len; i++) - { - int32_t in_index0 = scores_index + select[select_start_batch + i]; - int32_t in_index1 = roi_index + select[select_start_batch + i] * kRoiDim; - f32_out_buffer[0][roi_out_index] = f32_in_buffer[0][in_index0]; - memcpy(&(f32_out_buffer[1][roi_out_index * kRoiDim]), - &f32_in_buffer[1][in_index1], kRoiDim * sizeof(float)); - int32_out_buffer[2][roi_out_index] = select[select_start_batch + i] % numClasses; - int32_out_buffer[3][roi_out_index] = n; - roi_out_index++; - } - - scores_index += batch_data[n] * numClasses; - roi_index += batch_data[n] * numClasses * kRoiDim; - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (i < 2) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - } - else - { - status = vsi_nn_kernel_tensor_write( output[i], out_attr[i], - int32_out_buffer[i], out_bytes[i] ); - } - CHECK_STATUS_FAIL_GOTO( status, final ); - } -final: - vsi_nn_safe_free(batch_data); - vsi_nn_safe_free(select); - for (i = 0; i < _INPUT_NUM; i++) - { - vsi_nn_safe_free(f32_in_buffer[i]); - vsi_nn_safe_free(int32_in_buffer[i]); - - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for (i = 0; i < _OUTPUT_NUM; i++) - { - vsi_nn_safe_free(f32_out_buffer[i]); - vsi_nn_safe_free(int32_out_buffer[i]); - - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_SUCCESS; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _box_with_nms_limit_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _box_with_nms_limit_kernel_param_def ); - - return status; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_BOX_WITH_NMS_LIMIT_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - float score_threshold = vsi_nn_kernel_param_get_float32( params, "score_threshold" ); - int32_t max_num_detections = vsi_nn_kernel_param_get_int32( params, "max_num_detections" ); - int32_t nms_kernel_method = vsi_nn_kernel_param_get_int32( params, "nms_kernel_method" ); - float iou_threshold = vsi_nn_kernel_param_get_float32( params, "iou_threshold" ); - float sigma = vsi_nn_kernel_param_get_float32( params, "sigma" ); - float nms_score_threshold = vsi_nn_kernel_param_get_float32( params, "nms_score_threshold" ); - - status = _query_kernel( kernel, inputs, outputs ); - if ( VSI_SUCCESS == status ) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _BOX_WITH_NMS_LIMIT_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCORE_THRESHOLD] = vsi_nn_kernel_scalar_create( graph, F32, &score_threshold ); - node_params[MAX_NUM_DETECTIONS] = vsi_nn_kernel_scalar_create( graph, I32, 
&max_num_detections );
-            node_params[NMS_KERNEL_METHOD] = vsi_nn_kernel_scalar_create( graph, I32, &nms_kernel_method );
-            node_params[IOU_THRESHOLD] = vsi_nn_kernel_scalar_create( graph, F32, &iou_threshold );
-            node_params[SIGMA] = vsi_nn_kernel_scalar_create( graph, F32, &sigma );
-            node_params[NMS_SCORE_THRESHOLD] = vsi_nn_kernel_scalar_create( graph, F32, &nms_score_threshold );
-            /* Pass parameters to node. */
-            status = vsi_nn_kernel_node_pass_param( node, node_params, _BOX_WITH_NMS_LIMIT_PARAM_NUM );
-            vsi_nn_kernel_scalar_release( &node_params[SCORE_THRESHOLD] );
-            vsi_nn_kernel_scalar_release( &node_params[MAX_NUM_DETECTIONS] );
-            vsi_nn_kernel_scalar_release( &node_params[NMS_KERNEL_METHOD] );
-            vsi_nn_kernel_scalar_release( &node_params[IOU_THRESHOLD] );
-            vsi_nn_kernel_scalar_release( &node_params[SIGMA] );
-            vsi_nn_kernel_scalar_release( &node_params[NMS_SCORE_THRESHOLD] );
-        }
-    }
-
-    return node;
-} /* _setup() */
-
-__END_DECLS
-
-REGISTER_BACKEND_CPU( box_with_nms_limit, _setup )
diff --git a/src/tim/vx/internal/src/kernel/cpu/bucketize_cpu.c b/src/tim/vx/internal/src/kernel/cpu/bucketize_cpu.c
deleted file mode 100644
index b5bfbcb..0000000
--- a/src/tim/vx/internal/src/kernel/cpu/bucketize_cpu.c
+++ /dev/null
@@ -1,229 +0,0 @@
-/****************************************************************************
-*
-* Copyright (c) 2020 Vivante Corporation
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-* DEALINGS IN THE SOFTWARE.
-*
-*****************************************************************************/
-
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include "vsi_nn_types.h"
-#include "vsi_nn_tensor.h"
-#include "vsi_nn_graph.h"
-#include "vsi_nn_log.h"
-#include "vsi_nn_error.h"
-#include "vsi_nn_prv.h"
-#include "vsi_nn_tensor_util.h"
-#include "utils/vsi_nn_util.h"
-#include "kernel/vsi_nn_kernel.h"
-
-__BEGIN_DECLS
-
-/*
- * Define kernel meta.
- */ -#define _INPUT_NUM (2) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.bucketize") - - -/* - * Kernel params - */ -static vx_param_description_t _bucketize_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _BUCKETIZE_PARAM_NUM _cnt_of_array( _bucketize_kernel_param_def ) -#define SCALAR_RIGHT_VALUE (3) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i = 0, j = 0; - int32_t right = 0; - uint32_t boundaries_size = 0; - - /* prepare data */ - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_RIGHT_VALUE], &(right)); - - boundaries_size = (uint32_t)in_attr[1]->shape->data[0]; - - for (i = 0; i < out_elements[0]; i++) - { - float src0 = f32_in_buffer[0][i]; - float dst = 0; - - for (j = 0; j < boundaries_size; j++) - { - float src1 = f32_in_buffer[1][j]; - - if (right == 1) - { - dst += (src0 >= src1 ? 1.0f : 0.0f); - } - else - { - dst += (src0 > src1 ? 
1.0f : 0.0f); - } - } - - f32_out_buffer[0][i] = dst; - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _bucketize_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _bucketize_kernel_param_def ); - status = VSI_SUCCESS; - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_BUCKETIZE_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - int32_t right = vsi_nn_kernel_param_get_int32( params, "right" ); - - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _BUCKETIZE_PARAM_NUM, - inputs, input_num, outputs, output_num ); - /* Pass parameters to node. */ - node_params[SCALAR_RIGHT_VALUE] = vsi_nn_kernel_scalar_create( graph, I32, &right ); - status = vsi_nn_kernel_node_pass_param( node, node_params, _BUCKETIZE_PARAM_NUM ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_RIGHT_VALUE] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( bucketize, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cpu/cast_cpu.c b/src/tim/vx/internal/src/kernel/cpu/cast_cpu.c deleted file mode 100644 index 79cacfc..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/cast_cpu.c +++ /dev/null @@ -1,217 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" -#include "utils/vsi_nn_dtype_util_prv.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. - */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.cast") - -/* - * Kernel params - */ -static vx_param_description_t _cast_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _CAST_PARAM_NUM _cnt_of_array( _cast_kernel_param_def ) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - double max_value = 0.0f, min_value = 0.0f; - vsi_bool clamp_flag = FALSE; - vsi_nn_type_e out_type; - /* prepare data */ - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - in_attr[i]->quant = VSI_NN_KERNEL_QUANT_NONE; - in_attr[i]->dfp.fl = 0; - in_attr[i]->asymm.scale = 1.0f; - in_attr[i]->asymm.zero_point = 0; - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - out_type = vsi_nn_dtype_map_kernel(out_attr[0]->dtype); - - if( type_is_integer( out_type ) ) - { - clamp_flag = TRUE; - type_get_range(out_type, &max_value, &min_value); - } - - for (i = 0; i < out_elements[0]; i++) - { - float val = f32_in_buffer[0][i]; - if (clamp_flag) - { - val = vsi_nn_clamp(val, (float)min_value, (float)max_value); - } - f32_out_buffer[0][i] = val; 
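/* Editorial aside, not part of the diff: for integer destination types the
 * clamp above saturates out-of-range values before the float buffer is
 * converted to the output dtype in vsi_nn_kernel_tensor_write_from_float.
 * With an int8 output, type_get_range() yields bounds of [-128, 127]
 * (illustrative values), so:
 *   300.0f  -> 127.0f
 *   -300.0f -> -128.0f
 *   3.7f    -> 3.7f   (already in range, passed through unchanged)
 */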
- } - - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - out_attr[i]->quant = VSI_NN_KERNEL_QUANT_NONE; - out_attr[i]->dfp.fl = 0; - out_attr[i]->asymm.scale = 1.0f; - out_attr[i]->asymm.zero_point = 0; - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _cast_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _cast_kernel_param_def ); - status = VSI_SUCCESS; - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_CAST_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( kernel, inputs, outputs ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _CAST_PARAM_NUM, - inputs, input_num, outputs, output_num ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _CAST_PARAM_NUM ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( cast, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/clip_cpu.c b/src/tim/vx/internal/src/kernel/cpu/clip_cpu.c deleted file mode 100644 index 5bb08de..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/clip_cpu.c +++ /dev/null @@ -1,217 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. - */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.clip") - - -/* - * Kernel params - */ -static vx_param_description_t _clip_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _CLIP_PARAM_NUM _cnt_of_array( _clip_kernel_param_def ) - -#define SCALAR_MIN_VALUE (2) -#define SCALAR_MAX_VALUE (3) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - float min_value = 0.0f; - float max_value = 0.0f; - - /* prepare data */ - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MIN_VALUE], &(min_value)); - vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MAX_VALUE], &(max_value)); - - for (i = 0; i < out_elements[0]; i++) - { - f32_out_buffer[0][i] = vsi_nn_clamp(f32_in_buffer[0][i], min_value, max_value); - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - 
free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _clip_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _clip_kernel_param_def ); - status = VSI_SUCCESS; - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_CLIP_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - float min_value = vsi_nn_kernel_param_get_float32( params, "min_value" ); - float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" ); - - status = _query_kernel( kernel, inputs, outputs ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _CLIP_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_MIN_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &min_value ); - node_params[SCALAR_MAX_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &max_value ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _CLIP_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_MIN_VALUE] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_MAX_VALUE] ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( clip, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c b/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c deleted file mode 100644 index a43f2f3..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c +++ /dev/null @@ -1,269 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - - -#define _CPU_ARG_NUM (1) -#define _CPU_INPUT_NUM (2) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("comparisons_sw") - -typedef enum -{ - COMP_GREAT = VSI_NN_RELATIONAL_OPS_GREAT, - COMP_GREAT_EQUAL = VSI_NN_RELATIONAL_OPS_GREAT_EQUAL, - COMP_LESS = VSI_NN_RELATIONAL_OPS_LESS, - COMP_LESS_EQUAL = VSI_NN_RELATIONAL_OPS_LESS_EQUAL, - COMP_NOT_EQUAL = VSI_NN_RELATIONAL_OPS_NOT_EQUAL, - COMP_EQUAL = VSI_NN_RELATIONAL_OPS_EQUAL, -} relational_type_e; - - -static vsi_ssize_t _expand_offset - ( - vsi_ssize_t index, - vsi_size_t * shape, vsi_size_t rank, - vsi_size_t * strides, vsi_size_t * out_shape - ) -{ - vsi_size_t i; - vsi_ssize_t offset = 0; - - for( i = 0; i < rank && index; i ++ ) - { - if( shape[i] == out_shape[i] ) - { - offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); - } - index /= out_shape[i]; - } - return offset; -} - -DEF_KERNEL_EXECUTOR(_comparisons_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - vsi_size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; - int32_t i = 0; - int32_t operation = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &operation); - CHECK_STATUS_FAIL_GOTO(status, final ); - - - vsi_nn_kernel_tensor_attr_get_stride( attr[0], stride_size[0] ); - vsi_nn_kernel_tensor_attr_get_stride( attr[1], stride_size[1] ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); - - buffer[2] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); - memset( buffer[2], 0, out_elements * sizeof(float) ); - - for (i = 0; i < (int32_t)out_elements; i++) - { - vsi_ssize_t 
in0_offset = 0; - vsi_ssize_t in1_offset = 0; - float val1 = 0.f; - float val2 = 0.f; - vsi_bool data = 0; - - in0_offset = _expand_offset( i, attr[0]->shape->data, (vsi_size_t)attr[0]->shape->size, - stride_size[0], attr[2]->shape->data ); - in1_offset = _expand_offset( i, attr[1]->shape->data, (vsi_size_t)attr[1]->shape->size, - stride_size[1], attr[2]->shape->data ); - - val1 = buffer[0][in0_offset]; - val2 = buffer[1][in1_offset]; - - switch (operation) - { - case COMP_GREAT: - data = val1 > val2; - break; - case COMP_GREAT_EQUAL: - data = val1 >= val2; - break; - case COMP_LESS: - data = val1 < val2; - break; - case COMP_LESS_EQUAL: - data = val1 <= val2; - break; - case COMP_EQUAL: - data = val1 == val2; - break; - case COMP_NOT_EQUAL: - data = val1 != val2; - break; - default: - break; - } - buffer[2][i] = (float)data; - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], - buffer[2], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - if (attr[0]) - { - vsi_nn_kernel_tensor_attr_release( &attr[0] ); - attr[0] = NULL; - } - if (attr[1]) - { - vsi_nn_kernel_tensor_attr_release( &attr[1] ); - attr[1] = NULL; - } - if (attr[2]) - { - vsi_nn_kernel_tensor_attr_release( &attr[2] ); - attr[2] = NULL; - } - - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - buffer[i] = NULL; - } - } - return status; -} /* _comparisons_exec() */ - -static vx_param_description_t kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -#define INPUT_FUNC_OP (3) - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _comparisons_exec; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t operation = 0; - - operation = vsi_nn_kernel_param_get_int32( params, "operation" ); - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - backend_params[INPUT_FUNC_OP] = vsi_nn_kernel_scalar_create( - graph, I32, &operation ); - /* Pass parameters to node. 
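
The switch above is the operator's entire surface; results are staged as float 0.0/1.0 and only narrowed to the output type by `vsi_nn_kernel_tensor_write_from_float`. Reduced to a pure function (the enum constants here are illustrative stand-ins for the VSI_NN_RELATIONAL_OPS_* codes):

```c
typedef enum { COMP_GREAT, COMP_GREAT_EQUAL, COMP_LESS,
               COMP_LESS_EQUAL, COMP_NOT_EQUAL, COMP_EQUAL } comp_op_t;

/* returns 1.0f when the relation holds, else 0.0f */
static float compare_f32(comp_op_t op, float a, float b)
{
    switch (op)
    {
        case COMP_GREAT:       return a >  b ? 1.0f : 0.0f;
        case COMP_GREAT_EQUAL: return a >= b ? 1.0f : 0.0f;
        case COMP_LESS:        return a <  b ? 1.0f : 0.0f;
        case COMP_LESS_EQUAL:  return a <= b ? 1.0f : 0.0f;
        case COMP_EQUAL:       return a == b ? 1.0f : 0.0f;
        case COMP_NOT_EQUAL:   return a != b ? 1.0f : 0.0f;
        default:               return 0.0f;
    }
}
```
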
*/ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - - vsi_nn_kernel_scalar_release( &backend_params[INPUT_FUNC_OP] ); - } - else - { - status = VSI_FAILURE; - } - } - - return node; -} /* _setup() */ - - -__END_DECLS - -REGISTER_BACKEND_CPU( relational_ops, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/conv1d_ovxlib_cpu.c b/src/tim/vx/internal/src/kernel/cpu/conv1d_ovxlib_cpu.c deleted file mode 100644 index dd820df..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/conv1d_ovxlib_cpu.c +++ /dev/null @@ -1,262 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.conv1d_ovxlib") - -/* - * Kernel params - */ -static vx_param_description_t _conv1d_ovxlib_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _CONV1D_OVXLIB_PARAM_NUM _cnt_of_array( _conv1d_ovxlib_kernel_param_def ) -#define _IO_COUNT (4) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - int i = 0; - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_IO_COUNT] = { NULL }; - vsi_nn_kernel_tensor_attr_t* attr[_IO_COUNT] = { NULL }; - float* buffer[_IO_COUNT] = { NULL }; - int32_t stride = 0; - int32_t pad_front = 0; - int32_t pad_end = 0; - int32_t dilation = 0; - int32_t overflow_policy = 0; - int32_t rounding_policy = 0; - int32_t down_scale_size_rounding = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final ); - buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); - buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[3], "Create input buffer fail.", final ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &stride); - CHECK_STATUS_FAIL_GOTO(status, final); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &pad_front); - CHECK_STATUS_FAIL_GOTO(status, final); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &pad_end); - CHECK_STATUS_FAIL_GOTO(status, final); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &dilation); - CHECK_STATUS_FAIL_GOTO(status, final); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &overflow_policy); - CHECK_STATUS_FAIL_GOTO(status, final); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &rounding_policy); - CHECK_STATUS_FAIL_GOTO(status, final); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &down_scale_size_rounding); - CHECK_STATUS_FAIL_GOTO(status, final); - - { - vsi_ssize_t batch = attr[0]->shape->data[2]; - vsi_ssize_t input_channel = attr[0]->shape->data[1]; - 
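
Before the loop that follows, it helps to pin down how `output_height` relates to the seven scalars just read. A sketch under standard convolution arithmetic; `conv1d_out_len` is a hypothetical helper, not an ovxlib API, and `use_ceil` stands in for the `down_scale_size_rounding` choice:

```c
static int conv1d_out_len(int in_len, int kernel, int stride, int dilation,
                          int pad_front, int pad_end, int use_ceil)
{
    /* effective (dilated) kernel extent is (kernel - 1) * dilation + 1 */
    int span = in_len + pad_front + pad_end - ((kernel - 1) * dilation + 1);
    return (use_ceil ? (span + stride - 1) / stride : span / stride) + 1;
}
```
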
vsi_ssize_t input_height = attr[0]->shape->data[0]; - vsi_ssize_t kernel_size = attr[1]->shape->data[0]; - vsi_ssize_t output_channel = attr[1]->shape->data[2]; - vsi_ssize_t output_height = attr[3]->shape->data[0]; - vsi_ssize_t batch_index = 0; - vsi_ssize_t input_channel_index = 0; - vsi_ssize_t output_channel_index = 0; - vsi_ssize_t output_h_index = 0; - - for(batch_index = 0; batch_index < batch; batch_index++) - { - float* per_batch_input = buffer[0] + batch_index * input_channel * input_height; - float* per_batch_output = buffer[3] + batch_index * output_channel * output_height; - for(output_channel_index = 0; output_channel_index < output_channel; output_channel_index++) - { - float* filter = buffer[1] + output_channel_index * input_channel * kernel_size; - for(output_h_index = 0; output_h_index < output_height; output_h_index++) - { - float output_value = 0.; - float* current_value_ptr = per_batch_input + output_h_index * stride; - - for(input_channel_index = 0; input_channel_index < input_channel; input_channel_index++) - { - int k = 0; - int32_t index = 0; - for(k = 0; k < kernel_size; k++) - { - float w = *(filter + input_channel_index * kernel_size + k); - float v = *(current_value_ptr + input_channel_index * input_height + index); - - output_value += w * v; - index += dilation; - } - } - - if(buffer[2]) - { - output_value += buffer[2][output_channel_index]; - } - - *(per_batch_output + output_channel_index * output_height + output_h_index) = output_value; - } - } - } - status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], - buffer[3], batch * output_channel * output_height ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for( i = 0; i < _IO_COUNT; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - vsi_nn_kernel_tensor_attr_release( &attr[i] ); - } - - return status; -} /* _compute() */ - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _conv1d_ovxlib_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _conv1d_ovxlib_kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_CONV1D_OVXLIB_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - int j = 0; - - int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" ); - int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" ); - int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" ); - int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" ); - int32_t overflow_policy = vsi_nn_kernel_param_get_int32( params, "overflow_policy" ); - int32_t rounding_policy = vsi_nn_kernel_param_get_int32( params, "rounding_policy" ); - int32_t down_scale_size_rounding = vsi_nn_kernel_param_get_int32( params, "down_scale_size_rounding" ); - - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs 
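
A condensed, framework-free restatement of the accumulation loop above. Note the deleted code indexes the input at `output_h_index * stride` with no pad offset, so it is only correct for pad_front == pad_end == 0; the sketch keeps that behavior rather than silently fixing it. Layout: input [in_ch][in_len], weights [out_ch][in_ch][kernel], output [out_ch][out_len], one batch:

```c
static void conv1d_ref(const float *in, const float *w, const float *bias,
                       float *out, int in_ch, int in_len, int out_ch,
                       int out_len, int kernel, int stride, int dilation)
{
    for (int oc = 0; oc < out_ch; ++oc)
    {
        for (int oh = 0; oh < out_len; ++oh)
        {
            float acc = bias ? bias[oc] : 0.0f;
            for (int ic = 0; ic < in_ch; ++ic)
                for (int k = 0; k < kernel; ++k)
                    acc += w[(oc * in_ch + ic) * kernel + k]
                         * in[ic * in_len + oh * stride + k * dilation];
            out[oc * out_len + oh] = acc;
        }
    }
}
```
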
*/ - vsi_nn_kernel_node_pack_io( node_params, _CONV1D_OVXLIB_PARAM_NUM, - inputs, input_num, outputs, output_num ); - j = (int)(input_num + output_num); - node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &stride ); - node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &pad_front ); - node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &pad_end ); - node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &dilation ); - node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &overflow_policy ); - node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &rounding_policy ); - node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &down_scale_size_rounding ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _CONV1D_OVXLIB_PARAM_NUM ); - vsi_nn_kernel_scalar_release( &node_params[--j] ); - vsi_nn_kernel_scalar_release( &node_params[--j] ); - vsi_nn_kernel_scalar_release( &node_params[--j] ); - vsi_nn_kernel_scalar_release( &node_params[--j] ); - vsi_nn_kernel_scalar_release( &node_params[--j] ); - vsi_nn_kernel_scalar_release( &node_params[--j] ); - vsi_nn_kernel_scalar_release( &node_params[--j] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( conv1d_ovxlib, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/cpu_backend_conv2d_cpu.c b/src/tim/vx/internal/src/kernel/cpu/cpu_backend_conv2d_cpu.c deleted file mode 100644 index d273df6..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/cpu_backend_conv2d_cpu.c +++ /dev/null @@ -1,259 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" -#include "cpu_backend/npuref_interface.h" - -__BEGIN_DECLS - -typedef enum -{ - PARAM_INPUT = 0, - PARAM_KERNEL, - PARAM_BIAS, - PARAM_OUTPUT, - PARAM_STRIDE_0, - PARAM_STRIDE_1, - PARAM_PAD_0, - PARAM_PAD_1, - PARAM_PAD_2, - PARAM_PAD_3, - PARAM_DILATION_0, - PARAM_DILATION_1, - PARAM_MULTIPLIER, - PARAM_NUM -} param_index_e; -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (3) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.cpu_backend_conv2d") -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -/* - * Kernel params - */ -static vx_param_description_t _cpu_backend_conv2d_kernel_param_def[] = -{ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, -}; -#define _CPU_BACKEND_CONV2D_PARAM_NUM _cnt_of_array( _cpu_backend_conv2d_kernel_param_def ) - - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_attr_t * attr[_IO_NUM] = { NULL }; - int32_t strides[2]; - int32_t pad[4]; - int32_t dilation[2]; - void * buffer[_IO_NUM] = { NULL }; - int32_t i = 0; - vsi_nn_kernel_tensor_t tensors[_IO_NUM] = { NULL }; - size_t out_elements = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[PARAM_INPUT]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[PARAM_KERNEL]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[PARAM_BIAS]; - tensors[3] = (vsi_nn_kernel_tensor_t)param[PARAM_OUTPUT]; - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - if ( param[PARAM_BIAS] ) - { - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - } - attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); - CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); - - status = vsi_nn_kernel_scalar_read_int32( param[PARAM_STRIDE_0], &strides[0] ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( param[PARAM_STRIDE_0], &strides[1] ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_0], &pad[0] ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_1], &pad[1] ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_2], &pad[2] ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_2], &pad[3] ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( param[PARAM_DILATION_0], &dilation[0] ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( param[PARAM_DILATION_1], &dilation[1] ); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], 
FALSE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); - - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create kernel buffer fail.", final ); - if ( param[PARAM_BIAS] ) - { - buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], FALSE ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create bias buffer fail.", final ); - } - buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], FALSE ); - CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); - - npuref_interface_quant_conv2d(buffer[0], attr[0], - buffer[1], attr[1], buffer[2], - pad, strides, dilation, attr[3], buffer[3]); - - status = vsi_nn_kernel_tensor_write( tensors[3], attr[3], - buffer[3], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for ( i = 0; i < _IO_NUM; i ++ ) - { - if ( attr[i] ) - { - vsi_nn_kernel_tensor_attr_release( &attr[i] ); - } - if ( buffer[i] ) - { - free( buffer[i] ); - } - } - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _cpu_backend_conv2d_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _cpu_backend_conv2d_kernel_param_def ); - status = VSI_SUCCESS; - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_CPU_BACKEND_CONV2D_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - size_t size = 0; - int32_t* stride = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "stride", &size); - int32_t* pad = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "pad", &size); - int32_t* dilation = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "dilation", &size); - int32_t multiplier = vsi_nn_kernel_param_get_int32(params, "multiplier"); - - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _CPU_BACKEND_CONV2D_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &stride[0] ); - node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &stride[1] ); - node_params[6] = vsi_nn_kernel_scalar_create( graph, I32, &pad[0] ); - node_params[7] = vsi_nn_kernel_scalar_create( graph, I32, &pad[1] ); - node_params[8] = vsi_nn_kernel_scalar_create( graph, I32, &pad[2] ); - node_params[9] = vsi_nn_kernel_scalar_create( graph, I32, &pad[3] ); - node_params[10] = vsi_nn_kernel_scalar_create( graph, I32, &dilation[0] ); - node_params[11] = vsi_nn_kernel_scalar_create( graph, I32, &dilation[1] ); - node_params[12] = vsi_nn_kernel_scalar_create( graph, I32, &multiplier ); - /* Pass parameters to node. 
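
One thing worth flagging in the executor above: `strides[1]` is read from `param[PARAM_STRIDE_0]` and `pad[3]` from `param[PARAM_PAD_2]`, so an asymmetric stride or pad never reaches npuref (the deconv2d executor later in this diff repeats the same pattern). The reads were presumably meant to use the neighboring enum values:

```c
/* presumably the intended reads; only these two lines differ from the
 * deleted code, which reuses PARAM_STRIDE_0 and PARAM_PAD_2 */
status = vsi_nn_kernel_scalar_read_int32( param[PARAM_STRIDE_1], &strides[1] );
CHECK_STATUS_FAIL_GOTO( status, final );
status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_3], &pad[3] );
CHECK_STATUS_FAIL_GOTO( status, final );
```
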
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _CPU_BACKEND_CONV2D_PARAM_NUM ); - - vsi_nn_kernel_scalar_release( &node_params[4] ); - vsi_nn_kernel_scalar_release( &node_params[5] ); - vsi_nn_kernel_scalar_release( &node_params[6] ); - vsi_nn_kernel_scalar_release( &node_params[7] ); - vsi_nn_kernel_scalar_release( &node_params[8] ); - vsi_nn_kernel_scalar_release( &node_params[9] ); - vsi_nn_kernel_scalar_release( &node_params[10] ); - vsi_nn_kernel_scalar_release( &node_params[11] ); - vsi_nn_kernel_scalar_release( &node_params[12] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( cpu_backend_conv2d, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cpu/cpu_backend_deconv2d_cpu.c b/src/tim/vx/internal/src/kernel/cpu/cpu_backend_deconv2d_cpu.c deleted file mode 100644 index b1502a5..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/cpu_backend_deconv2d_cpu.c +++ /dev/null @@ -1,245 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" -#include "cpu_backend/npuref_interface.h" - -__BEGIN_DECLS - -typedef enum -{ - PARAM_INPUT = 0, - PARAM_KERNEL, - PARAM_BIAS, - PARAM_OUTPUT, - PARAM_STRIDE_0, - PARAM_STRIDE_1, - PARAM_PAD_0, - PARAM_PAD_1, - PARAM_PAD_2, - PARAM_PAD_3, - - PARAM_NUM -} param_index_e; -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (3) -#define _OUTPUT_NUM (1) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.cpu_backend_deconv2d") - - -/* - * Kernel params - */ -static vx_param_description_t _cpu_backend_deconv2d_kernel_param_def[] = -{ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, -}; -#define _CPU_BACKEND_DECONV2D_PARAM_NUM _cnt_of_array( _cpu_backend_deconv2d_kernel_param_def ) - - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_attr_t * attr[_IO_NUM] = { NULL }; - int32_t strides[2]; - int dilation[2] = {1, 1}; - int32_t pad[4]; - void * buffer[_IO_NUM] = { NULL }; - int32_t i = 0; - vsi_nn_kernel_tensor_t tensors[_IO_NUM] = { NULL }; - size_t out_elements = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[PARAM_INPUT]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[PARAM_KERNEL]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[PARAM_BIAS]; - tensors[3] = (vsi_nn_kernel_tensor_t)param[PARAM_OUTPUT]; - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - if ( param[PARAM_BIAS] ) - { - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - } - attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); - CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); - - status = vsi_nn_kernel_scalar_read_int32( param[PARAM_STRIDE_0], &strides[0] ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( param[PARAM_STRIDE_0], &strides[1] ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_0], &pad[0] ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_1], &pad[1] ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_2], &pad[2] ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_2], &pad[3] ); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], FALSE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); - - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create kernel buffer fail.", final ); - if ( param[PARAM_BIAS] ) - { - buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], FALSE ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create bias buffer 
fail.", final ); - } - buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], FALSE ); - CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); - - npuref_interface_quant_deconv2d(buffer[0], attr[0], - buffer[1], attr[1], buffer[2], - pad, strides, dilation, attr[3], buffer[3]); - - status = vsi_nn_kernel_tensor_write( tensors[3], attr[3], - buffer[3], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for ( i = 0; i < _IO_NUM; i ++ ) - { - if ( attr[i] ) - { - vsi_nn_kernel_tensor_attr_release( &attr[i] ); - } - if ( buffer[i] ) - { - free( buffer[i] ); - } - } - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _cpu_backend_deconv2d_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _cpu_backend_deconv2d_kernel_param_def ); - status = VSI_SUCCESS; - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_CPU_BACKEND_DECONV2D_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - size_t size = 0; - int32_t* stride = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "stride", &size); - int32_t* pad = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "pad", &size); - - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _CPU_BACKEND_DECONV2D_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &stride[0] ); - node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &stride[1] ); - node_params[6] = vsi_nn_kernel_scalar_create( graph, I32, &pad[0] ); - node_params[7] = vsi_nn_kernel_scalar_create( graph, I32, &pad[1] ); - node_params[8] = vsi_nn_kernel_scalar_create( graph, I32, &pad[2] ); - node_params[9] = vsi_nn_kernel_scalar_create( graph, I32, &pad[3] ); - - /* Pass parameters to node. 
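
The deconv executor pins `dilation = {1, 1}` before calling npuref, so stride, pad, and kernel extent fully determine the spatial shapes. For reference, the per-axis relation a transposed convolution with those parameters satisfies (a sketch, not an ovxlib helper; there is no output_padding term because the parameter list above carries none):

```c
/* inverse of the forward-conv relation
 * in = (out + pad_front + pad_end - kernel) / stride + 1 */
static int deconv2d_out_dim(int in, int stride, int pad_front, int pad_end,
                            int kernel)
{
    return (in - 1) * stride + kernel - pad_front - pad_end;
}
```
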
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _CPU_BACKEND_DECONV2D_PARAM_NUM ); - - vsi_nn_kernel_scalar_release( &node_params[4] ); - vsi_nn_kernel_scalar_release( &node_params[5] ); - vsi_nn_kernel_scalar_release( &node_params[6] ); - vsi_nn_kernel_scalar_release( &node_params[7] ); - vsi_nn_kernel_scalar_release( &node_params[8] ); - vsi_nn_kernel_scalar_release( &node_params[9] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( cpu_backend_deconv2d, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cpu/cumsum_cpu.c b/src/tim/vx/internal/src/kernel/cpu/cumsum_cpu.c deleted file mode 100644 index 29f333d..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/cumsum_cpu.c +++ /dev/null @@ -1,260 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2019 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "vsi_nn_error.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _CPU_ARG_NUM (3) -#define _CPU_INPUT_NUM (1) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.cumsum") - -DEF_KERNEL_EXECUTOR(_cumsum_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[2] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - int32_t i = 0; - int32_t axisSize = 1, innerSize = 1, outerSize = 1; - int32_t axis = 0, exclusive = 0, reverse = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &exclusive); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &reverse); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); - memset( buffer[1], 0, out_elements * sizeof(float) ); - - { - int32_t dims_num = (int32_t)attr[1]->shape->size; - int32_t inner = 0; - int32_t outer = 0; - - for(i = 0; i < axis; ++i) - { - innerSize *= (int32_t)attr[0]->shape->data[i]; - } - - axisSize = (int32_t)attr[0]->shape->data[i++]; - - for(; i < dims_num; ++i) - { - outerSize *= (int32_t)attr[0]->shape->data[i]; - } - - for ( outer = 0; outer < outerSize; ++outer) - { - for ( inner = 0; inner < innerSize; ++inner) - { - float sum = .0f; - - if (exclusive && reverse) - { - int32_t idx_out = (outer * axisSize + axisSize - 1) * innerSize + inner; - buffer[1][idx_out] = sum; - for (i = axisSize - 1; i > 0; i--) - { - int32_t idx = (outer * axisSize + i) * innerSize + inner; - float value = buffer[0][idx]; - idx_out = (outer * axisSize + i - 1) * innerSize + inner; - sum += value; - buffer[1][idx_out] = sum; - } - } - else if (exclusive) - { - int32_t idx_out = outer * axisSize * innerSize + inner; - buffer[1][idx_out] = sum; - for (i = 0; i < axisSize - 1; ++i) - { - int32_t idx = (outer * axisSize + i) * innerSize + inner; - float value = buffer[0][idx]; - idx_out = (outer * axisSize + i + 1) * innerSize + inner; - sum += value; - buffer[1][idx_out] = sum; - } - } - else if (reverse) - { - for (i = axisSize - 1; i >= 0; i--) - { - int32_t idx = (outer * axisSize + i) * innerSize + inner; - float value = buffer[0][idx]; - sum += value; - buffer[1][idx] = sum; - } - } - else - { - for (i = 0; i < axisSize; ++i) - { - // i * innerSize + inner + outer * innerSize * axisSize - int32_t idx = (outer * axisSize + i) * innerSize + inner; - float value = buffer[0][idx]; - sum += value; - buffer[1][idx] = sum; - } - } - } - } - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - 
buffer[1], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for ( i = 0; i < 2; i ++ ) - { - if ( buffer[i] ) - { - free( buffer[i] ); - } - } - for ( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - return status; -} /* _cumsum_exec() */ -/* - * Kernel params - */ -static vx_param_description_t _cumsum_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - // Add kererl parameters here -}; -#define _CUMSUM_PARAM_NUM _cnt_of_array( _cumsum_kernel_param_def ) - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _cumsum_exec; - kernel->info.parameters = _cumsum_kernel_param_def; - kernel->info.numParams = _CUMSUM_PARAM_NUM; - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( inputs, outputs, kernel ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - uint32_t index = 2; - int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); - int32_t exclusive = vsi_nn_kernel_param_get_int32( params, "exclusive" ); - int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); - - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &exclusive ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); - /* Pass parameters to node. 
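
The four branches above differ only in traversal direction and in whether the running sum is written before or after the add. A 1-D distillation with the inner/outer striding stripped away (illustrative, not an ovxlib function): for in = {1, 2, 3} it yields {1, 3, 6} inclusive, {0, 1, 3} exclusive, and {6, 5, 3} inclusive-reverse.

```c
static void cumsum_1d(const float *in, float *out, int n,
                      int exclusive, int reverse)
{
    int step  = reverse ? -1 : 1;
    int i     = reverse ? n - 1 : 0;
    float sum = 0.0f;
    for (int c = 0; c < n; ++c, i += step)
    {
        if (exclusive)
        {
            out[i] = sum;   /* write the running sum before adding */
            sum += in[i];
        }
        else
        {
            sum += in[i];   /* inclusive: add first, then write */
            out[i] = sum;
        }
    }
}
```
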
*/ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - CHECK_STATUS( status ); - vsi_nn_kernel_scalar_release( &backend_params[2] ); - vsi_nn_kernel_scalar_release( &backend_params[3] ); - vsi_nn_kernel_scalar_release( &backend_params[4] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( cumsum, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c deleted file mode 100644 index dea83c9..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c +++ /dev/null @@ -1,211 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2019 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "vsi_nn_error.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _CPU_ARG_NUM (1) -#define _CPU_INPUT_NUM (1) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.depth2space_crd") - -DEF_KERNEL_EXECUTOR(_depth2space_crd_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[2] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i = 0; - int32_t block_size = 1; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &block_size); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); - memset( buffer[1], 0, out_elements * sizeof(float) ); - - { - vsi_size_t output_batch = attr[1]->shape->size > 3 ? attr[1]->shape->data[3] : 1; - vsi_size_t output_depth = attr[1]->shape->data[2]; - vsi_size_t output_height = attr[1]->shape->data[1]; - vsi_size_t output_width = attr[1]->shape->data[0]; - vsi_size_t input_depth = attr[0]->shape->data[2]; - vsi_size_t input_height = attr[0]->shape->data[1]; - vsi_size_t input_width = attr[0]->shape->data[0]; - vsi_size_t batch = 0, out_h = 0, out_w = 0; - - for (batch = 0; batch < output_batch; ++ batch) - { - vsi_size_t output_batch_index = batch * output_height * output_width * output_depth; - vsi_size_t input_batch_index = batch * input_height * input_width * input_depth; - vsi_size_t out_d = 0; - vsi_size_t block_e2 = block_size * block_size; - - for (out_d = 0; out_d < output_depth; out_d ++) - { - for (out_h = 0; out_h < output_height; ++ out_h) - { - for (out_w = 0; out_w < output_width; out_w ++) - { - vsi_size_t in_w = out_w / block_size; - vsi_size_t in_h = out_h / block_size; - vsi_size_t in_d = (out_w % block_size) + (out_h % block_size) * block_size + out_d * block_e2; - - vsi_size_t in_index = in_w + in_h * input_width + in_d * input_width * input_height - + input_batch_index; - vsi_size_t out_index = out_w + out_h * output_width + out_d * output_height * output_width - + output_batch_index; - - buffer[1][out_index] = buffer[0][in_index]; - } - } - } - } - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - buffer[1], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - if( buffer[i] ) - { - free( buffer[i] ); - } - } - return status; -} /* _depth2space_crd_exec() */ -/* - * Kernel params - */ -static vx_param_description_t _depth2space_crd_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, 
VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - // Add kererl parameters here -}; -#define _DEPTH2SPACE_CRD_PARAM_NUM _cnt_of_array( _depth2space_crd_kernel_param_def ) - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _depth2space_crd_exec; - kernel->info.parameters = _depth2space_crd_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _depth2space_crd_kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - uint32_t index = 2; - int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); - - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - - backend_params[index] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - CHECK_STATUS( status ); - vsi_nn_kernel_scalar_release( &backend_params[2] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( depth2space_internal, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/depthwise_conv1d_cpu.c b/src/tim/vx/internal/src/kernel/cpu/depthwise_conv1d_cpu.c deleted file mode 100644 index e6c787b..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/depthwise_conv1d_cpu.c +++ /dev/null @@ -1,273 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
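
The index arithmetic in the depth2space loop above is the CRD ("column-row-depth") ordering: output location (w, h, d) reads input channel `(w % b) + (h % b) * b + d * b * b` at spatial position (w / b, h / b), where b is block_size. A standalone check of that mapping:

```c
#include <assert.h>

static void d2s_crd_src(int out_w, int out_h, int out_d, int b,
                        int *in_w, int *in_h, int *in_d)
{
    *in_w = out_w / b;
    *in_h = out_h / b;
    *in_d = (out_w % b) + (out_h % b) * b + out_d * b * b;
}

int main(void)
{
    int w, h, d;
    d2s_crd_src(3, 1, 0, 2, &w, &h, &d);  /* block_size 2 */
    assert(w == 1 && h == 0 && d == 3);   /* out (3,1,0) reads in channel 3 */
    return 0;
}
```
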
-* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" -#include "cpu_backend/npuref_interface.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. - */ -typedef enum -{ - PARAM_INPUT = 0, - PARAM_KERNEL, - PARAM_BIAS, - PARAM_OUTPUT, - PARAM_STRIDE, - PARAM_PAD_FRONT, - PARAM_PAD_END, - PARAM_DILATION, - PARAM_MULTIPLIER, - PARAM_NUM -} param_index_e; - -#define _INPUT_NUM (PARAM_NUM) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.depthwise_conv1d") -#define _IO_NUM (4) - -/* - * Kernel params - */ -static vx_param_description_t _depthwise_conv1d_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _DEPTHWISE_CONV1D_PARAM_NUM _cnt_of_array( _depthwise_conv1d_kernel_param_def ) - - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_attr_t * attr[_IO_NUM] = { NULL }; - int32_t stride; - int32_t pad_front; - int32_t pad_end; - int32_t dilation; - int32_t multiplier; - void * buffer[_IO_NUM] = { NULL }; - int32_t i = 0; - vsi_nn_kernel_tensor_t tensors[_IO_NUM] = { NULL }; - size_t out_elements = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[PARAM_INPUT]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[PARAM_KERNEL]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[PARAM_BIAS]; - tensors[3] = (vsi_nn_kernel_tensor_t)param[PARAM_OUTPUT]; - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - if( param[PARAM_BIAS] ) - { - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - } - attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); - CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); - - status = vsi_nn_kernel_scalar_read_int32( param[PARAM_STRIDE], &stride ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_FRONT], &pad_front ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_END], &pad_end ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( param[PARAM_DILATION], &dilation ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( param[PARAM_MULTIPLIER], &multiplier ); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = 
(float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], FALSE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); - - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create kernel buffer fail.", final ); - if( param[PARAM_BIAS] ) - { - buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], FALSE ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create bias buffer fail.", final ); - } - buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], FALSE ); - CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); - - - { - // Use conv2d compute - int32_t input_shape_4d[4] = {1,0,0,0}; - int32_t kernel_shape_4d[4] = {1,0,0,0}; - int32_t output_shape_4d[4] = {1,0,0,0}; - memcpy( &input_shape_4d[1], attr[0]->shape->data, 3 * sizeof(int32_t) ); - memcpy( &kernel_shape_4d[1], attr[1]->shape->data, 3 * sizeof(int32_t) ); - memcpy( &output_shape_4d[1], attr[3]->shape->data, 3 * sizeof(int32_t) ); - npuref_interface_quant_depthwise_conv2d( - buffer[0], buffer[1], buffer[2], - input_shape_4d, 4, - kernel_shape_4d, 4, - output_shape_4d, 4, - attr[0]->asymm.scale, attr[0]->asymm.zero_point, - attr[1]->asymm.scale, attr[1]->asymm.zero_point, - attr[3]->asymm.scale, attr[3]->asymm.zero_point, - pad_front, pad_end, 0, 0, - stride, 1, dilation, 1, - buffer[3] - ); - status = vsi_nn_kernel_tensor_write( tensors[3], attr[3], - buffer[3], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for( i = 0; i < _IO_NUM; i ++ ) - { - if( attr[i] ) - { - vsi_nn_kernel_tensor_attr_release( &attr[i] ); - } - if( buffer[i] ) - { - free( buffer[i] ); - } - } - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _depthwise_conv1d_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _depthwise_conv1d_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_DEPTHWISE_CONV1D_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" ); - int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" ); - int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" ); - int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" ); - int32_t multiplier = vsi_nn_kernel_param_get_int32( params, "multiplier" ); - - if(!( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 - && inputs[1]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 - && outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8)) - { - //TODO: Support other types - return NULL; - } - - if( !npuref_exists() ) - { - return NULL; - } - - status = _query_kernel( kernel, inputs, outputs ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - 
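
Everything handed to npuref above is raw quantized uint8 plus per-tensor `asymm.scale` / `asymm.zero_point` (the type guard earlier in `_setup` returns NULL for anything but uint8, as does the `npuref_exists()` probe). The affine relation those two fields encode, as a minimal sketch with illustrative helper names:

```c
#include <stdint.h>

/* real value represented by a quantized code */
static inline float dequant_u8(uint8_t q, float scale, int32_t zero_point)
{
    return scale * ((int32_t)q - zero_point);
}

/* nearest-code quantization with saturation to the uint8 range */
static inline uint8_t quant_u8(float v, float scale, int32_t zero_point)
{
    int32_t q = (int32_t)(v / scale + (v >= 0.0f ? 0.5f : -0.5f)) + zero_point;
    if (q < 0)   q = 0;
    if (q > 255) q = 255;
    return (uint8_t)q;
}
```
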
vsi_nn_kernel_node_pack_io( node_params, _DEPTHWISE_CONV1D_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[PARAM_STRIDE] = vsi_nn_kernel_scalar_create( graph, I32, &stride ); - node_params[PARAM_PAD_FRONT] = vsi_nn_kernel_scalar_create( graph, I32, &pad_front ); - node_params[PARAM_PAD_END] = vsi_nn_kernel_scalar_create( graph, I32, &pad_end ); - node_params[PARAM_DILATION] = vsi_nn_kernel_scalar_create( graph, I32, &dilation ); - node_params[PARAM_MULTIPLIER] = vsi_nn_kernel_scalar_create( graph, I32, &multiplier ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, - _DEPTHWISE_CONV1D_PARAM_NUM ); - vsi_nn_kernel_scalar_release( &node_params[PARAM_STRIDE] ); - vsi_nn_kernel_scalar_release( &node_params[PARAM_PAD_FRONT] ); - vsi_nn_kernel_scalar_release( &node_params[PARAM_PAD_END] ); - vsi_nn_kernel_scalar_release( &node_params[PARAM_DILATION] ); - vsi_nn_kernel_scalar_release( &node_params[PARAM_MULTIPLIER] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( depthwise_conv1d, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/detect_post_box_cpu.c b/src/tim/vx/internal/src/kernel/cpu/detect_post_box_cpu.c deleted file mode 100644 index 48de41c..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/detect_post_box_cpu.c +++ /dev/null @@ -1,252 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (2) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.detect_post_box") - - -/* - * Kernel params - */ -static vx_param_description_t _detect_post_box_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _DETECT_POST_BOX_PARAM_NUM _cnt_of_array( _detect_post_box_kernel_param_def ) - -#define SCALAR_SCALE_Y (3) -#define SCALAR_SCALE_X (4) -#define SCALAR_SCALE_H (5) -#define SCALAR_SCALE_W (6) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - vsi_size_t n, a, numBatches, numAnchors, lengthBoxEncoding; - uint32_t kRoiDim = 4; - float inv_scale_y = 0.0f; - float inv_scale_x = 0.0f; - float inv_scale_h = 0.0f; - float inv_scale_w = 0.0f; - - /* prepare data */ - for ( i = 0; i < _INPUT_NUM; i++ ) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for ( i = 0; i < _OUTPUT_NUM; i++ ) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCALE_Y], &(inv_scale_y)); - vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCALE_X], &(inv_scale_x)); - vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCALE_H], &(inv_scale_h)); - vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCALE_W], &(inv_scale_w)); - - numBatches = in_attr[0]->shape->data[2]; - numAnchors = in_attr[0]->shape->data[1]; - lengthBoxEncoding = in_attr[0]->shape->data[0]; - - for ( n = 0; n < numBatches; n++ ) - { - vsi_ssize_t batch_in_offset = n * numAnchors * lengthBoxEncoding; - vsi_ssize_t batch_out_offset = n * numAnchors * kRoiDim; - for ( a = 0; a < numAnchors; a++ ) - { - float yCtr = f32_in_buffer[1][a * kRoiDim] + f32_in_buffer[1][a * kRoiDim + 2] - * f32_in_buffer[0][batch_in_offset + a * lengthBoxEncoding] * inv_scale_y; - float 
xCtr = f32_in_buffer[1][a * kRoiDim + 1] + f32_in_buffer[1][a * kRoiDim + 3] - * f32_in_buffer[0][batch_in_offset + a * lengthBoxEncoding + 1] * inv_scale_x; - float hHalf = f32_in_buffer[1][a * kRoiDim + 2] * - (float)exp(f32_in_buffer[0][batch_in_offset + a * lengthBoxEncoding + 2] * inv_scale_h) * 0.5f; - float wHalf = f32_in_buffer[1][a * kRoiDim + 3] * - (float)exp(f32_in_buffer[0][batch_in_offset + a * lengthBoxEncoding + 3] * inv_scale_w) * 0.5f; - f32_out_buffer[0][batch_out_offset + a * kRoiDim] = yCtr - hHalf; - f32_out_buffer[0][batch_out_offset + a * kRoiDim + 1] = xCtr - wHalf; - f32_out_buffer[0][batch_out_offset + a * kRoiDim + 2] = yCtr + hHalf; - f32_out_buffer[0][batch_out_offset + a * kRoiDim + 3] = xCtr + wHalf; - } - } - - - /* save data */ - for ( i = 0; i < _OUTPUT_NUM; i++ ) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for ( i = 0; i < _INPUT_NUM; i++ ) - { - if ( f32_in_buffer[i] ) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if ( in_attr[i] ) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for ( i = 0; i < _OUTPUT_NUM; i++ ) - { - if ( f32_out_buffer[i] ) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if ( out_attr[i] ) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _detect_post_box_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _detect_post_box_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_DETECT_POST_BOX_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - float inv_scale_y = vsi_nn_kernel_param_get_float32( params, "inv_scale_y" ); - float inv_scale_x = vsi_nn_kernel_param_get_float32( params, "inv_scale_x" ); - float inv_scale_h = vsi_nn_kernel_param_get_float32( params, "inv_scale_h" ); - float inv_scale_w = vsi_nn_kernel_param_get_float32( params, "inv_scale_w" ); - - status = _query_kernel( kernel, inputs, outputs ); - if ( VSI_SUCCESS == status ) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _DETECT_POST_BOX_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_SCALE_Y] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_y ); - node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_x ); - node_params[SCALAR_SCALE_H] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_h ); - node_params[SCALAR_SCALE_W] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_w ); - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _DETECT_POST_BOX_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_Y] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_H] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_W] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( detect_post_box, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/detect_post_nms_cpu.c b/src/tim/vx/internal/src/kernel/cpu/detect_post_nms_cpu.c deleted file mode 100644 index 3092350..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/detect_post_nms_cpu.c +++ /dev/null @@ -1,523 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (2) -#define _OUTPUT_NUM (4) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.detect_post_nms") - - -/* - * Kernel params - */ -static vx_param_description_t _detect_post_nms_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _DETECT_POST_NMS_PARAM_NUM _cnt_of_array( _detect_post_nms_kernel_param_def ) - -#define SCALAR_NMS_TYPE (6) -#define SCALAR_MAX_NUM (7) -#define SCALAR_MAX_CLASS (8) -#define SCALAR_MAX_DETECT (9) -#define SCALAR_SCORE_TH (10) -#define SCALAR_IOU_TH (11) -#define SCALAR_IS_BG (12) - -static void _swap_element - ( - uint32_t* list, - uint32_t first, - uint32_t second - ) -{ - uint32_t temp = list[first]; - list[first] = list[second]; - list[second] = temp; -} - -static uint32_t _max_element - ( - float* data, - uint32_t* index_list, - uint32_t len - ) -{ - uint32_t i; - uint32_t max_index = 0; - float max_val = data[index_list[0]]; - for ( i = 1; i < len; i++ ) - { - float val = data[index_list[i]]; - if ( max_val < val ) - { - max_val = val; - max_index = i; - } - } - return max_index; -} - -static float _getIoUAxisAligned - ( - const float* roi1, - const float* roi2 - ) -{ - const float area1 = (roi1[2] - roi1[0]) * (roi1[3] - roi1[1]); - const float area2 = (roi2[2] - roi2[0]) * (roi2[3] - roi2[1]); - const float x1 = vsi_nn_max(roi1[0], roi2[0]); - const float x2 = vsi_nn_min(roi1[2], roi2[2]); - const float y1 = vsi_nn_max(roi1[1], roi2[1]); - const float y2 = vsi_nn_min(roi1[3], roi2[3]); - const float w = vsi_nn_max(x2 - x1, 0.0f); - const float h = vsi_nn_max(y2 - y1, 0.0f); - const float areaIntersect = w * h; - const float areaUnion = area1 + area2 - areaIntersect; - return areaIntersect / areaUnion; -} - -static uint32_t _max_comp_func - ( - void* data, - int32_t left, - int32_t right - ) -{ - float* fdata = (float*)data; - return fdata[left] >= fdata[right]; -} - -static void _sort_element_by_score - ( - float* data, - uint32_t* index_list, - uint32_t len - ) -{ - vsi_nn_partition(data, 0, len - 1, _max_comp_func, TRUE, index_list); -} - -static float _max_element_value - ( - float* data, - uint32_t len - ) -{ - uint32_t i; - float max_val = data[0]; - for ( i = 1; i < len; i++ ) - { - float val = data[i]; - if ( max_val < val ) - { - max_val = val; - } - } - return max_val; -} - -static void _iota - ( - int32_t * data, - uint32_t len, - int32_t value - ) -{ - uint32_t i; - for ( i = 0; i < len; i++ ) - { - data [i] = value; - value++; - } -} - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - 
float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i, j; - vsi_size_t n, a, c, b, numBatches, numAnchors, numClasses; - int32_t nms_type = 0; - int32_t max_num_detections = 0; - int32_t maximum_class_per_detection = 0; - int32_t maximum_detection_per_class = 0; - float score_threshold = 0.0f; - float iou_threshold = 0.0f; - int32_t is_bg_in_label = 0; - vsi_size_t numOutDetection = 0; - - /* prepare data */ - for ( i = 0; i < _INPUT_NUM; i++ ) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for ( i = 0; i < _OUTPUT_NUM; i++ ) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_NMS_TYPE], &(nms_type)); - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_MAX_NUM], &(max_num_detections)); - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_MAX_CLASS], &(maximum_class_per_detection)); - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_MAX_DETECT], &(maximum_detection_per_class)); - vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCORE_TH], &(score_threshold)); - vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_IOU_TH], &(iou_threshold)); - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_IS_BG], &(is_bg_in_label)); - - numBatches = in_attr[0]->shape->data[2]; - numAnchors = in_attr[0]->shape->data[1]; - numClasses = in_attr[0]->shape->data[0]; - numOutDetection = out_attr[0]->shape->data[0]; - - { - vsi_size_t scores_index = 0; - vsi_size_t scores_out_index = 0; - uint32_t kRoiDim = 4; - vsi_size_t roi_out_index = 0; - vsi_size_t class_out_index = 0; - uint32_t* select = (uint32_t*)malloc(numAnchors * numClasses * sizeof(uint32_t)); - float* maxScores = (float*)malloc(numAnchors * sizeof(float)); - uint32_t* scoreInds = (uint32_t*)malloc((numClasses - 1) * sizeof(uint32_t)); - - for ( n = 0; n < numBatches; n++ ) - { - float* roiBuffer = &(f32_in_buffer[1][n * numAnchors * kRoiDim]); - if (nms_type) - { - uint32_t select_size = 0; - uint32_t select_start = 0; - uint32_t select_len = 0; - uint32_t numDetections = 0; - for ( c = 1; c < numClasses; c++ ) - { - select_start = select_size; - for ( b = 0; b < numAnchors; b++ ) - { - const vsi_size_t index = b * numClasses + c; - float score = f32_in_buffer[0][scores_index + index]; - if (score > score_threshold) { - select[select_size] = (uint32_t)index; - select_size++; - } - } - select_len = select_size - select_start; - - if ( maximum_detection_per_class < 0 ) - { - 
maximum_detection_per_class = select_len; - } - numDetections = 0; - for ( j = 0; (j < select_len && numDetections < (uint32_t)maximum_detection_per_class); j++ ) - { - // find max score and swap to the front. - int32_t max_index = _max_element(&(f32_in_buffer[0][scores_index]), - &(select[select_start]), select_len); - _swap_element(&(select[select_start]), max_index, j); - - // Calculate IoU of the rest, swap to the end (discard) if needed. - for ( i = j + 1; i < select_len; i++ ) - { - vsi_ssize_t roiBase0 = (select[select_start + i] / numClasses) * kRoiDim; - vsi_ssize_t roiBase1 = (select[select_start + j] / numClasses) * kRoiDim; - float iou = _getIoUAxisAligned(&(roiBuffer[roiBase0]), - &(roiBuffer[roiBase1])); - - if ( iou >= iou_threshold ) - { - _swap_element(&(select[select_start]), i, select_len - 1); - i--; - select_len--; - } - } - numDetections++; - } - select_size = select_start + numDetections; - } - - select_len = select_size; - select_start = 0; - - // Take top maxNumDetections. - _sort_element_by_score(&(f32_in_buffer[0][scores_index]), - &(select[select_start]), select_len); - - for ( i = 0; i < select_len; i++ ) - { - uint32_t ind = select[i]; - f32_out_buffer[0][scores_out_index + i] = - f32_in_buffer[0][scores_index + ind]; - memcpy(&(f32_out_buffer[1][roi_out_index + i * kRoiDim]), - &roiBuffer[(ind / numClasses) * kRoiDim], kRoiDim * sizeof(float)); - f32_out_buffer[2][class_out_index + i] = (float)((ind % numClasses) - - (is_bg_in_label ? 0 : 1)); - } - f32_out_buffer[3][n] = (float)(select_len); - } - else - { - vsi_size_t numOutClasses = vsi_nn_min(numClasses - 1, (uint32_t)maximum_class_per_detection); - uint32_t select_size = 0; - uint32_t select_start = 0; - uint32_t select_len = 0; - uint32_t numDetections = 0; - for ( a = 0; a < numAnchors; a++ ) - { - // exclude background class: 0 - maxScores[a] = _max_element_value(&(f32_in_buffer[0] - [scores_index + a * numClasses + 1]), (uint32_t)(numClasses - 1)); - if (maxScores[a] > score_threshold) - { - select[select_size] = (uint32_t)a; - select_size++; - } - } - select_len = select_size - select_start; - - if ( max_num_detections < 0 ) - { - max_num_detections = select_len; - } - for ( j = 0; (j < select_len && numDetections < (uint32_t)max_num_detections); j++ ) - { - // find max score and swap to the front. - int32_t max_index = _max_element(maxScores, - &(select[select_start + j]), select_len - j); - _swap_element(&(select[select_start]), max_index + j, j); - - // Calculate IoU of the rest, swap to the end (discard) if needed. 
- for ( i = j + 1; i < select_len; i++ ) - { - int32_t roiBase0 = select[select_start + i] * kRoiDim; - int32_t roiBase1 = select[select_start + j] * kRoiDim; - float iou = _getIoUAxisAligned(&(roiBuffer[roiBase0]), - &(roiBuffer[roiBase1])); - if ( iou >= iou_threshold ) - { - _swap_element(&(select[select_start]), i, select_len - 1); - i--; - select_len--; - } - } - numDetections++; - } - select_size = select_start + numDetections; - select_len = select_size; - - for ( i = 0; i < select_len; i++ ) - { - _iota((int32_t*)scoreInds, (uint32_t)(numClasses - 1), 1); - _sort_element_by_score(&(f32_in_buffer[0][scores_index + select[i] * numClasses]), - scoreInds, (uint32_t)(numClasses - 1)); - for (c = 0; c < numOutClasses; c++) - { - f32_out_buffer[0][scores_out_index + i * numOutClasses + c] = - f32_in_buffer[0][scores_index + select[i] * numClasses + scoreInds[c]]; - memcpy(&(f32_out_buffer[1][roi_out_index + (i * numOutClasses + c) - * kRoiDim]), &roiBuffer[select[i] * kRoiDim], kRoiDim * sizeof(float)); - f32_out_buffer[2][class_out_index + i * numOutClasses + c] - = (float)(scoreInds[c] - (is_bg_in_label ? 0 : 1)); - } - } - f32_out_buffer[3][n] = (float)select_len; - } - scores_index += numAnchors * numClasses; - scores_out_index += numOutDetection; - roi_out_index += numOutDetection * kRoiDim; - class_out_index += numOutDetection; - } - - if (select) free(select); - if (maxScores) free(maxScores); - if (scoreInds) free(scoreInds); - } - /* save data */ - for ( i = 0; i < _OUTPUT_NUM; i++ ) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for ( i = 0; i < _OUTPUT_NUM; i++ ) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _detect_post_nms_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _detect_post_nms_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_DETECT_POST_NMS_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t nms_type = vsi_nn_kernel_param_get_int32( params, "nms_type" ); - int32_t max_num_detections = vsi_nn_kernel_param_get_int32( params, "max_num_detections" ); - int32_t maximum_class_per_detection = vsi_nn_kernel_param_get_int32( params, "maximum_class_per_detection" ); - int32_t maximum_detection_per_class = vsi_nn_kernel_param_get_int32( params, "maximum_detection_per_class" ); - float score_threshold = vsi_nn_kernel_param_get_float32( params, 
"score_threshold" ); - float iou_threshold = vsi_nn_kernel_param_get_float32( params, "iou_threshold" ); - int32_t is_bg_in_label = vsi_nn_kernel_param_get_int32( params, "is_bg_in_label" ); - - status = _query_kernel( kernel, inputs, outputs ); - if ( VSI_SUCCESS == status ) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _DETECT_POST_NMS_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_NMS_TYPE] = vsi_nn_kernel_scalar_create( graph, I32, &nms_type ); - node_params[SCALAR_MAX_NUM] = vsi_nn_kernel_scalar_create( graph, I32, &max_num_detections ); - node_params[SCALAR_MAX_CLASS] = vsi_nn_kernel_scalar_create( graph, I32, &maximum_class_per_detection ); - node_params[SCALAR_MAX_DETECT] = vsi_nn_kernel_scalar_create( graph, I32, &maximum_detection_per_class ); - node_params[SCALAR_SCORE_TH] = vsi_nn_kernel_scalar_create( graph, F32, &score_threshold ); - node_params[SCALAR_IOU_TH] = vsi_nn_kernel_scalar_create( graph, F32, &iou_threshold ); - node_params[SCALAR_IS_BG] = vsi_nn_kernel_scalar_create( graph, I32, &is_bg_in_label ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _DETECT_POST_NMS_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_NMS_TYPE] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_MAX_NUM] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_MAX_CLASS] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_MAX_DETECT] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCORE_TH] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_IOU_TH] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_IS_BG] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( detect_post_nms, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c deleted file mode 100644 index 061d5bc..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c +++ /dev/null @@ -1,393 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/** Unary Kernel internal type */ -typedef enum -{ - UNARY_SIN, - UNARY_COS, - UNARY_EXP, - UNARY_LOG, - UNARY_NEG, - UNARY_HSIGMOID, - UNARY_MISH, - UNARY_ROUND, - UNARY_GELU, - UNARY_HGELU, - UNARY_SELU, - UNARY_CELU, - UNARY_RCP, - UNARY_SIGN, - UNARY_SOFTSIGN, -} unary_type_e; - - -#define _CPU_ARG_NUM (3) -#define _CPU_INPUT_NUM (1) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("eltwise_unary_sw") - -static float exp_eval(float data) -{ - return expf(data); -} - -static float sin_eval(float data) -{ - return sinf(data); -} - -static float cos_eval(float data) -{ - return cosf(data); -} - -static float log_eval(float data) -{ - return logf(data); -} - -static float neg_eval(float data) -{ - return data * -1.0f; -} - -static float hsigmoid_eval(float data, float alpha, float beta) -{ - data = (float)(alpha * data + beta); - data = vsi_nn_clamp(data, 0, 1); - - return data; -} - -static float soft_plus_eval(float data) -{ - return log_eval(exp_eval(data) + 1); -} - -static float mish_eval(float data) -{ - data = (float)(data * tanh(soft_plus_eval(data))); - - return data; -} - -static float round_eval(float data) -{ - data = (float)(vsi_rtne(data)); - - return data; -} - -static float gelu_eval(float data) -{ - data = (float)(0.5f * data * (1 + vsi_nn_erf_impl(data / (float)sqrt(2.0f)))); - - return data; -} - -#define VSI_SQRT_2_RCP_PI 0.7978845834732056f -static float hgelu_eval(float data) -{ - float cdf = (float)(0.5f * (1.0f + tanh((VSI_SQRT_2_RCP_PI * - (data + 0.044715f * data * data * data))))); - - return data * cdf; -} - -static float selu_eval(float data, float alpha, float gamma) -{ - float y0 = alpha * gamma * expf(data) - alpha * gamma; - float y1 = gamma * data; - float y = data <= 0 ? y0 : y1; - - return y; -} - -static float celu_eval(float x, float alpha) -{ - float positive = vsi_nn_max(0, x); - float negative = vsi_nn_min(alpha * (expf(x / alpha) - 1), 0); - - return positive + negative; -} - -static float rcp_eval(float x) -{ - return 1 / x; -} - -static float sign_eval(float x) -{ - return x > 0 ? 1.0f : x < 0 ? 
-1.0f : 0; -} - -static float softsign_eval(float x) -{ - return x / (1.0f + vsi_abs(x)); -} - -DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - int32_t i; - float alpha = 0; - float beta = 0; - int32_t unary_type = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &unary_type); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[3], &alpha); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &beta); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); - buffer[1] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); - memset( buffer[1], 0, out_elements * sizeof(float) ); - - for ( i = 0; i < (int32_t)out_elements; ++i) - { - float data = buffer[0][i]; - - switch (unary_type) - { - case UNARY_SIN: - data = sin_eval(data); - break; - case UNARY_COS: - data = cos_eval(data); - break; - case UNARY_EXP: - data = exp_eval(data); - break; - case UNARY_LOG: - data = log_eval(data); - break; - case UNARY_NEG: - data = neg_eval(data); - break; - case UNARY_HSIGMOID: - data = hsigmoid_eval(data, alpha, beta); - break; - case UNARY_MISH: - data = mish_eval(data); - break; - case UNARY_ROUND: - data = round_eval(data); - break; - case UNARY_GELU: - data = gelu_eval(data); - break; - case UNARY_HGELU: - data = hgelu_eval(data); - break; - case UNARY_SELU: - data = selu_eval(data, alpha, beta); - break; - case UNARY_CELU: - data = celu_eval(data, alpha); - break; - case UNARY_RCP: - data = rcp_eval(data); - break; - case UNARY_SIGN: - data = sign_eval(data); - break; - case UNARY_SOFTSIGN: - data = softsign_eval(data); - break; - default: - break; - } - buffer[1][i] = (float)data; - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - buffer[1], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: -#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } - SAFE_FREE_TENSOR_ATTR(attr[0]); - SAFE_FREE_TENSOR_ATTR(attr[1]); -#undef SAFE_FREE_TENSOR_ATTR - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - buffer[i] = NULL; - } - } - return status; -} /* _eltwise_unary_exec() */ - -static vx_param_description_t kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, 
VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -#define INPUT_FUNC_TYPE (2) -#define INPUT_SCALAR_ALPHA (3) -#define INPUT_SCALAR_BETA (4) - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _eltwise_unary_exec; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel, - const unary_type_e unary_type - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); - float beta = vsi_nn_kernel_param_get_float32( params, "beta" ); - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - backend_params[INPUT_FUNC_TYPE] = vsi_nn_kernel_scalar_create( - graph, I32, &unary_type ); - backend_params[INPUT_SCALAR_ALPHA] = vsi_nn_kernel_scalar_create( - graph, F32, &alpha ); - backend_params[INPUT_SCALAR_BETA] = vsi_nn_kernel_scalar_create( - graph, F32, &beta ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - - vsi_nn_kernel_scalar_release( &backend_params[INPUT_FUNC_TYPE] ); - vsi_nn_kernel_scalar_release( &backend_params[INPUT_SCALAR_ALPHA] ); - vsi_nn_kernel_scalar_release( &backend_params[INPUT_SCALAR_BETA] ); - } - else - { - status = VSI_FAILURE; - } - } - - return node; -} /* _setup() */ - -#define REGISTER_ELTWISE_UNARY_BACKEND_CPU(KERNEL_NAME, UNARY_TYPE) \ - static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ - ( \ - vsi_nn_graph_t * graph, \ - vsi_nn_tensor_t ** inputs, \ - size_t input_num, \ - vsi_nn_tensor_t ** outputs, \ - size_t output_num, \ - const vsi_nn_kernel_param_t * params, \ - vsi_nn_kernel_t * kernel \ - ) \ - { \ - return _setup(graph, inputs, input_num, outputs, output_num, \ - params, kernel, UNARY_TYPE); \ - } \ - REGISTER_BACKEND_CPU( KERNEL_NAME, _##KERNEL_NAME##_setup ) - -__END_DECLS - -REGISTER_ELTWISE_UNARY_BACKEND_CPU( sin, UNARY_SIN ) -REGISTER_ELTWISE_UNARY_BACKEND_CPU( cos, UNARY_COS ) -REGISTER_ELTWISE_UNARY_BACKEND_CPU( exp, UNARY_EXP ) -REGISTER_ELTWISE_UNARY_BACKEND_CPU( log, UNARY_LOG ) -REGISTER_ELTWISE_UNARY_BACKEND_CPU( neg, UNARY_NEG ) -REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_sigmoid, UNARY_HSIGMOID ) -REGISTER_ELTWISE_UNARY_BACKEND_CPU( mish, UNARY_MISH ) -REGISTER_ELTWISE_UNARY_BACKEND_CPU( round, UNARY_ROUND ) -REGISTER_ELTWISE_UNARY_BACKEND_CPU( gelu, UNARY_GELU ) -REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_gelu, UNARY_HGELU ) -REGISTER_ELTWISE_UNARY_BACKEND_CPU( selu, UNARY_SELU ) -REGISTER_ELTWISE_UNARY_BACKEND_CPU( celu, UNARY_CELU ) -REGISTER_ELTWISE_UNARY_BACKEND_CPU( rcp, UNARY_RCP ) -REGISTER_ELTWISE_UNARY_BACKEND_CPU( sign, UNARY_SIGN ) -REGISTER_ELTWISE_UNARY_BACKEND_CPU( softsign, UNARY_SOFTSIGN ) \ No 
newline at end of file diff --git a/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c b/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c deleted file mode 100644 index cf427f7..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c +++ /dev/null @@ -1,203 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.erf") - - -/* - * Kernel params - */ -static vx_param_description_t _erf_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - // Add kernel parameters here -}; -#define _ERF_PARAM_NUM _cnt_of_array( _erf_kernel_param_def ) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - size_t i = 0; - - /* prepare data */ - for (i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - - for (i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - for (i = 0; i < out_elements[0]; i ++) - { - float x = vsi_nn_erf_impl(f32_in_buffer[0][i]); - f32_out_buffer[0][i] = x; - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for (i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _erf_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _erf_kernel_param_def ); - status = VSI_SUCCESS; - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const 
vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_ERF_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - - - status = _query_kernel( kernel, inputs, outputs); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _ERF_PARAM_NUM, - inputs, input_num, outputs, output_num ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _ERF_PARAM_NUM ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( erf, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/extra_ending_cpu.c b/src/tim/vx/internal/src/kernel/cpu/extra_ending_cpu.c deleted file mode 100644 index 371aead..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/extra_ending_cpu.c +++ /dev/null @@ -1,187 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (2) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.extra_ending") - - -/* - * Kernel params - */ -static vx_param_description_t _extra_ending_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - // Add kernel parameters here -}; -#define _EXTRA_ENDING_PARAM_NUM _cnt_of_array( _extra_ending_kernel_param_def ) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - uint8_t *u8_in_buffer[_INPUT_NUM] = {NULL}; - uint8_t *u8_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - int32_t i = 0; - - /* prepare data */ - input[1] = (vsi_nn_kernel_tensor_t)param[1]; - in_attr[1] = vsi_nn_kernel_tensor_attr_create( input[1] ); - u8_in_buffer[1] = (uint8_t*)vsi_nn_kernel_tensor_create_buffer( input[1], in_attr[1], FALSE ); - CHECK_PTR_FAIL_GOTO( u8_in_buffer[1], "Create input buffer fail.", final ); - - for (i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(uint8_t); - u8_out_buffer[i] = (uint8_t *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( u8_out_buffer[i], "Create output buffer fail.", final ); - memset( u8_out_buffer[i], 0, out_bytes[i] ); - } - - memcpy(u8_out_buffer[0], u8_in_buffer[1], out_bytes[0]); - - for (i = 0; i < _OUTPUT_NUM; i ++) - { - status = vsi_nn_kernel_tensor_write( output[i], out_attr[i], - u8_out_buffer[i], out_bytes[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } -final: - for (i = 0; i < _INPUT_NUM; i++) - { - vsi_nn_safe_free(u8_in_buffer[i]); - - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for (i = 0; i < _OUTPUT_NUM; i++) - { - vsi_nn_safe_free(u8_out_buffer[i]); - - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_SUCCESS; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _extra_ending_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _extra_ending_kernel_param_def ); - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_EXTRA_ENDING_PARAM_NUM] = 
{NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( kernel, inputs, outputs ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _EXTRA_ENDING_PARAM_NUM, - inputs, input_num, outputs, output_num ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _EXTRA_ENDING_PARAM_NUM ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( extra_ending, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/floordiv_cpu.c b/src/tim/vx/internal/src/kernel/cpu/floordiv_cpu.c deleted file mode 100644 index 99ca050..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/floordiv_cpu.c +++ /dev/null @@ -1,236 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (2) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.floordiv") - - -/* - * Kernel params - */ -static vx_param_description_t _floordiv_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _FLOORDIV_PARAM_NUM _cnt_of_array( _floordiv_kernel_param_def ) - -static vsi_ssize_t _expand_offset - ( - vsi_ssize_t index, - vsi_size_t * shape, vsi_size_t rank, - vsi_size_t * strides, vsi_size_t * out_shape - ) -{ - vsi_size_t i; - vsi_ssize_t offset = 0; - - for( i = 0; i < rank && index; i ++ ) - { - if( shape[i] == out_shape[i] ) - { - offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); - } - index /= out_shape[i]; - } - return offset; -} - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - - /* prepare data */ - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - for (i = 0; i < out_elements[0]; i++) - { - vsi_ssize_t in0_offset = 0; - vsi_ssize_t in1_offset = 0; - float in0 = 0; - float in1 = 0; - - in0_offset = _expand_offset( i, in_attr[0]->shape->data, (vsi_size_t)in_attr[0]->shape->size, - in_stride_size[0], out_attr[0]->shape->data ); - in1_offset = _expand_offset( i, in_attr[1]->shape->data, (vsi_size_t)in_attr[1]->shape->size, - in_stride_size[1], out_attr[0]->shape->data ); - in0 = f32_in_buffer[0][in0_offset]; - in1 = f32_in_buffer[1][in1_offset]; - f32_out_buffer[0][i] = (float)floor(in0 / in1); - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - - if (in_attr[i]) - { - 
vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _floordiv_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _floordiv_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_FLOORDIV_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( kernel, inputs, outputs); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _FLOORDIV_PARAM_NUM, - inputs, input_num, outputs, output_num ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _FLOORDIV_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( floordiv, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c b/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c deleted file mode 100644 index aa02a41..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c +++ /dev/null @@ -1,251 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2019 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "vsi_nn_error.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. - */ -#define _CPU_ARG_NUM (4) -#define _CPU_INPUT_NUM (2) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.gather") - -DEF_KERNEL_EXECUTOR(_gather_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[2] = { NULL }; - uint32_t* buffer_idx = NULL; - size_t in_elements = 0, out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - vsi_size_t i = 0, j = 0, b = 0; - int32_t block_size = 1, block_num = 1, axis_num = 0, batch_dims = 0; - vsi_size_t indices_num = 1, batch = 1, in_stride = 1, out_stride = 1; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - - in_elements = vsi_nn_kernel_tensor_attr_get_size( attr[0] ); - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &block_size); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis_num); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &batch_dims); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer_idx = (uint32_t*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE ); - CHECK_PTR_FAIL_GOTO( buffer_idx, "Create input1 buffer fail.", final ); - - buffer[1] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); - memset( buffer[1], 0, out_elements * sizeof(float) ); - - { - for (i = 0; i < attr[1]->shape->size - (vsi_size_t)batch_dims; i++) - { - indices_num *= attr[1]->shape->data[i]; - } - - for (; i < attr[1]->shape->size; i++) - { - batch *= attr[1]->shape->data[i]; - } - - for (i = 0; i < attr[0]->shape->size - (vsi_size_t)batch_dims; i++) - { - in_stride *= attr[0]->shape->data[i]; - } - - for (i = 0; i < attr[2]->shape->size - (vsi_size_t)batch_dims; i++) - { - out_stride *= attr[2]->shape->data[i]; - } - - for (b = 0; b < batch; b++) - { - for (i = 0; i < (vsi_size_t)block_num; i++) - { - for (j = 0; j < 
indices_num; j++) - { - uint32_t indice = buffer_idx[j + indices_num * b]; - vsi_size_t in_index = (i * axis_num + indice) * block_size + b * in_stride; - if (in_index < in_elements) - { - vsi_size_t out_index = (i * indices_num + j) * block_size + b * out_stride; - memcpy(&(buffer[1][out_index]), &(buffer[0][in_index]), block_size * sizeof(float)); - } - else - { - status = VX_FAILURE; - CHECK_STATUS_FAIL_GOTO( status, final ); - } - } - } - } - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], - buffer[1], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - if ( buffer_idx ) - { - free( buffer_idx ); - } - for ( i = 0; i < 2; i ++ ) - { - if ( buffer[i] ) - { - free( buffer[i] ); - } - } - for ( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - return status; -} /* _gather_exec() */ -/* - * Kernel params - */ -static vx_param_description_t _gather_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - // Add kernel parameters here -}; -#define _GATHER_PARAM_NUM _cnt_of_array( _gather_kernel_param_def ) - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _gather_exec; - kernel->info.parameters = _gather_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _gather_kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( inputs, outputs, kernel ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - uint32_t index = 3; - int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); - int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" ); - int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" ); - int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" ); - - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &batch_dims ); - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - CHECK_STATUS( status ); - vsi_nn_kernel_scalar_release( &backend_params[3] ); - vsi_nn_kernel_scalar_release( &backend_params[4] ); - vsi_nn_kernel_scalar_release( &backend_params[5] ); - vsi_nn_kernel_scalar_release( &backend_params[6] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( gather, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_elements_cpu.c b/src/tim/vx/internal/src/kernel/cpu/gather_elements_cpu.c deleted file mode 100644 index 65778e5..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/gather_elements_cpu.c +++ /dev/null @@ -1,228 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _ARG_NUM (1) -#define _INPUT_NUM (2) -#define _OUTPUT_NUM (1) -#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _CPU_PARAM_NUM (_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.gather_elements") - - -/* - * Kernel params - */ -static vx_param_description_t _gather_elements_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - // Add kernel parameters here -}; -#define _GATHER_ELEMENTS_PARAM_NUM _cnt_of_array( _gather_elements_kernel_param_def ) - - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[2] = { NULL }; - int32_t* buffer_idx = NULL; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - vsi_size_t a = 0; - vsi_size_t o = 0; - vsi_size_t i = 0; - vsi_size_t outer_size[2] = {1, 1}; - vsi_size_t inner_size[2] = {1, 1}; - vsi_size_t axis_size[2] = {1, 1}; - int32_t axis = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer_idx = (int32_t*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE ); - CHECK_PTR_FAIL_GOTO( buffer_idx, "Create input1 buffer fail.", final ); - - buffer[1] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); - memset( buffer[1], 0, out_elements * sizeof(float) ); - - axis_size[0] = attr[0]->shape->data[axis]; - axis_size[1] = attr[2]->shape->data[axis]; - for (i = 0; i < (vsi_size_t)axis; ++i) - { - inner_size[0] *= attr[0]->shape->data[i]; - inner_size[1] *= attr[2]->shape->data[i]; - } - - for (i = axis + 1; i < attr[2]->shape->size; ++i) - { - outer_size[0] *= attr[0]->shape->data[i]; - outer_size[1] *= attr[2]->shape->data[i]; - } - - for (o = 0; o < outer_size[1]; o++) - { - for (a = 0; a < axis_size[1]; a++) - { - for (i = 0; i < inner_size[1]; i++) - { - vsi_ssize_t index = 0; - vsi_size_t index0 = (o * axis_size[1] + a) * inner_size[1] + i; - vsi_size_t index1 = 1; - - index = (vsi_ssize_t)buffer_idx[index0]; - index = index < 0 ? 
index + (vsi_ssize_t)axis_size[0] : index; - index1 = (o * axis_size[0] + index) * inner_size[0] + i; - - buffer[1][index0] = buffer[0][index1]; - } - } - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], - buffer[1], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); -final: - if ( buffer_idx ) - { - free( buffer_idx ); - } - for ( i = 0; i < 2; i ++ ) - { - if ( buffer[i] ) - { - free( buffer[i] ); - } - } - for ( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_SUCCESS; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _gather_elements_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _gather_elements_kernel_param_def ); - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_GATHER_ELEMENTS_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); - - status = _query_kernel( kernel, inputs, outputs ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _GATHER_ELEMENTS_PARAM_NUM, - inputs, input_num, outputs, output_num ); - /* Pass parameters to node. */ - node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &axis ); - status = vsi_nn_kernel_node_pass_param( node, node_params, _GATHER_ELEMENTS_PARAM_NUM ); - vsi_nn_kernel_scalar_release( &node_params[3] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( gather_elements, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c b/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c deleted file mode 100644 index d57cfd4..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c +++ /dev/null @@ -1,229 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2019 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "vsi_nn_error.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. - */ -#define _CPU_ARG_NUM (2) -#define _CPU_INPUT_NUM (2) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.gather_nd") - -DEF_KERNEL_EXECUTOR(_gather_nd_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[2] = { NULL }; - uint32_t* buffer_idx = NULL; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - int32_t i = 0; - int32_t block_size = 1; - vsi_ssize_t indices_num = 1; - int32_t coord_stride = 1; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &(block_size)); - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &(coord_stride)); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer_idx = (uint32_t*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE ); - CHECK_PTR_FAIL_GOTO( buffer_idx, "Create input1 buffer fail.", final ); - - buffer[1] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); - memset( buffer[1], 0, out_elements * sizeof(float) ); - - // index number - for(i = 0; i < (int32_t)attr[1]->shape->size; ++i) - { - indices_num *= attr[1]->shape->data[i]; - } - indices_num /= coord_stride; - - if(coord_stride <= 4) // reshape 3D - { - vsi_ssize_t stride[4] = {block_size, 0, 0, 0}; - int32_t start_dim = (int32_t)attr[0]->shape->size - coord_stride; - for(i = 1; i < coord_stride; ++i) - { - stride[i] = stride[i - 1] * attr[0]->shape->data[start_dim + i - 1]; - } - - for(i = 0; i < indices_num; i++) - { - vsi_size_t out_index = i * block_size; - uint32_t coord[4] = {0}; - vsi_size_t in_index = 0; - int32_t j = 0; - - for(j = 0; j < coord_stride; j++) - { - coord[j] = buffer_idx[i * coord_stride + j]; - in_index += coord[j] * stride[j]; - } - memcpy(&(buffer[1][out_index]), &(buffer[0][in_index]), block_size * 
sizeof(float)); - } - } - else - { - status = VSI_FAILURE; - CHECK_STATUS_FAIL_GOTO( status, final ); - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], - buffer[1], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - if( buffer_idx ) - { - free( buffer_idx ); - } - for( i = 0; i < 2; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - } - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - return status; -} /* _gather_nd_exec() */ -/* - * Kernel params - */ -static vx_param_description_t _gather_nd_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - // Add kernel parameters here -}; -#define _GATHER_ND_PARAM_NUM _cnt_of_array( _gather_nd_kernel_param_def ) - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _gather_nd_exec; - kernel->info.parameters = _gather_nd_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _gather_nd_kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); - int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - uint32_t index = 3; - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); - - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - CHECK_STATUS( status ); - vsi_nn_kernel_scalar_release( &backend_params[3] ); - vsi_nn_kernel_scalar_release( &backend_params[4] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( gather_nd, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/generate_proposals_cpu.c b/src/tim/vx/internal/src/kernel/cpu/generate_proposals_cpu.c deleted file mode 100644 index 86e0c7e..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/generate_proposals_cpu.c +++ /dev/null @@ -1,504 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (4) -#define _OUTPUT_NUM (3) - #define _TENSOR_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.generate_proposals") - - -typedef struct vsi_nn_box_encoding_corner_t -{ - float x1, y1, x2, y2; -}vsi_nn_box_encoding_corner; - -typedef struct vsi_nn_box_encoding_center_t -{ - float w, h, x, y; -}vsi_nn_box_encoding_center; -/* - * Kernel params - */ -static vx_param_description_t _generate_proposals_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _GENERATE_PROPOSALS_PARAM_NUM _cnt_of_array( _generate_proposals_kernel_param_def ) - - -static void _to_box_encoding_corner - ( - vsi_nn_box_encoding_center* ctr, - vsi_nn_box_encoding_corner* cnr - ) -{ - cnr->x1 = ctr->x - ctr->w / 2; - cnr->y1 = ctr->y - ctr->h / 2; - cnr->x2 = ctr->x + ctr->w / 2; - cnr->y2 = ctr->y + ctr->h / 2; -} - -static void _to_box_encoding_center - ( - vsi_nn_box_encoding_corner* cnr, - vsi_nn_box_encoding_center* ctr - ) -{ - ctr->w = cnr->x2 - cnr->x1; - ctr->h = cnr->y2 - cnr->y1; - ctr->x = (cnr->x1 + cnr->x2) / 2; - ctr->y = (cnr->y1 + cnr->y2) / 2; -} - -static void _iota - ( - int32_t * data, - uint32_t len, - int32_t value - ) -{ - uint32_t i; - for (i = 0; i < len; i++) - { - data [i] = value; - value++; - } -} - -// swap_element is implemented in vsi_nn_kernel_box_with_nms_limit.c -void swap_element - ( - uint32_t* list, - uint32_t first, - uint32_t second - ); - -// max_element is implemented in vsi_nn_kernel_box_with_nms_limit.c -uint32_t max_element - ( - float* data, - uint32_t* index_list, - uint32_t len - ); - -// getIoUAxisAligned is implemented in vsi_nn_kernel_box_with_nms_limit.c -float getIoUAxisAligned - ( - const float* roi1, - const float* roi2 - ); - -// sort_element_by_score is implemented in vsi_nn_kernel_box_with_nms_limit.c -void sort_element_by_score - ( - float* data, - uint32_t* index_list, - uint32_t len - ); - -void _filter_boxes - ( - const float* roiBase, - const float* imageInfoBase, - float minSize, - uint32_t* select, - uint32_t* len - ) -{ - const uint32_t kRoiDim = 4; - uint32_t i = 0; - uint32_t j = 0; - - for (j = 0; j < *len; j++) - { - const float* roiInfo = roiBase + select[j] * kRoiDim; - float roiWidth, roiHeight, xRoiCenter, yRoiCenter; - roiWidth = roiInfo[2] - roiInfo[0]; - roiHeight = roiInfo[3] - roiInfo[1]; - xRoiCenter = roiInfo[0] + roiWidth / 2.0f; - yRoiCenter = roiInfo[1] + roiHeight / 2.0f; - if (roiWidth > minSize && roiHeight > minSize && xRoiCenter < imageInfoBase[1] - && yRoiCenter < imageInfoBase[0]) - { - select[i] = select[j]; - i++; - } - } - *len = i; -} - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - 
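/* I/O layout as inferred from the indexing in this executor (an annotation, hedged reading of the code rather than documented API): input[0] scores [numAnchors, width, height, numBatches]; input[1] bbox deltas [numAnchors * kRoiDim, width, height, numBatches]; input[2] anchors [kRoiDim, numAnchors]; input[3] image info [imageInfoLength, numBatches] with {height, width, ...} per batch; output[0] scores, output[1] rois as {x1, y1, x2, y2}, output[2] batch index. */ -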
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - float heightStride; - float widthStride; - int32_t preNmsTopN; - int32_t postNmsTopN; - float iouThreshold; - float minSize; - - /* prepare data */ - for (i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for (i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - status = vsi_nn_kernel_scalar_read_float32( param[_TENSOR_NUM], &heightStride ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32( param[_TENSOR_NUM + 1], &widthStride ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( param[_TENSOR_NUM + 2], &preNmsTopN ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( param[_TENSOR_NUM + 3], &postNmsTopN ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32( param[_TENSOR_NUM + 4], &iouThreshold ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32( param[_TENSOR_NUM + 5], &minSize ); - CHECK_STATUS_FAIL_GOTO(status, final ); - - { - uint32_t h, w, a, b, j; - const uint32_t kRoiDim = 4; - vsi_size_t numBatches = in_attr[0]->shape->data[3]; - vsi_size_t height = in_attr[0]->shape->data[2]; - vsi_size_t width = in_attr[0]->shape->data[1]; - vsi_size_t numAnchors = in_attr[0]->shape->data[0]; - vsi_size_t imageInfoLength = in_attr[3]->shape->data[0]; - - vsi_size_t batchSize = height * width * numAnchors; - vsi_size_t roiBufferSize = batchSize * kRoiDim; - - float * roiBuffer = (float*)malloc(roiBufferSize * sizeof(float)); - float * roiTransformedBuffer = (float*)malloc(roiBufferSize * sizeof(float)); - uint32_t* select = (uint32_t*)malloc(batchSize * sizeof(uint32_t)); - uint32_t index = 0; - vsi_size_t scores_index = 0; - vsi_size_t bboxDeltas_index = 0; - vsi_size_t imageInfo_index = 0; - uint32_t scores_out_index = 0; - uint32_t roi_out_index = 0; - - // Compute the roi region for each anchor. 
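- // A sketch of the mapping implemented by the loop below: for every feature-map cell (w, h), each base anchor is translated by the stride, - //   roi[h][w][a] = anchors[a] + (w * widthStride, h * heightStride, w * widthStride, h * heightStride), - // yielding batchSize = height * width * numAnchors candidate boxes per image.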
- for(h = 0; h < height; h++) - { - float hShift = h * heightStride; - for(w = 0; w < width; w++) - { - float wShift = w * widthStride; - uint32_t anchor_index = 0; - for(a = 0; a < numAnchors; a++) - { - roiBuffer[index] = f32_in_buffer[2][anchor_index] + wShift; - roiBuffer[index + 1] = f32_in_buffer[2][anchor_index + 1] + hShift; - roiBuffer[index + 2] = f32_in_buffer[2][anchor_index + 2] + wShift; - roiBuffer[index + 3] = f32_in_buffer[2][anchor_index + 3] + hShift; - - index += kRoiDim; - anchor_index += kRoiDim; - } - } - } - - for (b = 0; b < numBatches; b++) - { - const uint32_t roiLength = 4; - - vsi_size_t numRois = batchSize; - vsi_size_t roiIndex; - uint32_t select_len; - int32_t numDetections = 0; - for (roiIndex = 0; roiIndex < numRois; roiIndex++) - { - float imageHeight = f32_in_buffer[3][imageInfo_index]; - float imageWidth = f32_in_buffer[3][imageInfo_index + 1]; - vsi_nn_box_encoding_corner roi_cnr; - vsi_nn_box_encoding_center roiBefore; - roi_cnr.x1 = roiBuffer[roiIndex * roiLength]; - roi_cnr.y1 = roiBuffer[roiIndex * roiLength + 1]; - roi_cnr.x2 = roiBuffer[roiIndex * roiLength + 2]; - roi_cnr.y2 = roiBuffer[roiIndex * roiLength + 3]; - _to_box_encoding_center(&roi_cnr, &roiBefore); - { - vsi_nn_box_encoding_center roi_ctr; - vsi_nn_box_encoding_corner roiAfter; - vsi_nn_box_encoding_corner cliped; - vsi_size_t idx = bboxDeltas_index + roiIndex * roiLength; - roi_ctr.w = (float)(exp(f32_in_buffer[1][idx + 2]) * roiBefore.w); - roi_ctr.h = (float)(exp(f32_in_buffer[1][idx + 3]) * roiBefore.h); - roi_ctr.x = roiBefore.x + f32_in_buffer[1][idx] * roiBefore.w; - roi_ctr.y = roiBefore.y + f32_in_buffer[1][idx + 1] * roiBefore.h; - _to_box_encoding_corner(&roi_ctr, &roiAfter); - cliped.x1 = vsi_nn_min(vsi_nn_max(roiAfter.x1, 0.0f), imageWidth); - cliped.y1 = vsi_nn_min(vsi_nn_max(roiAfter.y1, 0.0f), imageHeight); - cliped.x2 = vsi_nn_min(vsi_nn_max(roiAfter.x2, 0.0f), imageWidth); - cliped.y2 = vsi_nn_min(vsi_nn_max(roiAfter.y2, 0.0f), imageHeight); - roiTransformedBuffer[idx] = cliped.x1; - roiTransformedBuffer[idx + 1] = cliped.y1; - roiTransformedBuffer[idx + 2] = cliped.x2; - roiTransformedBuffer[idx + 3] = cliped.y2; - } - } - - // Find the top preNmsTopN scores. - _iota((int32_t*)select, (uint32_t)batchSize, 0); - select_len = (uint32_t)batchSize; - if(preNmsTopN > 0 && preNmsTopN < (int32_t)batchSize) - { - sort_element_by_score(&(f32_in_buffer[0][scores_index]), - select, (uint32_t)batchSize); - select_len = preNmsTopN; - } - - // Filter boxes, discard regions with height or width < minSize. - _filter_boxes(roiTransformedBuffer, &(f32_in_buffer[3][0]), - minSize, select, &select_len); - - // Apply hard NMS. - if (postNmsTopN < 0) - { - postNmsTopN = select_len; - } - - for (j = 0; (j < select_len && numDetections < postNmsTopN); j++) - { - // find max score and swap to the front. - int32_t max_index = max_element(&(f32_in_buffer[0][scores_index]), - &(select[j]), select_len - j) + j; - swap_element(select, max_index, j); - - // Calculate IoU of the rest, swap to the end (discard) if needed. 
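- // Worked example for the axis-aligned IoU used here: boxes {0,0,2,2} and {1,1,3,3} intersect in a 1 x 1 patch, so IoU = 1 / (4 + 4 - 1) ~= 0.143; - // any candidate whose IoU with the kept box reaches iouThreshold is swapped behind select_len and dropped from further consideration.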
- for (i = j + 1; i < select_len; i++) - { - int32_t roiBase0 = select[i] * kRoiDim; - int32_t roiBase1 = select[j] * kRoiDim; - float iou = getIoUAxisAligned(&(roiTransformedBuffer[roiBase0]), - &(roiTransformedBuffer[roiBase1])); - - if (iou >= iouThreshold) - { - swap_element(select, i, select_len - 1); - i--; - select_len--; - } - } - numDetections++; - } - - for (i = 0; i < select_len; i++) - { - memcpy(&(f32_out_buffer[1][roi_out_index]), - &(roiTransformedBuffer[select[i] * kRoiDim]), kRoiDim * sizeof(float)); - f32_out_buffer[0][scores_out_index] = - f32_in_buffer[0][scores_index + select[i]]; - f32_out_buffer[2][scores_out_index] = (float)b; - scores_out_index++; - roi_out_index += kRoiDim; - } - - scores_index += batchSize; - bboxDeltas_index += roiBufferSize; - imageInfo_index += imageInfoLength; - } - - vsi_nn_safe_free(roiBuffer); - vsi_nn_safe_free(roiTransformedBuffer); - vsi_nn_safe_free(select); - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _generate_proposals_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _generate_proposals_kernel_param_def ); - status = VSI_SUCCESS; - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_GENERATE_PROPOSALS_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - float height_stride = vsi_nn_kernel_param_get_float32( params, "height_stride"); - float width_stride = vsi_nn_kernel_param_get_float32( params, "width_stride"); - int32_t pre_nms_top_n = vsi_nn_kernel_param_get_int32( params, "pre_nms_top_n"); - int32_t post_nms_top_n = vsi_nn_kernel_param_get_int32( params, "post_nms_top_n"); - float iou_threshold = vsi_nn_kernel_param_get_float32(params, "iou_threshold"); - float min_size = vsi_nn_kernel_param_get_float32(params, "min_size"); - - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _GENERATE_PROPOSALS_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[_TENSOR_NUM ] = vsi_nn_kernel_scalar_create( graph, F32, &height_stride ); - node_params[_TENSOR_NUM + 1] = vsi_nn_kernel_scalar_create( graph, F32, 
&width_stride ); - node_params[_TENSOR_NUM + 2] = vsi_nn_kernel_scalar_create( graph, I32, &pre_nms_top_n ); - node_params[_TENSOR_NUM + 3] = vsi_nn_kernel_scalar_create( graph, I32, &post_nms_top_n ); - node_params[_TENSOR_NUM + 4] = vsi_nn_kernel_scalar_create( graph, F32, &iou_threshold ); - node_params[_TENSOR_NUM + 5] = vsi_nn_kernel_scalar_create( graph, F32, &min_size ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _GENERATE_PROPOSALS_PARAM_NUM ); - - vsi_nn_kernel_scalar_release( &node_params[_TENSOR_NUM ] ); - vsi_nn_kernel_scalar_release( &node_params[_TENSOR_NUM + 1] ); - vsi_nn_kernel_scalar_release( &node_params[_TENSOR_NUM + 2] ); - vsi_nn_kernel_scalar_release( &node_params[_TENSOR_NUM + 3] ); - vsi_nn_kernel_scalar_release( &node_params[_TENSOR_NUM + 4] ); - vsi_nn_kernel_scalar_release( &node_params[_TENSOR_NUM + 5] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( generate_proposals, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/group_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/group_normalization_cpu.c deleted file mode 100644 index 82b2482..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/group_normalization_cpu.c +++ /dev/null @@ -1,303 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2019 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "vsi_nn_error.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" -#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _CPU_ARG_NUM (2) -#define _CPU_INPUT_NUM (3) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.group_norm") - -DEF_KERNEL_EXECUTOR(_group_norm_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i = 0; - int32_t spaceOrg = 0; - float eps = .0f; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); - CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); - - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &eps); - CHECK_STATUS_FAIL_GOTO(status, final ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &spaceOrg); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); - - buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create input2 buffer fail.", final ); - - buffer[3] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); - memset( buffer[3], 0, out_elements * sizeof(float) ); - - { - vsi_size_t b = 0, c = 0; - vsi_size_t height = attr[0]->shape->data[1]; - vsi_size_t width = attr[0]->shape->data[0]; - vsi_size_t ch = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; - vsi_size_t bh = attr[0]->shape->size > 3 ? 
attr[0]->shape->data[3] : 1; - vsi_size_t spatial = height * width; - - for (b = 0; b < bh; b++) - { - for (c = 0; c < ch; c++) - { - vsi_size_t page = c * spatial + b * (spatial * ch); - vsi_size_t paraIdx = c * attr[1]->shape->data[0]; - float sum = .0f; - float sumsq = .0f; - float mean = .0f; - float vari = .0f; - float data = 0; - - for (i = 0; i < spatial; i++) - { - vsi_size_t index = page + i; - sum += buffer[0][index]; - } - - mean = sum / spatial; - for (i = 0; i < spatial; i++) - { - vsi_size_t index = page + i; - data = buffer[0][index] - mean; - sumsq += data * data; - } - - vari = sumsq / spatial; - vari = (float)(1.0 / sqrtf(vari + eps)); - - for (i = 0; i < spatial; i++) - { - float normVal = 0; - vsi_size_t index = page + i; - vsi_size_t tmpIdx = paraIdx + i / spaceOrg; - float scaleVal = buffer[2][tmpIdx]; - float biasVal = buffer[1][tmpIdx]; - - data = buffer[0][index] - mean; - normVal = data * vari * scaleVal + biasVal; - buffer[3][index] = normVal; - } - } - } - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], - buffer[3], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if ( buffer[i] ) - { - free( buffer[i] ); - } - } - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - return status; -} /* _group_norm_exec() */ -/* - * Kernel params - */ -static vx_param_description_t _group_normalization_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - // Add kererl parameters here -}; -#define _GROUP_NORMALIZATION_PARAM_NUM _cnt_of_array( _group_normalization_kernel_param_def ) - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _group_norm_exec; - kernel->info.parameters = _group_normalization_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _group_normalization_kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static int32_t _optimize_gn_shape_cpu - ( - vsi_nn_tensor_t ** inputs, - vsi_size_t group_size, - int32_t group_num, - vsi_size_t* opt_shape - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_size_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; - vsi_size_t new_rank = 0; - group_shape[0] = inputs[0]->attr.size[0]; - group_shape[1] = inputs[0]->attr.size[1]; - group_shape[2] = group_size; - - vsi_nn_kernel_optimize_element_shape(group_shape, 3, opt_shape, &new_rank ); - - if (new_rank == 2) - { - opt_shape[2] = group_num; - opt_shape[3] = inputs[0]->attr.dim_num > 3 ? 
inputs[0]->attr.size[3] : 1; - } - else - { - status = VSI_FAILURE; - } - - return status; -} - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; - vsi_size_t new_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 }; - int32_t group_num = vsi_nn_kernel_param_get_int32( params, "group_num" ); - vsi_size_t group_size = inputs[0]->attr.size[2] / group_num; - int32_t spaceOrg = (int32_t)(inputs[0]->attr.size[0] * inputs[0]->attr.size[1]); - - status = _optimize_gn_shape_cpu(inputs, group_size, group_num, new_shape); - if ( VSI_SUCCESS != status ) - { - goto final; - } - rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4); - rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4); - - status = _query_kernel( inputs, outputs, kernel ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); - uint32_t index = 0; - /* Set inputs and outputs */ - backend_params[index++] = rs_input; - backend_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; - backend_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; - backend_params[index++] = rs_output; - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &spaceOrg ); - - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - CHECK_STATUS( status ); - vsi_nn_kernel_scalar_release( &backend_params[4] ); - vsi_nn_kernel_scalar_release( &backend_params[5] ); - } - else - { - status = VSI_FAILURE; - } - } -final: - if (rs_input) - { - vsi_nn_kernel_tensor_release( &rs_input ); - } - if (rs_output) - { - vsi_nn_kernel_tensor_release( &rs_output ); - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( group_norm, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/grucell_activation_cpu.c b/src/tim/vx/internal/src/kernel/cpu/grucell_activation_cpu.c deleted file mode 100644 index 1468b26..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/grucell_activation_cpu.c +++ /dev/null @@ -1,516 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ -#include -#include -#include - -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. - */ -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.grucell_activation") - -/* - * Kernel params - */ -static vx_param_description_t _grucell_activation_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _GRUCELL_ACTIVATION_PARAM_NUM _cnt_of_array( _grucell_activation_kernel_param_def ) - -#define _IO_COUNT_DEFAULT (5) - -static vx_param_description_t _grucell_activation_separated_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _GRUCELL_ACTIVATION_SEPARATED_PARAM_NUM _cnt_of_array( _grucell_activation_separated_kernel_param_def ) -#define _IO_COUNT_SEPARATED (15) -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_ssize_t i = 0; - vsi_ssize_t batch = 0; - vsi_ssize_t hidden_units = 0; - float * buffer[_IO_COUNT_DEFAULT] = { NULL }; - vsi_status status = VSI_FAILURE; - vsi_nn_activation_e gate_activation; - vsi_nn_activation_e candidate_activation; - vsi_nn_kernel_tensor_t tensors[_IO_COUNT_DEFAULT] = { NULL }; - vsi_nn_kernel_tensor_attr_t* attr[_IO_COUNT_DEFAULT] = { NULL }; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; - tensors[4] = 
(vsi_nn_kernel_tensor_t)param[4]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); - attr[4] = vsi_nn_kernel_tensor_attr_create( tensors[4] ); - - /* z{t_} */ - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final ); - buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create input buffer fail.", final ); - buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[3], "Create input buffer fail.", final ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &gate_activation); - CHECK_STATUS_FAIL_GOTO(status, final); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &candidate_activation); - CHECK_STATUS_FAIL_GOTO(status, final); - - batch = attr[0]->shape->data[1]; - hidden_units = attr[0]->shape->data[0]; - - for( i = 0; i < batch * hidden_units; i++ ) - { - float zt = vsi_nn_activation(buffer[0][i], gate_activation); - float ht_ = vsi_nn_activation(buffer[1][i], candidate_activation); - float ht_1 = buffer[2][i]; - float ht = zt * (ht_1 - ht_) + ht_; - - buffer[3][i] = ht; - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], - buffer[3], batch * hidden_units ); - CHECK_STATUS_FAIL_GOTO( status, final ); - status = vsi_nn_kernel_tensor_write_from_float( tensors[4], attr[4], - buffer[3], batch * hidden_units ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < 5; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - vsi_nn_kernel_tensor_attr_release( &attr[i] ); - } - return status; -} /* _compute() */ - -DEF_KERNEL_EXECUTOR(_compute_separated) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_ssize_t i = 0, j = 0; - vsi_ssize_t batch = 0; - vsi_ssize_t hidden_units = 0; - float * buffer[_IO_COUNT_SEPARATED] = { NULL }; - vsi_status status = VSI_FAILURE; - vsi_nn_activation_e gate_activation; - vsi_nn_activation_e candidate_activation; - vsi_bool use_cudnn_implementation; - grucell_activation_input_layout_e input_layout = GRUCELL_ACTIVATION_INPUT_LAYOUT_ALL_NC; - vsi_nn_kernel_tensor_t tensors[_IO_COUNT_SEPARATED] = { NULL }; - vsi_nn_kernel_tensor_attr_t* attr[_IO_COUNT_SEPARATED] = { NULL }; - float *i_r_base = NULL, *i_c_base = NULL, *i_u_base = NULL; - float *r_r_base = NULL, *r_u_base = NULL, *r_c_base = NULL; - float cond_reset = 0.f, cond_update = 0.f, cond_candidate = 0.f; - float i_r = 0.f, i_u = 0.f, i_c = 0.f, r_r = 0.f, r_u = 0.f, r_c = 0.f; - float bias_r = 0.f, bias_u = 0.f, bias_c = 0.f; - float r = 0.f, u = 0.f, c = 0.f, state = 0.f; - - for(i = 0; i < _IO_COUNT_SEPARATED; i++) - { - tensors[i] = (vsi_nn_kernel_tensor_t)param[i]; - attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] ); - } - - /* z{t_} */ - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); - - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], 
TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final ); - buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); - buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], TRUE ); - - buffer[4] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[4], attr[4], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[4], "Create input buffer fail.", final ); - buffer[5] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[5], attr[5], TRUE ); - buffer[6] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[6], attr[6], TRUE ); - - buffer[7] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[7], attr[7], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[7], "Create input buffer fail.", final ); - buffer[8] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[8], attr[8], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[8], "Create input buffer fail.", final ); - buffer[9] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[9], attr[9], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[9], "Create input buffer fail.", final ); - - buffer[10] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[10], attr[10], TRUE ); - buffer[11] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[11], attr[11], TRUE ); - buffer[12] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[12], attr[12], TRUE ); - - buffer[13] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[13], attr[13], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[13], "Create input buffer fail.", final ); - buffer[14] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[14], attr[14], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[14], "Create input buffer fail.", final ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[15], &gate_activation); - CHECK_STATUS_FAIL_GOTO(status, final); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[16], &candidate_activation); - CHECK_STATUS_FAIL_GOTO(status, final); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[17], &use_cudnn_implementation); - CHECK_STATUS_FAIL_GOTO(status, final); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[18], &input_layout); - CHECK_STATUS_FAIL_GOTO(status, final); - - if(GRUCELL_ACTIVATION_INPUT_LAYOUT_ALL_NC == input_layout) - { - batch = attr[1]->shape->data[1]; - hidden_units = attr[1]->shape->data[0]; - - if(buffer[2] == NULL) - { - hidden_units = hidden_units / 3; - } - - for( i = 0; i < batch; i++ ) - { - float* input_base = buffer[0] + i * hidden_units; - float* output_base = buffer[13] + i * hidden_units; - - if(buffer[2] == NULL) - { - float* input_fc_base = buffer[1] + i * hidden_units * 3; - float* recurrent_fc_base = buffer[4] + i * hidden_units * 3; - - i_r_base = input_fc_base + 0 * hidden_units; - i_u_base = input_fc_base + 1 * hidden_units; - i_c_base = input_fc_base + 2 * hidden_units; - - r_r_base = recurrent_fc_base + 0 * hidden_units; - r_u_base = recurrent_fc_base + 1 * hidden_units; - r_c_base = recurrent_fc_base + 2 * hidden_units; - } - else - { - i_r_base = buffer[1] + i * hidden_units; - i_u_base = buffer[2] + i * hidden_units; - i_c_base = buffer[3] + i * hidden_units; - r_r_base = buffer[4] + i * hidden_units; - r_u_base = buffer[5] + i * hidden_units; - r_c_base = buffer[6] + i * hidden_units; - } - - for( j = 0; j < hidden_units; j++ ) - { - cond_reset = buffer[10] ? buffer[10][j] : cond_reset; - cond_update = buffer[11] ? buffer[11][j] : cond_update; - cond_candidate = buffer[12] ? 
buffer[12][j] : cond_candidate; - - bias_r = buffer[7][j]; - bias_u = buffer[8][j]; - bias_c = buffer[9][j]; - - i_r = i_r_base[j]; - i_u = i_u_base[j]; - i_c = i_c_base[j]; - - r_r = r_r_base[j]; - r_u = r_u_base[j]; - r_c = r_c_base[j]; - - r = vsi_nn_activation(i_r + cond_reset + r_r + bias_r, gate_activation); - u = vsi_nn_activation(i_u + cond_update + r_u + bias_u, gate_activation); - c = vsi_nn_activation(i_c + cond_candidate + r * (r_c + bias_c), candidate_activation); - state = u * (input_base[j] - c) + c; - - output_base[j] = state; - } - } - } - else - { - vsi_bool input_transposed = FALSE; - float* input_base = buffer[0]; - float* output_base = buffer[13]; - float* curr_input = NULL; - float* curr_output = NULL; - - batch = attr[1]->shape->data[0]; - hidden_units = attr[1]->shape->data[1]; - - if(buffer[2] == NULL) - { - hidden_units = hidden_units / 3; - i_r_base = buffer[1] + 0 * hidden_units * batch; - i_u_base = buffer[1] + 1 * hidden_units * batch; - i_c_base = buffer[1] + 2 * hidden_units * batch; - r_r_base = buffer[4] + 0 * hidden_units * batch; - r_u_base = buffer[4] + 1 * hidden_units * batch; - r_c_base = buffer[4] + 2 * hidden_units * batch; - } - else - { - i_r_base = buffer[1]; - i_u_base = buffer[2]; - i_c_base = buffer[3]; - r_r_base = buffer[4]; - r_u_base = buffer[5]; - r_c_base = buffer[6]; - } - - if(GRUCELL_ACTIVATION_INPUT_LAYOUT_INPUT_NC_FC_CN == input_layout) - { - input_transposed = FALSE; - } - else - { - input_transposed = TRUE; - } - - for( i = 0; i < hidden_units; i++ ) - { - cond_reset = buffer[10] ? buffer[10][i] : cond_reset; - cond_update = buffer[11] ? buffer[11][i] : cond_update; - cond_candidate = buffer[12] ? buffer[12][i] : cond_candidate; - bias_r = buffer[7][i]; - bias_u = buffer[8][i]; - bias_c = buffer[9][i]; - - for( j = 0; j < batch; j++ ) - { - if(input_transposed) - { - curr_input = &input_base[i * batch + j]; - curr_output = &output_base[i * batch + j]; - } - else - { - curr_input = &input_base[j * hidden_units + i]; - curr_output = &output_base[j * hidden_units + i]; - } - - i_r = i_r_base[i * batch + j]; - i_u = i_u_base[i * batch + j]; - i_c = i_c_base[i * batch + j]; - r_r = r_r_base[i * batch + j]; - r_u = r_u_base[i * batch + j]; - r_c = r_c_base[i * batch + j]; - - r = vsi_nn_activation(i_r + cond_reset + r_r + bias_r, gate_activation); - u = vsi_nn_activation(i_u + cond_update + r_u + bias_u, gate_activation); - c = vsi_nn_activation(i_c + cond_candidate + r * (r_c + bias_c), candidate_activation); - state = u * (*curr_input - c) + c; - - *curr_output = state; - } - } - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[13], attr[13], - buffer[13], batch * hidden_units ); - CHECK_STATUS_FAIL_GOTO( status, final ); - status = vsi_nn_kernel_tensor_write_from_float( tensors[14], attr[14], - buffer[13], batch * hidden_units ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < _IO_COUNT_SEPARATED; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - vsi_nn_kernel_tensor_attr_release( &attr[i] ); - } - return status; -} /* _compute() */ - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs, - int32_t gate_activation, - int32_t candidate_activation, - int32_t input_category, - vsi_bool use_cudnn_implementation, - int32_t* param_count, - int32_t* input_count, - int32_t* output_count - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - if(input_category == 
0) - { - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _grucell_activation_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _grucell_activation_kernel_param_def ); - *param_count = _GRUCELL_ACTIVATION_PARAM_NUM; - *input_count = 3; - *output_count = 2; - status = VSI_SUCCESS; - } - else - { - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute_separated; - kernel->info.parameters = _grucell_activation_separated_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _grucell_activation_separated_kernel_param_def ); - *param_count = _GRUCELL_ACTIVATION_SEPARATED_PARAM_NUM; - *input_count = 13; - *output_count = 2; - status = VSI_SUCCESS; - } - return status; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t* node_params = NULL; - vsi_nn_kernel_node_t node = NULL; - int32_t i = 0; - int32_t j = 0; - int32_t param_count = 0; - int32_t input_count = 0; - int32_t output_count = 0; - int32_t gate_activation = vsi_nn_kernel_param_get_int32( params, "gate_activation" ); - int32_t candidate_activation = vsi_nn_kernel_param_get_int32( params, "candidate_activation" ); - int32_t input_category = vsi_nn_kernel_param_get_int32( params, "input_category" ); - int32_t use_cudnn_implementation = vsi_nn_kernel_param_get_int32( params, "use_cudnn_implementation" ); - grucell_activation_input_layout_e input_layout = vsi_nn_kernel_param_get_int32( params, "input_layout" ); - vsi_nn_tensor_t** _inputs = NULL; - - status = _query_kernel( kernel, inputs, outputs, gate_activation, candidate_activation, - input_category, use_cudnn_implementation, &param_count, &input_count, &output_count ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - _inputs = (vsi_nn_tensor_t**)malloc(input_count * sizeof(vsi_nn_tensor_t**)); - CHECK_PTR_FAIL_GOTO( _inputs, "Create buffer fail.", final ); - node_params = (vsi_nn_kernel_node_param_t *)malloc(sizeof(vsi_nn_kernel_node_param_t) * param_count); - CHECK_PTR_FAIL_GOTO( node_params, "Create buffer fail.", final ); - for(i = 0; i < input_count; i++) - { - _inputs[i] = inputs[i]; - } - - j = input_count + output_count; - - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, param_count, - _inputs, input_count, outputs, output_count ); - node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &gate_activation ); - node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &candidate_activation ); - if(input_category != 0) - { - node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &use_cudnn_implementation ); - node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &input_layout ); - } - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, param_count ); - if(input_category != 0) - { - vsi_nn_kernel_scalar_release( &node_params[--j] ); - vsi_nn_kernel_scalar_release( &node_params[--j] ); - } - vsi_nn_kernel_scalar_release( &node_params[--j] ); - vsi_nn_kernel_scalar_release( &node_params[--j] ); - } - } - -final: - vsi_nn_safe_free(_inputs); - vsi_nn_safe_free(node_params); - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( grucell_activation, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/grucell_activation_sma_cpu.c b/src/tim/vx/internal/src/kernel/cpu/grucell_activation_sma_cpu.c deleted file mode 100644 index 783f779..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/grucell_activation_sma_cpu.c +++ /dev/null @@ -1,180 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (3) -#define _OUTPUT_NUM (2) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.grucell_activation_sma") - -/* - * Kernel params - */ -static vx_param_description_t _grucell_activation_sma_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _GRUCELL_ACTIVATION_SMA_PARAM_NUM _cnt_of_array( _grucell_activation_sma_kernel_param_def ) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_size_t i = 0; - vsi_size_t batch = 0; - vsi_size_t hidden_units = 0; - float * buffer[_IO_NUM] = { NULL }; - vsi_nn_kernel_tensor_t tensors[_IO_NUM] = { NULL }; - vsi_nn_kernel_tensor_attr_t* attr[_IO_NUM] = { NULL }; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; - tensors[4] = (vsi_nn_kernel_tensor_t)param[4]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); - attr[4] = vsi_nn_kernel_tensor_attr_create( tensors[4] ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final ); - buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create input buffer fail.", final ); - buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[3], "Create input buffer fail.", final ); - - batch = attr[0]->shape->data[1]; - hidden_units = attr[0]->shape->data[0]; - - for( i = 0; i < batch * hidden_units; i++ ) - { - buffer[3][i] = (buffer[0][i] - buffer[1][i]) * buffer[2][i] + buffer[1][i]; - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], - buffer[3], batch * hidden_units ); - CHECK_STATUS_FAIL_GOTO( status, final ); - status = vsi_nn_kernel_tensor_write_from_float( tensors[4], attr[4], - buffer[3], batch * hidden_units ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < _IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - vsi_nn_kernel_tensor_attr_release( &attr[i] ); - } - return status; -} /* _compute() */ - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _grucell_activation_sma_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _grucell_activation_sma_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* 
_query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_GRUCELL_ACTIVATION_SMA_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_ACTIVATION_SMA_PARAM_NUM, - inputs, input_num, outputs, output_num ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_ACTIVATION_SMA_PARAM_NUM ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( grucell_activation_sma, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/grucell_activation_z_h_cpu.c b/src/tim/vx/internal/src/kernel/cpu/grucell_activation_z_h_cpu.c deleted file mode 100644 index a5bd220..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/grucell_activation_z_h_cpu.c +++ /dev/null @@ -1,261 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (GRUCELL_ACT_Z_H_IN_CNT) -#define _OUTPUT_NUM (GRUCELL_ACT_Z_H_OUT_CNT) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.grucell_activation_z_h") - - -/* - * Kernel params - */ -static vx_param_description_t _grucell_activation_z_h_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, /*activation*/ - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, /*recurrent_activation*/ -}; -#define _GRUCELL_ACTIVATION_Z_H_PARAM_NUM _cnt_of_array( _grucell_activation_z_h_kernel_param_def ) -#define SCALAR_ACTIVATION (7) -#define SCALAR_R_ACTIVATION (8) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - vsi_size_t i, b; - int32_t activation = 0; - int32_t recurrent_activation = 0; - vsi_size_t n_batch = 0; - vsi_size_t n_cell = 0; - - /* prepare data */ - for ( i = 0; i < _INPUT_NUM; i++ ) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - if (input[i]) - { - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - } - - for ( i = 0; i < _OUTPUT_NUM; i++ ) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - if (output[i]) - { - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - } - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ACTIVATION], &activation ); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_R_ACTIVATION], - &recurrent_activation ); - CHECK_STATUS_FAIL_GOTO(status, final ); - n_cell = in_attr[GRUCELL_ACT_Z_H_HSTATE]->shape->data[0]; - n_batch = in_attr[GRUCELL_ACT_Z_H_HSTATE]->shape->data[1]; - - for (b = 0; b < n_batch; b ++) - { - for (i = 0; i < n_cell; i++) - { - vsi_size_t index = i + n_cell * b; - float data_z_t = 0; - float data_h_t = 0; - float hstate_in = f32_in_buffer[GRUCELL_ACT_Z_H_HSTATE][index]; - 
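/* Note: the statements below are the z/h half of a GRU step. Schematically,
   z_t = recurrent_activation(fc_i_z + fc_h_z) and
   h_cand = activation(fc_i_h + fc_h_h), then
   h_new = (1 - z_t) * h_cand + z_t * h_prev,
   so the update gate z_t linearly interpolates between the candidate and the
   previous hidden state. The fc_ names here are shorthand for the
   precomputed fully-connected inputs (GRUCELL_ACT_Z_H_I_FC_Z and friends);
   the reset gate has already been folded into the h-branch FC by a
   companion kernel. */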
float dst = 0; - - data_z_t = f32_in_buffer[GRUCELL_ACT_Z_H_I_FC_Z][index]; - data_z_t += f32_in_buffer[GRUCELL_ACT_Z_H_H_FC_Z][index]; - data_z_t = vsi_nn_activation(data_z_t, recurrent_activation); - - data_h_t = f32_in_buffer[GRUCELL_ACT_Z_H_I_FC_H][index]; - data_h_t += f32_in_buffer[GRUCELL_ACT_Z_H_H_FC_H][index]; - data_h_t = vsi_nn_activation(data_h_t, activation); - - dst = (1 - data_z_t ) * data_h_t + data_z_t * hstate_in; - - f32_out_buffer[GRUCELL_ACT_Z_H_OUT_OUTPUT][index] = dst; - f32_out_buffer[GRUCELL_ACT_Z_H_OUT_HSTATE][index] = dst; - } - } - - /* save data */ - for (i = 0; i < _OUTPUT_NUM; i++) - { - if (output[i]) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for (i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_SUCCESS; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _grucell_activation_z_h_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _grucell_activation_z_h_kernel_param_def ); - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_GRUCELL_ACTIVATION_Z_H_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - int32_t activation = vsi_nn_kernel_param_get_int32( params, "activation" ); - int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); - - status = _query_kernel( kernel, inputs, outputs ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_ACTIVATION_Z_H_PARAM_NUM, - inputs, input_num, outputs, output_num ); - /* Pass parameters to node. 
*/ - node_params[SCALAR_ACTIVATION] = vsi_nn_kernel_scalar_create( - graph, I32, &activation ); - node_params[SCALAR_R_ACTIVATION] = vsi_nn_kernel_scalar_create( - graph, I32, &recurrent_activation ); - status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_ACTIVATION_Z_H_PARAM_NUM ); - - vsi_nn_kernel_scalar_release( &node_params[SCALAR_ACTIVATION] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_R_ACTIVATION] ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( grucell_activation_z_h, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/grucell_h_times_activation_r_cpu.c b/src/tim/vx/internal/src/kernel/cpu/grucell_h_times_activation_r_cpu.c deleted file mode 100644 index b61f92e..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/grucell_h_times_activation_r_cpu.c +++ /dev/null @@ -1,245 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (3) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.grucell_h_times_activation_r") - - -/* - * Kernel params - */ -static vx_param_description_t _grucell_h_times_activation_r_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, /*recurrent_activation*/ - // Add kernel parameters here -}; -#define _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM _cnt_of_array( _grucell_h_times_activation_r_kernel_param_def ) -#define SCALAR_R_ACTIVATION (4) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - vsi_size_t i, b; - int32_t recurrent_activation = 0; - vsi_size_t n_batch = 0; - vsi_size_t n_cell = 0; - - /* prepare data */ - for( i = 0; i < _INPUT_NUM; i++ ) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - if (input[i]) - { - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - } - - for( i = 0; i < _OUTPUT_NUM; i++ ) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - if (output[i]) - { - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - } - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_R_ACTIVATION], - &recurrent_activation ); - CHECK_STATUS_FAIL_GOTO(status, final ); - n_cell = in_attr[0]->shape->data[0]; - n_batch = in_attr[0]->shape->data[1]; - - for (b = 0; b < n_batch; b ++) - { - for (i = 0; i < n_cell; i++) - { - vsi_size_t index = i + n_cell * b; - float data_r_t = 0; - float r_times_h = 0; - float hstate_in = f32_in_buffer[0][index]; - - data_r_t = f32_in_buffer[1][index]; - data_r_t += f32_in_buffer[2][index]; - - data_r_t = vsi_nn_activation(data_r_t, recurrent_activation); - - r_times_h = hstate_in * data_r_t; - - f32_out_buffer[0][index] = r_times_h; - } - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (output[i]) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - }
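/* Note: this kernel is the reset-gate stage of the same split GRU:
   r_t = recurrent_activation(fc_i_r + fc_h_r), out = r_t * h_prev,
   computed elementwise over an [n_cell, n_batch] tile, where fc_i_r and
   fc_h_r are shorthand for the two precomputed fully-connected inputs.
   The r_t * h_prev product is what the candidate branch consumes (after one
   more FC) in the grucell_activation_z_h kernel above. */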
- } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_SUCCESS; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _grucell_h_times_activation_r_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _grucell_h_times_activation_r_kernel_param_def ); - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); - - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_R_ACTIVATION] = vsi_nn_kernel_scalar_create( - graph, I32, &recurrent_activation ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_R_ACTIVATION] ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( grucell_h_times_activation_r, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/grucell_reset_after_activation_cpu.c b/src/tim/vx/internal/src/kernel/cpu/grucell_reset_after_activation_cpu.c deleted file mode 100644 index cfd0eb1..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/grucell_reset_after_activation_cpu.c +++ /dev/null @@ -1,271 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2021 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. - */ -#define _INPUT_NUM (GRUCELL_ACT_IN_CNT) -#define _OUTPUT_NUM (GRUCELL_ACT_OUT_CNT) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.grucell_reset_after_activation") - - -/* - * Kernel params - */ -static vx_param_description_t _grucell_reset_after_activation_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, /*activation*/ - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, /*recurrent_activation*/ - // Add kernel parameters here -}; -#define _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM _cnt_of_array( _grucell_reset_after_activation_kernel_param_def ) -#define SCALAR_ACTIVATION (9) -#define SCALAR_R_ACTIVATION (10) -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - vsi_size_t i, b; - int32_t activation = 0; - int32_t recurrent_activation = 0; - vsi_size_t n_batch = 0; - vsi_size_t n_cell = 0; - - /* prepare data */ - for ( i = 0; i < _INPUT_NUM; i++ ) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - if (input[i]) - { - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - } - - for ( i = 0; i < _OUTPUT_NUM; i++ ) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - if (output[i]) - { - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i]
); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - } - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ACTIVATION], &activation ); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_R_ACTIVATION], - &recurrent_activation ); - CHECK_STATUS_FAIL_GOTO(status, final ); - n_cell = in_attr[GRUCELL_ACT_H_STATE]->shape->data[0]; - n_batch = in_attr[GRUCELL_ACT_H_STATE]->shape->data[1]; - - for (b = 0; b < n_batch; b ++) - { - for (i = 0; i < n_cell; i++) - { - vsi_size_t index = i + n_cell * b; - float data_z_t = 0; - float data_r_t = 0; - float data_h_t = 0; - float r_times_h = 0; - float hstate_in = f32_in_buffer[GRUCELL_ACT_H_STATE][index]; - float dst = 0; - - data_z_t = f32_in_buffer[GRUCELL_ACT_I_FC_Z][index]; - data_r_t = f32_in_buffer[GRUCELL_ACT_I_FC_R][index]; - data_h_t = f32_in_buffer[GRUCELL_ACT_I_FC_H][index]; - data_z_t += f32_in_buffer[GRUCELL_ACT_H_FC_Z][index]; - data_r_t += f32_in_buffer[GRUCELL_ACT_H_FC_R][index]; - - data_z_t = vsi_nn_activation(data_z_t, recurrent_activation); - data_r_t = vsi_nn_activation(data_r_t, recurrent_activation); - - r_times_h = f32_in_buffer[GRUCELL_ACT_H_FC_H][index] * data_r_t; - data_h_t += r_times_h; - - data_h_t = vsi_nn_activation(data_h_t, activation); - - dst = (1 - data_z_t ) * data_h_t + data_z_t * hstate_in; - - f32_out_buffer[GRUCELL_ACT_OUT_OUTPUT][index] = dst; - f32_out_buffer[GRUCELL_ACT_OUT_H_STATE][index] = dst; - } - } - - /* save data */ - for (i = 0; i < _OUTPUT_NUM; i++) - { - if (output[i]) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for (i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_SUCCESS; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _grucell_reset_after_activation_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _grucell_reset_after_activation_kernel_param_def ); - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - int32_t activation = vsi_nn_kernel_param_get_int32( params, "activation" ); - int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); - - status = _query_kernel( kernel, 
inputs, outputs ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_ACTIVATION] = vsi_nn_kernel_scalar_create( - graph, I32, &activation ); - node_params[SCALAR_R_ACTIVATION] = vsi_nn_kernel_scalar_create( - graph, I32, &recurrent_activation ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM ); - - vsi_nn_kernel_scalar_release( &node_params[SCALAR_ACTIVATION] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_R_ACTIVATION] ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( grucell_reset_after_activation, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/heatmap_max_keypoint_cpu.c b/src/tim/vx/internal/src/kernel/cpu/heatmap_max_keypoint_cpu.c deleted file mode 100644 index 61f6cd2..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/heatmap_max_keypoint_cpu.c +++ /dev/null @@ -1,322 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. - */ -#define _INPUT_NUM (2) -#define _OUTPUT_NUM (2) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.heatmap_max_keypoint") - - -/* - * Kernel params - */ -static vx_param_description_t _heatmap_max_keypoint_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _HEATMAP_MAX_KEYPOINT_PARAM_NUM _cnt_of_array( _heatmap_max_keypoint_kernel_param_def ) - -// This function uses Taylor expansion up to the quadratic term to approximate bicubic -// upscaling result. 
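// Note: _solve_for_delta below performs a single Newton step on this
// quadratic model: b is the negative gradient and A the Hessian, both taken
// from central differences on the 3x3 grid, and delta = inv(A) * b, with the
// step clipped to +/-1.5 cells. A worked example with illustrative numbers:
// peak row {0.5, 1.0, 0.7}, peak column {0.5, 1.0, 0.5}, equal corners, gives
//   b = (-0.1, 0), A = [[-0.8, 0], [0, -1.0]], detA = 0.8,
// so delta = (0.125, 0), i.e. the keypoint moves an eighth of a cell toward
// the larger neighbour, and deltaScore = 1.0 + 0.0125 - 0.00625 = 1.00625.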
-// 2nd order Taylor expansion: D(x) = D - b'x + 1/2 * x'Ax -// where D = grid[1][1], Taylor expansion center, the original score, -// x = delta, the correction on max keypoint position, -// D(x) = deltaScore, the accuracy score after correction -static void _solve_for_delta - ( - const float grid[3][3], - float* delta, - float* deltaScore, - float fpAtol, - float fpRtol - ) -{ - // b: negative 1st order derivative at center - // A: Hessian matrix at center (2nd order derivative) - float A[2][2], b[2]; - float crossProd1, crossProd2; - float detA; - b[0] = -(grid[1][2] - grid[1][0]) / 2.0f; - b[1] = -(grid[2][1] - grid[0][1]) / 2.0f; - A[0][0] = grid[1][0] - 2.0f * grid[1][1] + grid[1][2]; - A[0][1] = (grid[2][2] - grid[2][0] - grid[0][2] + grid[0][0]) / 4.0f; - A[1][0] = A[0][1]; - A[1][1] = grid[0][1] - 2.0f * grid[1][1] + grid[2][1]; - - // solve Ax=b, where x=delta -> delta = inv(A) * b - crossProd1 = A[0][0] * A[1][1]; - crossProd2 = A[0][1] * A[1][0]; - detA = crossProd1 - crossProd2; - // check if A is invertible - if (fabs(detA) < (fpAtol + fpRtol * crossProd1)) return; - delta[0] = (A[1][1] * b[0] - A[0][1] * b[1]) / detA; - delta[1] = (A[0][0] * b[1] - A[1][0] * b[0]) / detA; - - // clip out of range delta, i.e. delta > 3/2 - if (fabs(delta[0]) > 1.5f || fabs(delta[1]) > 1.5f) - { - float scale = (float)(1.5f / vsi_nn_max(fabs(delta[0]), fabs(delta[1]))); - delta[0] *= scale; - delta[1] *= scale; - } - - *deltaScore = grid[1][1] - b[0] * delta[0] - b[1] * delta[1] + - ((A[0][0] * delta[0] + A[0][1] * delta[1]) * delta[0] + - (A[1][0] * delta[0] + A[1][1] * delta[1]) * delta[1]) / - 2.0f; -} -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i = 0; - uint32_t j = 0; - uint32_t k = 0; - vsi_size_t numBoxes = 0; - vsi_size_t heatmapSize = 0; - vsi_size_t numKeypoints = 0; - uint32_t boxInfoLength = 4; - uint32_t output_score_index = 0; - uint32_t output_keypoint_index = 0; - - /* prepare data */ - for (i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final ); - } - - for (i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - numBoxes = in_attr[0]->shape->data[3]; - heatmapSize = in_attr[0]->shape->data[2]; - 
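/* Note: the vx dim order puts the fastest-varying dimension first, so the
   heatmap input is laid out as
   [numKeypoints, heatmapSize, heatmapSize, numBoxes], and element
   (box, y, x, keypoint) sits at flat index
   ((box * heatmapSize + y) * heatmapSize + x) * numKeypoints + keypoint,
   which is the indexing used by the argmax search and the 3x3 grid gather
   below. */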
numKeypoints = in_attr[0]->shape->data[0]; - - for(i = 0; i < numBoxes; i++) - { - for (j = 0; j < numKeypoints; j++) - { - uint32_t maxIndex = 0; - float maxScore = -FLT_MAX; - vsi_size_t maxIndexWidth; - vsi_size_t maxIndexHeight; - float localGrid[3][3] = {{0}}; - int32_t dh, dw; - float delta[2] = {0.0f, 0.0f}, deltaScore; - float wRoiStart = f32_in_buffer[1][i * boxInfoLength]; - float hRoiStart = f32_in_buffer[1][i * boxInfoLength + 1]; - float wRoiEnd = f32_in_buffer[1][i * boxInfoLength + 2]; - float hRoiEnd = f32_in_buffer[1][i * boxInfoLength + 3]; - float roiWidth = wRoiEnd - wRoiStart; - float roiHeight = hRoiEnd - hRoiStart; - float wRelativePos; - float hRelativePos; - for (k = 0; k < heatmapSize * heatmapSize; k++) - { - vsi_size_t index = i * heatmapSize * heatmapSize * numKeypoints - + k * numKeypoints + j; - float val = f32_in_buffer[0][index]; - if (maxScore < val) - { - maxScore = val; - maxIndex = k; - } - } - maxIndexWidth = maxIndex % heatmapSize; - maxIndexHeight = maxIndex / heatmapSize; - - // get local 3x3 grid - for (dh = -1; dh <= 1; dh++) - { - for (dw = -1; dw <= 1; dw++) - { - // cast uint32_t to int32_t - vsi_ssize_t h = (vsi_ssize_t)(maxIndexHeight) + dh; - vsi_ssize_t w = (vsi_ssize_t)(maxIndexWidth) + dw; - vsi_size_t heatmapIndex; - - // use mirroring for out of bound indexing - // need to ensure heatmapSize >= 2 - h = h < 0 ? 1 : (h >= (vsi_ssize_t)heatmapSize ? heatmapSize - 2 : h); - w = w < 0 ? 1 : (w >= (vsi_ssize_t)heatmapSize ? heatmapSize - 2 : w); - - heatmapIndex = i * heatmapSize * heatmapSize * numKeypoints + - (vsi_size_t)(h) * heatmapSize * numKeypoints + - (vsi_size_t)(w) * numKeypoints + j; - localGrid[dh + 1][dw + 1] = f32_in_buffer[0][heatmapIndex]; - } - } - deltaScore = maxScore; - _solve_for_delta((const float (*)[3])localGrid, delta, &deltaScore, 1e-3f, 1e-3f); - - wRelativePos = ((float)(maxIndexWidth) + delta[0] + 0.5f) / - (float)(heatmapSize); - hRelativePos = ((float)(maxIndexHeight) + delta[1] + 0.5f) / - (float)(heatmapSize); - f32_out_buffer[0][output_score_index] = deltaScore; - f32_out_buffer[1][output_keypoint_index] = wRelativePos * roiWidth + wRoiStart; - f32_out_buffer[1][output_keypoint_index + 1] = hRelativePos * roiHeight + hRoiStart; - output_score_index++; - output_keypoint_index += 2; - } - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } -final: - for (i = 0; i < _INPUT_NUM; i++) - { - vsi_nn_safe_free(f32_in_buffer[i]); - - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for (i = 0; i < _OUTPUT_NUM; i++) - { - vsi_nn_safe_free(f32_out_buffer[i]); - - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_SUCCESS; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _heatmap_max_keypoint_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _heatmap_max_keypoint_kernel_param_def ); - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, 
- size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_HEATMAP_MAX_KEYPOINT_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( kernel, inputs, outputs ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _HEATMAP_MAX_KEYPOINT_PARAM_NUM, - inputs, input_num, outputs, output_num ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _HEATMAP_MAX_KEYPOINT_PARAM_NUM ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( heatmap_max_keypoint, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c deleted file mode 100644 index 24a1db4..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c +++ /dev/null @@ -1,246 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2019 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "vsi_nn_error.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _CPU_ARG_NUM (1) -#define _CPU_INPUT_NUM (3) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.instance_norm") - -DEF_KERNEL_EXECUTOR(_instance_norm_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - vsi_size_t batch = 1; - vsi_size_t depth = 1; - vsi_size_t norm_size = 1; - vsi_size_t b = 0; - vsi_size_t c = 0; - vsi_size_t i = 0; - size_t rank = 1; - float eps = .0f; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); - CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); - - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &eps); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); - - buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create input1 buffer fail.", final ); - - buffer[3] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); - memset( buffer[3], 0, out_elements * sizeof(float) ); - - rank = attr[0]->shape->size; - - batch = attr[0]->shape->data[rank - 1]; - depth = attr[0]->shape->data[rank - 2]; - - for ( i = 0; i < (vsi_size_t)rank - 2; i++) - { - norm_size *= attr[0]->shape->data[i]; - } - - for (b = 0; b < batch; b++) - { - for (c = 0; c < depth; c++) - { - vsi_size_t page = c * norm_size + b * norm_size * depth; - float sum = .0f; - float sumsq = .0f; - float mean = .0f; - float vari = .0f; - float data = 0; - float scaleVal = buffer[2][c]; - float biasVal = buffer[1][c]; - - for (i = 0; i < norm_size; i++) - { - vsi_size_t index = page + i; - sum += buffer[0][index]; - } - - mean = sum / (float)norm_size; - - for (i = 0; i < norm_size; i++) - { - vsi_size_t index = page + i; - data = buffer[0][index] - mean; - sumsq += data * data; - } - - vari = sumsq / (float)norm_size; - vari = (float)(1.0 / sqrtf(vari + eps)); - - for (i = 0; i < norm_size; i++) - { - float normVal = 0; - vsi_size_t index = page + i; - data = buffer[0][index] - mean; - - normVal = data * vari * scaleVal + biasVal; - buffer[3][index] = normVal; - } - } - } - - status = vsi_nn_kernel_tensor_write_from_float( 
tensors[3], attr[3], - buffer[3], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - } - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - return status; -} /* _instance_norm_exec() */ -/* - * Kernel params - */ -static vx_param_description_t _instance_normalization_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - // Add kernel parameters here -}; -#define _INSTANCE_NORMALIZATION_PARAM_NUM _cnt_of_array( _instance_normalization_kernel_param_def ) - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _instance_norm_exec; - kernel->info.parameters = _instance_normalization_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _instance_normalization_kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - backend_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); - - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - CHECK_STATUS( status ); - vsi_nn_kernel_scalar_release( &backend_params[4] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( instance_norm, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/l2normalizescale_cpu.c b/src/tim/vx/internal/src/kernel/cpu/l2normalizescale_cpu.c deleted file mode 100644 index c220601..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/l2normalizescale_cpu.c +++ /dev/null @@ -1,246 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. - */ -#define _INPUT_NUM (2) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.l2normalizescale") - - -/* - * Kernel params - */ -static vx_param_description_t _l2normalizescale_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -#define _L2NORMALIZESCALE_PARAM_NUM _cnt_of_array( _l2normalizescale_kernel_param_def ) - -#define SCALAR_INPUT_AXIS (3) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - vsi_ssize_t index; - int32_t axis = 0; - vsi_ssize_t outerSize = 1; - vsi_ssize_t axisSize = 1; - vsi_ssize_t innerSize = 1; - vsi_ssize_t inner = 0; - int32_t outer = 0; - float rsqrt = 0.0f, scaleValue = 0.0f; - float epsilon = (float)10e-12; - float l2Value = 0.0f, tmpValue = 0.0f; - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_AXIS], &axis); - CHECK_STATUS_FAIL_GOTO(status, final ); - - for (i = 0; i < (uint32_t)axis; i++) - { - 
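/* Note: dims below `axis` are folded into innerSize and dims above it into
   outerSize, so element (outer, a, inner) lives at flat index
   (outer * axisSize + a) * innerSize + inner. The loops below then apply
   out = x * scale[a] / sqrt(max(sum_a x^2, epsilon)) independently per
   (outer, inner) fiber. */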
innerSize *= in_attr[0]->shape->data[i]; - } - - axisSize = in_attr[0]->shape->data[axis]; - - for (i = (uint32_t)axis + 1; i < in_attr[0]->shape->size; i++) - { - outerSize *= in_attr[0]->shape->data[i]; - } - - for (outer = 0; outer < outerSize; ++outer) { - for (inner = 0; inner < innerSize; ++inner) { - float sum = 0.0f; - - for (i = 0; i < (uint32_t)axisSize; ++i) { - index = (outer * axisSize + i) * innerSize + inner; - tmpValue = f32_in_buffer[0][index]; - sum += tmpValue * tmpValue; - } - rsqrt = 1.0f / sqrtf(vsi_nn_max(sum, epsilon)); - for (i = 0; i < (uint32_t)axisSize; ++i) { - index = (outer * axisSize + i) * innerSize + inner; - tmpValue = f32_in_buffer[0][index]; - scaleValue = f32_in_buffer[1][i]; - l2Value = tmpValue * rsqrt * scaleValue; - f32_out_buffer[0][index] = l2Value; - } - } - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _l2normalizescale_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _l2normalizescale_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_L2NORMALIZESCALE_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t axis = 0; - - axis = vsi_nn_kernel_param_get_int32(params, "axis"); - status = _query_kernel( kernel, inputs, outputs ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _L2NORMALIZESCALE_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( - graph, I32, &axis ); - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _L2NORMALIZESCALE_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( l2normalizescale, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c deleted file mode 100644 index 1329ce3..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c +++ /dev/null @@ -1,243 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2019 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "vsi_nn_error.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _CPU_ARG_NUM (1) -#define _CPU_INPUT_NUM (3) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.layer_norm") - -DEF_KERNEL_EXECUTOR(_layer_norm_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - vsi_size_t i = 0; - float eps = .0f; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); - CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); - - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &eps); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); - - buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create input1 buffer fail.", final ); - - buffer[3] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); - memset( buffer[3], 0, out_elements * sizeof(float) ); - - { - vsi_size_t axis_first = 0; - vsi_size_t axis_num = 1; - vsi_size_t outerSize = 1; - vsi_size_t axisSize = 1; - vsi_size_t innerSize = 1; - vsi_size_t inner = 0; - vsi_size_t outer = 0; - - for (i = 0; i < axis_first; i++) - { - innerSize *= attr[0]->shape->data[i]; - } - - for(i = 0; i < axis_num; i++) - { - axisSize *= attr[0]->shape->data[axis_first + i]; - } - - for (i = axis_first + axis_num; i < attr[0]->shape->size; i++) - { - outerSize *= attr[0]->shape->data[i]; - } - - for ( outer = 0; outer < outerSize; ++outer) - { - for ( inner = 0; inner < innerSize; ++inner) - { - float sum = .0f; - float sumsq = .0f; - float mean = .0f; - float vari = .0f; - - for (i = 0; i < axisSize; ++i) - { - float value = buffer[0][(outer * axisSize + i) * innerSize + inner]; - sum += value; - sumsq += (value * value); - } - mean = sum / (axisSize); - vari = sumsq / (axisSize) - mean * mean; - vari = (float)(1.0 / sqrtf(vari + eps)); - - for (i = 0; i < axisSize; ++i) - { - vsi_ssize_t idx = (outer * axisSize + i) * innerSize + inner; - float data = buffer[0][idx] - mean; - float scaleVal = buffer[2][i]; - float biasVal = buffer[1][i]; - float normVal = data * vari * scaleVal + biasVal; - buffer[3][idx] = normVal; - } - } - } - } - - status = 
vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], - buffer[3], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if ( buffer[i] ) - { - free( buffer[i] ); - } - } - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - return status; -} /* _layer_norm_exec() */ -/* - * Kernel params - */ -static vx_param_description_t _layer_normalization_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - // Add kernel parameters here -}; -#define _LAYER_NORMALIZATION_PARAM_NUM _cnt_of_array( _layer_normalization_kernel_param_def ) - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _layer_norm_exec; - kernel->info.parameters = _layer_normalization_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _layer_normalization_kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( inputs, outputs, kernel ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - backend_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); - - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - CHECK_STATUS( status ); - vsi_nn_kernel_scalar_release( &backend_params[4] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( layer_norm, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c b/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c deleted file mode 100644 index 67e0d84..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c +++ /dev/null @@ -1,223 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software.
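The arithmetic in the deleted _layer_norm_exec() above is the single-pass formulation: mean = sum(x)/N, var = sum(x*x)/N - mean*mean, then out = (x - mean)/sqrt(var + eps) * scale + bias along the normalized axis. A minimal standalone restatement of that per-slice math (a sketch, not code from the tree):

```c
#include <math.h>
#include <stddef.h>

/* Sketch of the layer_norm reference math above: single-pass mean and
 * variance (var = E[x^2] - E[x]^2), then normalize, scale and shift. */
static void layer_norm_1d( const float * x, const float * bias,
    const float * scale, float * out, size_t n, float eps )
{
    float sum = 0.0f, sumsq = 0.0f;
    size_t i;
    for ( i = 0; i < n; i++ )
    {
        sum += x[i];
        sumsq += x[i] * x[i];
    }
    {
        float mean = sum / (float)n;
        float vari = sumsq / (float)n - mean * mean;
        float inv_std = 1.0f / sqrtf( vari + eps );
        for ( i = 0; i < n; i++ )
        {
            out[i] = ( x[i] - mean ) * inv_std * scale[i] + bias[i];
        }
    }
}
```

The single-pass E[x^2] - E[x]^2 variance saves one traversal but can lose precision for large-magnitude inputs compared with a two-pass variance; the deleted reference code accepts that trade-off.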
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "kernel/vsi_nn_kernel.h" -#include "kernel/vsi_nn_kernel_eltwise.h" -__BEGIN_DECLS - -#define _CPU_ARG_NUM (2) -#define _CPU_INPUT_NUM (1) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("log_softmax_sw") - -DEF_KERNEL_EXECUTOR(_log_softmax_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - int32_t axis = 0; - float beta = 0; - vsi_ssize_t outerSize = 1; - vsi_ssize_t axisSize = 1; - vsi_ssize_t innerSize = 1; - vsi_ssize_t i = 0; - vsi_ssize_t inner = 0; - vsi_ssize_t outer = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[3], &beta); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); - buffer[1] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); - memset( buffer[1], 0, out_elements * sizeof(float) ); - - for (i = 0; i < axis; i++) - { - innerSize *= attr[0]->shape->data[i]; - } - - axisSize = attr[0]->shape->data[axis]; - - for (i = axis + 1; i < (vsi_ssize_t)attr[0]->shape->size; i++) - { - outerSize *= attr[0]->shape->data[i]; - } - - for ( outer = 0; outer < outerSize; ++outer) - { - for ( inner = 0; inner < innerSize; ++inner) - { - // We subtract the maximum value from each element to ensure - // numerical stability, taking advantage of the following equality: - // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) - float sum = 0; - float logSum = 0; - float maxValue = buffer[0][outer * axisSize * innerSize + inner]; - for (i = 1; i < axisSize; ++i) - { - maxValue = vsi_nn_max(maxValue, buffer[0][(outer * axisSize + i) * innerSize + inner]); - } - - sum = 0; - for (i = 0; i < axisSize; ++i) - { - sum += expf((buffer[0][(outer * axisSize + i) * innerSize 
+ inner] - maxValue) * beta); - } - - logSum = logf(sum); - for (i = 0; i < axisSize; ++i) - { - buffer[1][(outer * axisSize + i) * innerSize + inner] = - (buffer[0][(outer * axisSize + i) * innerSize + inner] - maxValue) * beta - - logSum; - } - } - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - buffer[1], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - vsi_nn_kernel_tensor_attr_release( &attr[i] ); - } - return status; -} /* _log_softmax_exec() */ - -static vx_param_description_t kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} -}; - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _log_softmax_exec; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -#define SCALAR_INPUT_AXIS (2) -#define SCALAR_INPUT_BETA (3) - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t axis = 0; - float beta = 1.0f; - - axis = vsi_nn_kernel_param_get_int32(params, "axis"); - beta = vsi_nn_kernel_param_get_float32(params, "beta"); - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( - graph, I32, &axis ); - backend_params[SCALAR_INPUT_BETA] = vsi_nn_kernel_scalar_create( - graph, F32, &beta ); - - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - - vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_AXIS] ); - vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_BETA] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( log_softmax, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/logical_not_cpu.c b/src/tim/vx/internal/src/kernel/cpu/logical_not_cpu.c deleted file mode 100644 index 9bcdcab..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/logical_not_cpu.c +++ /dev/null @@ -1,197 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.logical_not") - - -/* - * Kernel params - */ -static vx_param_description_t _logical_not_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _LOGICAL_NOT_PARAM_NUM _cnt_of_array( _logical_not_kernel_param_def ) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - /* prepare data */ - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - for (i = 0; i < out_elements[0]; i++) - { - f32_out_buffer[0][i] = (float)(!f32_in_buffer[0][i]); - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _logical_not_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _logical_not_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - 
vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_LOGICAL_NOT_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( kernel, inputs, outputs); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _LOGICAL_NOT_PARAM_NUM, - inputs, input_num, outputs, output_num ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _LOGICAL_NOT_PARAM_NUM ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( logical_not, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/logical_ops_cpu.c b/src/tim/vx/internal/src/kernel/cpu/logical_ops_cpu.c deleted file mode 100644 index 07deb44..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/logical_ops_cpu.c +++ /dev/null @@ -1,264 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (2) -#define _OUTPUT_NUM (1) -#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.logical_ops") - -/* - * Kernel params - */ -static vx_param_description_t _logical_ops_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _LOGICAL_OPS_PARAM_NUM _cnt_of_array( _logical_ops_kernel_param_def ) - - -static vsi_ssize_t _expand_offset - ( - vsi_ssize_t index, - vsi_size_t * shape, vsi_size_t rank, - vsi_size_t * strides, vsi_size_t * out_shape - ) -{ - vsi_size_t i; - vsi_ssize_t offset = 0; - - for( i = 0; i < rank && index; i ++ ) - { - if( shape[i] == out_shape[i] ) - { - offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); - } - index /= out_shape[i]; - } - return offset; -} - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - uint32_t ops_type_int = 0; - vsi_nn_logical_ops_type_t ops_type = VSI_NN_LOGICAL_OR; - /* prepare data */ - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - vsi_nn_kernel_scalar_read_uint32((vsi_nn_kernel_scalar_t)param[_CPU_IO_NUM], &(ops_type_int)); - ops_type = (vsi_nn_logical_ops_type_t)ops_type_int; - if (!(VSI_NN_LOGICAL_OR == ops_type || VSI_NN_LOGICAL_AND == ops_type || VSI_NN_LOGICAL_XOR == ops_type)) - { - status = VSI_FAILURE; - goto final; - } - - for (i = 0; i < out_elements[0]; i++) - { - vsi_ssize_t in0_offset = 0; - vsi_ssize_t in1_offset = 0; - vsi_ssize_t in0 = 0; - vsi_ssize_t in1 = 0; - - - in0_offset = _expand_offset( i, in_attr[0]->shape->data, (vsi_size_t)in_attr[0]->shape->size, - in_stride_size[0], out_attr[0]->shape->data ); - in1_offset = _expand_offset( i, in_attr[1]->shape->data, (vsi_size_t)in_attr[1]->shape->size, - in_stride_size[1], out_attr[0]->shape->data ); - 
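The _expand_offset() helper above is how these binary CPU kernels implement NumPy-style broadcasting: the flat output index is peeled apart digit by digit (index % out_shape[i], then index /= out_shape[i]), and a dimension contributes stride * coordinate only when the input extent matches the output extent, so a broadcast dimension of extent 1 pins the coordinate to 0 and reuses the same input element. A self-contained demo of the same logic (plain-C sketch, not tree code):

```c
#include <stdio.h>

/* Same logic as _expand_offset above, inlined for a standalone demo. */
static long expand_offset( long index, const long * shape, long rank,
    const long * strides, const long * out_shape )
{
    long i, offset = 0;
    for ( i = 0; i < rank && index; i++ )
    {
        if ( shape[i] == out_shape[i] )
        {
            offset += strides[i] * ( index % out_shape[i] );
        }
        index /= out_shape[i];
    }
    return offset;
}

int main( void )
{
    /* Input shape {3,1} broadcast against output shape {3,2}. */
    long shape[2] = { 3, 1 }, strides[2] = { 1, 3 }, out_shape[2] = { 3, 2 };
    long i;
    for ( i = 0; i < 6; i++ )
    {
        printf( "%ld -> %ld\n", i,
            expand_offset( i, shape, 2, strides, out_shape ) );
    }
    return 0;
}
```

With input shape {3,1} against output shape {3,2}, flat output indices 0..5 resolve to input offsets 0,1,2,0,1,2, i.e. each input element is read once per broadcast copy.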
in0 = (!!(f32_in_buffer[0][in0_offset])); - in1 = (!!(f32_in_buffer[1][in1_offset])); - if (VSI_NN_LOGICAL_OR == ops_type) - { - f32_out_buffer[0][i] = (float)(in0 || in1); - } - else if (VSI_NN_LOGICAL_AND == ops_type) - { - f32_out_buffer[0][i] = (float)(in0 && in1); - } - else if (VSI_NN_LOGICAL_XOR == ops_type) - { - f32_out_buffer[0][i] = (float)(in0 ^ in1); - } - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _logical_ops_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _logical_ops_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_LOGICAL_OPS_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - uint32_t ops_type = vsi_nn_kernel_param_get_int32( params, "ops_type" ); - - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _LOGICAL_OPS_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[_CPU_IO_NUM] = vsi_nn_kernel_scalar_create( graph, U32, &ops_type ); - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _LOGICAL_OPS_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[_CPU_IO_NUM] ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( logical_ops, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/lppool_cpu.c b/src/tim/vx/internal/src/kernel/cpu/lppool_cpu.c deleted file mode 100644 index 0f66636..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/lppool_cpu.c +++ /dev/null @@ -1,264 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.lppool") - - -/* - * Kernel params - */ -static vx_param_description_t _lppool_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - // Add kernel parameters here -}; -#define _LPPOOL_PARAM_NUM _cnt_of_array( _lppool_kernel_param_def ) - - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_lppool_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float * buffer[_INPUT_NUM + _OUTPUT_NUM] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_INPUT_NUM + _OUTPUT_NUM] = { NULL }; - int32_t ksize_x = 0, ksize_y = 0, stride_x = 0, stride_y = 0; - int32_t pad_left = 0, pad_right = 0, pad_top = 0, pad_bottom = 0; - int32_t p = 0; - int32_t i = 0; - input[0] = (vsi_nn_kernel_tensor_t)param[0]; - output[0] = (vsi_nn_kernel_tensor_t)param[1]; - attr[0] = vsi_nn_kernel_tensor_attr_create( input[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( output[0] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &ksize_x); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &ksize_y); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &pad_left); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &pad_right); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &pad_top); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &pad_bottom); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &stride_x); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &stride_y); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &p); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( input[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); - memset( buffer[1], 0, out_elements * sizeof(float) ); - - { - int32_t batch = (int32_t)attr[1]->shape->data[2]; - int32_t height_o = (int32_t)attr[1]->shape->data[1]; - int32_t width_o = (int32_t)attr[1]->shape->data[0]; - int32_t height = (int32_t)attr[0]->shape->data[1]; - int32_t width = (int32_t)attr[0]->shape->data[0]; - int32_t b = 0, j = 0; - int32_t output_base = 0; - 
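The window loops that follow implement Lp pooling: each output element is the p-norm of its input window, out = (sum over the window of |x|^p)^(1/p), with the window clipped against the tensor borders after the padding offsets are applied; p = 1 degenerates to a sum of magnitudes and p = 2 to an L2 (energy) pool. The per-window math, restated as a self-contained sketch:

```c
#include <math.h>

/* Sketch: the per-window math of the lppool executor below,
 * out = (sum over window of |x|^p)^(1/p). */
static float lp_pool_window( const float * x, int n, int p )
{
    float sum_of_pow = 0.0f;
    int i;
    for ( i = 0; i < n; i++ )
    {
        sum_of_pow += (float)pow( fabs( x[i] ), p );
    }
    return (float)pow( sum_of_pow, 1.0f / (float)p );
}
```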
int32_t input_base = 0; - float data = 0; - for (b = 0; b < batch; b++) - { - output_base = b * height_o * width_o; - input_base = b * height * width; - for (j = 0; j < height_o; j++) - { - for (i = 0; i < width_o; i++) - { - int32_t hstart = j * stride_y - pad_top; - int32_t wstart = i * stride_x - pad_left; - int32_t hend = vsi_nn_min(hstart + ksize_y, height); - int32_t wend = vsi_nn_min(wstart + ksize_x, width); - int32_t pool_index = output_base + j * width_o + i; - int32_t h = 0, w = 0; - float sum_of_pow = 0; - float out_data = 0; - hstart = vsi_nn_max(hstart, 0); - wstart = vsi_nn_max(wstart, 0); - - for (h = hstart; h < hend; ++ h) - { - for (w = wstart; w < wend; ++ w) - { - int32_t index = input_base + h * width + w; - data = buffer[0][index]; - sum_of_pow += (float)pow(fabs(data),p); - } - } - out_data = (float)pow(sum_of_pow, 1.0f / p); - buffer[1][pool_index] = out_data; - } - } - } - - } - status = vsi_nn_kernel_tensor_write_from_float( output[0], attr[1], - buffer[1], out_elements ); -final: - for ( i = 0; i < _INPUT_NUM + _OUTPUT_NUM; i ++ ) - { - vsi_nn_safe_free( buffer[i] ); - if (attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &attr[i] ); - } - } - - return status; -} /* _lppool_exec() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _lppool_exec; - kernel->info.parameters = _lppool_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _lppool_kernel_param_def ); - status = VSI_SUCCESS; - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_LPPOOL_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - - int32_t ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x"); - int32_t ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y"); - int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x"); - int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y"); - int32_t pad_left = vsi_nn_kernel_param_get_int32(params, "pad_left"); - int32_t pad_right = vsi_nn_kernel_param_get_int32(params, "pad_right"); - int32_t pad_top = vsi_nn_kernel_param_get_int32(params, "pad_top"); - int32_t pad_bottom = vsi_nn_kernel_param_get_int32(params, "pad_bottom"); - int32_t p = vsi_nn_kernel_param_get_int32(params, "p"); - - status = _query_kernel( kernel, inputs, outputs ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - int32_t index = 2; - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _LPPOOL_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_left ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_right ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_top ); - node_params[index++] = 
vsi_nn_kernel_scalar_create( graph, I32, &pad_bottom ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &p ); - - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _LPPOOL_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[2] ); - vsi_nn_kernel_scalar_release( &node_params[3] ); - vsi_nn_kernel_scalar_release( &node_params[4] ); - vsi_nn_kernel_scalar_release( &node_params[5] ); - vsi_nn_kernel_scalar_release( &node_params[6] ); - vsi_nn_kernel_scalar_release( &node_params[7] ); - vsi_nn_kernel_scalar_release( &node_params[8] ); - vsi_nn_kernel_scalar_release( &node_params[9] ); - vsi_nn_kernel_scalar_release( &node_params[10] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( lppool, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cpu/lstmunit_activation_cpu.c b/src/tim/vx/internal/src/kernel/cpu/lstmunit_activation_cpu.c deleted file mode 100644 index ade68ef..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/lstmunit_activation_cpu.c +++ /dev/null @@ -1,400 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (LSTMUNIT_ACT_INPUTS_COUNT) -#define _OUTPUT_NUM (LSTMUNIT_ACT_OUTUTS_COUNT) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.lstmunit_activation") - - -/* - * Kernel params - */ -static vx_param_description_t _lstmunit_activation_kernel_param_def[] = -{ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*0 input_fc_i */ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, /*1 input_fc_f */ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, /*2 input_fc_c */ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, /*3 input_fc_o */ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, /*4 cs_in */ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*5 hstate_fc_i */ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*6 hstate_fc_f */ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*7 hstate_fc_c */ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*8 hstate_fc_o */ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*9 biases_i*/ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*10 biases_f*/ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*11 biases_c*/ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*12 biases_o*/ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*13 ln_w_i*/ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*14 ln_w_f*/ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*15 ln_w_c*/ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*16 ln_w_o*/ - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, /*17 output*/ - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, /*18 cs_out*/ - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, /*19 hs_out*/ - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, /*20 _is_ln*/ - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, /*21 _is_cifg*/ - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, /*22 _is_proj*/ - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, /*23 _is_hybrid*/ - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, /*24 recurrent_activation*/ - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, /*25 forget_bias*/ -}; -#define _LSTMUNIT_ACTIVATION_PARAM_NUM _cnt_of_array( _lstmunit_activation_kernel_param_def ) - -#define SCALAR_IS_LN (20) -#define SCALAR_IS_CIFG (21) -#define SCALAR_IS_PROG (22) -#define SCALAR_IS_HYBRID (23) -#define SCALAR_ACTIVATION (24) -#define SCALAR_FORGET_BIAS (25) - -static float activationFunctor(float a, vsi_nn_activation_e act_) -{ - switch (act_) - { - case VSI_NN_ACT_NONE: - return a; - case VSI_NN_ACT_RELU: - return a < 0.f ? 0.f : a; - case VSI_NN_ACT_RELU6: - return vsi_nn_max(0.f, vsi_nn_min(a, 6.f)); - case VSI_NN_ACT_TANH: - return (float)tanh(a); - case VSI_NN_ACT_SIGMOID: - return (float)(1.0f / (1.0f + exp(-a))); - case VSI_NN_ACT_HARD_SIGMOID: - a = a * 0.2f + 0.5f; - return vsi_nn_max(0.f, vsi_nn_min(a, 1.f)); - default: - // TODO(aselle): More informative fatal error! 
- exit(1); - } -} - -#define gcoMATH_Exp(X) (float)(expf((X))) -#define gcoMATH_TangentH(X) (float)(tanhf((X))) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - vsi_size_t i, b; - int32_t _is_ln = 0; - int32_t _is_cifg = 0; - int32_t _is_proj = 0; - int32_t _is_hybrid = 0; - int32_t recurrent_activation; - vsi_nn_activation_e activation_mode; - vsi_size_t n_batch = 0; - vsi_size_t n_cell = 0; - float forget_bias; - /* prepare data */ - for( i = 0; i < _INPUT_NUM; i++ ) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - if (input[i]) - { - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - } - - for( i = 0; i < _OUTPUT_NUM; i++ ) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - if (output[i]) - { - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - } - - status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[SCALAR_IS_LN], &_is_ln ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[SCALAR_IS_CIFG], &_is_cifg ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[SCALAR_IS_PROG], &_is_proj ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[SCALAR_IS_HYBRID], &_is_hybrid ); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ACTIVATION], &recurrent_activation ); - CHECK_STATUS_FAIL_GOTO(status, final ); - activation_mode = (vsi_nn_activation_e)recurrent_activation; - status = vsi_nn_kernel_scalar_read_float32( (vsi_nn_kernel_scalar_t)param[SCALAR_FORGET_BIAS], &forget_bias ); - CHECK_STATUS_FAIL_GOTO(status, final ); - - n_cell = in_attr[LSTMUNIT_ACT_CSTATE_IN]->shape->data[0]; - n_batch = in_attr[LSTMUNIT_ACT_CSTATE_IN]->shape->data[1]; - - for (b = 0; b < n_batch; b ++) - { - for (i = 0; i < n_cell; i++) - { - vsi_size_t index = i + n_cell * b; - float data_i_t = 0; - float data_f_t = 0; - float data_g_t = 0; - float data_o_t = 0; - float data_c_t = 0; - float data_h_t = 0; - - data_i_t = _is_cifg ? 
0 : f32_in_buffer[LSTMUNIT_ACT_INPUT_FC_I][index]; - data_f_t = f32_in_buffer[LSTMUNIT_ACT_INPUT_FC_F][index]; - data_g_t = f32_in_buffer[LSTMUNIT_ACT_INPUT_FC_C][index]; - data_o_t = f32_in_buffer[LSTMUNIT_ACT_INPUT_FC_O][index]; - data_c_t = f32_in_buffer[LSTMUNIT_ACT_CSTATE_IN][index]; - - if (!_is_ln) - { - data_i_t += _is_cifg ? 0 : f32_in_buffer[LSTMUNIT_ACT_HSTATE_FC_I][index]; - data_f_t += f32_in_buffer[LSTMUNIT_ACT_HSTATE_FC_F][index]; - data_g_t += f32_in_buffer[LSTMUNIT_ACT_HSTATE_FC_C][index]; - data_o_t += f32_in_buffer[LSTMUNIT_ACT_HSTATE_FC_O][index]; - } - - if (!_is_cifg) - { - if (_is_ln) - { - data_i_t *= f32_in_buffer[LSTMUNIT_ACT_LN_WI][i]; - data_i_t += f32_in_buffer[LSTMUNIT_ACT_DATA_BI][i]; - } - else if (_is_hybrid) - { - data_i_t += f32_in_buffer[LSTMUNIT_ACT_DATA_BI][i]; - } - } - - if (_is_ln) - { - data_f_t *= f32_in_buffer[LSTMUNIT_ACT_LN_WF][i]; - data_f_t += f32_in_buffer[LSTMUNIT_ACT_DATA_BF][i]; - data_g_t *= f32_in_buffer[LSTMUNIT_ACT_LN_WC][i]; - data_g_t += f32_in_buffer[LSTMUNIT_ACT_DATA_BC][i]; - data_o_t *= f32_in_buffer[LSTMUNIT_ACT_LN_WO][i]; - data_o_t += f32_in_buffer[LSTMUNIT_ACT_DATA_BO][i]; - } - else if (_is_hybrid) - { - data_f_t += f32_in_buffer[LSTMUNIT_ACT_DATA_BF][i]; - data_g_t += f32_in_buffer[LSTMUNIT_ACT_DATA_BC][i]; - data_o_t += f32_in_buffer[LSTMUNIT_ACT_DATA_BO][i]; - } - - data_f_t += forget_bias; - data_f_t = activationFunctor(data_f_t, activation_mode); - - if (_is_cifg) - data_i_t = 1 - data_f_t; - else - data_i_t = activationFunctor(data_i_t, activation_mode); - data_g_t = gcoMATH_TangentH(data_g_t); - data_o_t = activationFunctor(data_o_t, activation_mode); - data_c_t = data_f_t * data_c_t + data_i_t * data_g_t; - data_h_t = data_o_t * gcoMATH_TangentH(data_c_t); - - f32_out_buffer[LSTMUNIT_ACT_CSTATE_OUT][index] = data_c_t; - f32_out_buffer[LSTMUNIT_ACT_OUTPUT][index] = data_h_t; - - if (!_is_proj) - { - f32_out_buffer[LSTMUNIT_ACT_HSTATE_OUT][index] = data_h_t; - } - } - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (output[i]) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _lstmunit_activation_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _lstmunit_activation_kernel_param_def ); - status = VSI_SUCCESS; - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t 
node_params[_LSTMUNIT_ACTIVATION_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t _is_ln= 0; - int32_t _is_cifg= 0; - int32_t _is_proj= 0; - int32_t _is_hybrid= 0; - int32_t recurrent_activation; - float forget_bias; - - _is_ln = vsi_nn_kernel_param_get_int32( params, "_is_ln" ); - _is_cifg = vsi_nn_kernel_param_get_int32( params, "_is_cifg" ); - _is_proj = vsi_nn_kernel_param_get_int32( params, "_is_proj" ); - _is_hybrid = vsi_nn_kernel_param_get_int32( params, "_is_hybrid" ); - recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); - forget_bias = vsi_nn_kernel_param_get_float32(params, "forget_bias"); - - status = _query_kernel( kernel, inputs, outputs ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _LSTMUNIT_ACTIVATION_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_IS_LN] = vsi_nn_kernel_scalar_create( - graph, I32, &_is_ln ); - node_params[SCALAR_IS_CIFG] = vsi_nn_kernel_scalar_create( - graph, I32, &_is_cifg ); - node_params[SCALAR_IS_PROG] = vsi_nn_kernel_scalar_create( - graph, I32, &_is_proj ); - node_params[SCALAR_IS_HYBRID] = vsi_nn_kernel_scalar_create( - graph, I32, &_is_hybrid ); - node_params[SCALAR_ACTIVATION] = vsi_nn_kernel_scalar_create( - graph, I32, &recurrent_activation ); - node_params[SCALAR_FORGET_BIAS] = vsi_nn_kernel_scalar_create( - graph, F32, &forget_bias ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _LSTMUNIT_ACTIVATION_PARAM_NUM ); - - vsi_nn_kernel_scalar_release( &node_params[SCALAR_IS_LN] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_IS_CIFG] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_IS_PROG] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_IS_HYBRID] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_ACTIVATION] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_FORGET_BIAS] ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( lstmunit_activation, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c b/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c deleted file mode 100644 index 846df68..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c +++ /dev/null @@ -1,252 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2019 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
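Summarizing the lstmunit_activation kernel deleted above: once the optional layer-norm weights and hybrid biases are folded into the four gate pre-activations, each cell applies the standard LSTM update i = sig(i_raw), f = sig(f_raw + forget_bias), g = tanh(g_raw), o = sig(o_raw), c_new = f*c_prev + i*g, h = o*tanh(c_new), with CIFG mode coupling the gates as i = 1 - f. A standalone sketch of the non-CIFG step, assuming sigmoid as the recurrent activation (the kernel's activationFunctor() can select others):

```c
#include <math.h>

/* Sketch of the per-cell gate math in _compute() above (non-CIFG case),
 * with sigmoid assumed as the recurrent activation. */
static void lstm_cell_step( float i_raw, float f_raw, float g_raw,
    float o_raw, float c_prev, float forget_bias,
    float * c_out, float * h_out )
{
    float f_t = 1.0f / ( 1.0f + expf( -( f_raw + forget_bias ) ) );
    float i_t = 1.0f / ( 1.0f + expf( -i_raw ) );
    float g_t = tanhf( g_raw );
    float o_t = 1.0f / ( 1.0f + expf( -o_raw ) );
    *c_out = f_t * c_prev + i_t * g_t;   /* new cell state   */
    *h_out = o_t * tanhf( *c_out );      /* new hidden state */
}
```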
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "vsi_nn_error.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. - */ -#define _CPU_ARG_NUM (2) -#define _CPU_INPUT_NUM (2) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.matrixmul") - -DEF_KERNEL_EXECUTOR(_matrixmul_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[3] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - vsi_size_t i = 0; - vsi_size_t M = 0, K = 0, N = 0; - int32_t transposeA = 0, transposeB = 0; - size_t strides0[2] = {0, 0}, strides1[2] = {0, 0}; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &transposeA); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &transposeB); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); - - buffer[2] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); - memset( buffer[2], 0, out_elements * sizeof(float) ); - - K = attr[0]->shape->data[0]; - M = attr[2]->shape->data[1]; - N = attr[2]->shape->data[0]; - - if(transposeA) - { - K = attr[0]->shape->data[1]; - } - - strides0[0] = transposeA? 1:K; - strides0[1] = transposeA? M:1; - - strides1[0] = transposeB? 1:N; - strides1[1] = transposeB? K:1; - - { - vsi_size_t batch = attr[2]->shape->size > 3 ? attr[2]->shape->data[3] : 1; - vsi_size_t depth = attr[2]->shape->size > 2 ? attr[2]->shape->data[2] : 1; - vsi_size_t a_depth = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; - vsi_size_t b_depth = attr[1]->shape->size > 2 ?
attr[1]->shape->data[2] : 1; - vsi_size_t b = 0, c = 0, j = 0, y = 0; - vsi_size_t offsetA = 0, offsetB = 0, offsetD = 0; - vsi_size_t ac2zero = 1; - vsi_size_t bc2zero = 1; - - if((attr[0]->shape->size > attr[1]->shape->size) || - (attr[0]->shape->data[2] > attr[1]->shape->data[2] - && attr[0]->shape->size > 2 && attr[1]->shape->size > 2)) - { - bc2zero = 0; - } - else if((attr[1]->shape->size > attr[0]->shape->size) || - (attr[1]->shape->data[2] > attr[0]->shape->data[2] - && attr[0]->shape->size > 2 && attr[1]->shape->size > 2)) - { - ac2zero = 0; - } - - for(b = 0; b < batch; b++) - { - for(c = 0; c < depth; c++) - { - offsetA = c * M * K * ac2zero + b * M * K * a_depth; - offsetB = c * N * K * bc2zero + b * N * K * b_depth; - offsetD = c * M * N + b * M * N * depth; - for(i = 0 ; i < M; i++) - { - for(j = 0; j < N; j++) - { - float sum = 0; - for(y = 0; y < K; y++) - { - float dataA = buffer[0][i * strides0[0] + y * strides0[1] + offsetA]; - float dataB = buffer[1][y * strides1[0] + j * strides1[1] + offsetB]; - - sum += dataA * dataB; - } - buffer[2][j + i * N + offsetD] = sum; - } - } - } - } - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], - buffer[2], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < 3; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - } - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - return status; -} /* _matrixmul_exec() */ -/* - * Kernel params - */ -static vx_param_description_t _matrixmul_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - // Add kernel parameters here -}; -#define _MATRIXMUL_PARAM_NUM _cnt_of_array( _matrixmul_kernel_param_def ) - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _matrixmul_exec; - kernel->info.parameters = _matrixmul_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _matrixmul_kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" ); - int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" ); - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - uint32_t index = 3; - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &transposeA ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &transposeB ); - /* Pass parameters to node.
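Editorial note, a sketch not present in the original source: the stride swap above lets one inner loop serve both layouts. Row-major A of shape M x K stores A[i][y] at i*K + y, hence strides0 = {K, 1}; with transposeA set, the buffer holds A^T (K x M) and the same read i*strides0[0] + y*strides0[1] becomes i + y*M. Worked example with M = 2, K = 3, transposeA = 1: element (i = 1, y = 2) is fetched from offset 1 + 2*2 = 5, which in the 3 x 2 buffer is row 2, column 1, i.e. exactly A[1][2]. The same reasoning yields strides1 for B against N.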
*/ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - CHECK_STATUS( status ); - vsi_nn_kernel_scalar_release( &backend_params[3] ); - vsi_nn_kernel_scalar_release( &backend_params[4] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( matrixmul, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c b/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c deleted file mode 100644 index 183fedc..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c +++ /dev/null @@ -1,213 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "kernel/vsi_nn_kernel.h" -#include "kernel/vsi_nn_kernel_eltwise.h" - -__BEGIN_DECLS - -#define _CPU_ARG_NUM (0) -#define _CPU_INPUT_NUM (2) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("maximum_sw") - -static vsi_ssize_t _expand_offset - ( - vsi_ssize_t index, - vsi_size_t * shape, vsi_size_t rank, - vsi_size_t * strides, vsi_size_t * out_shape - ) -{ - vsi_size_t i; - vsi_ssize_t offset = 0; - - for ( i = 0; i < rank && index; i ++ ) - { - if (strides[0] == 0) - { - if (i == 0) - { - offset += (index % out_shape[0]); - } - else - { - offset += (vsi_ssize_t)strides[i] * 2 * ( index % out_shape[i] ); - } - } - else if ( shape[i] == out_shape[i] ) - { - offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); - } - index /= out_shape[i]; - } - - return offset; -} - -DEF_KERNEL_EXECUTOR(_maximum_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VX_SUCCESS; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - vsi_size_t out_elements = 0; - vsi_size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - - vsi_nn_kernel_tensor_attr_get_stride( attr[0], stride_size[0] ); - vsi_nn_kernel_tensor_attr_get_stride( attr[1], stride_size[1] ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); - - buffer[2] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); - memset( buffer[2], 0, out_elements * sizeof(float) ); - - for (i = 0; i < out_elements; i++) - { - vsi_ssize_t in0_offset = 0; - vsi_ssize_t in1_offset = 0; - float val1 = 0.f; - float val2 = 0.f; - - in0_offset = _expand_offset( i, attr[0]->shape->data, (vsi_size_t)attr[0]->shape->size, - stride_size[0], attr[2]->shape->data ); - in1_offset = _expand_offset( i, attr[1]->shape->data, (vsi_size_t)attr[1]->shape->size, - stride_size[1], attr[2]->shape->data ); - - val1 = buffer[0][in0_offset]; - val2 = buffer[1][in1_offset]; - - buffer[2][i] = vsi_nn_max( val1, val2 ); - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], - buffer[2], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free(
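/* Editorial sketch, not part of the original file: how _expand_offset (defined
   above) maps a flat output index back into a broadcast input. A dimension
   whose input extent matches the output contributes strides[i] * (index %
   out_shape[i]); a broadcast dimension (extent 1) contributes nothing, so all
   output positions along it share one input element. Example, input shape
   {1, 3} against output shape {4, 3}, flat index 7: dim 0 is broadcast (1 != 4,
   no contribution), the index becomes 7 / 4 = 1; dim 1 matches (3 == 3), adding
   strides[1] * (1 % 3), so the read lands on input element (0, 1). The
   strides[0] == 0 branch is a special case for inputs whose innermost stride
   was zeroed by the caller. */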
buffer[i] ); - } - vsi_nn_kernel_tensor_attr_release( &attr[i] ); - } - return status; -} /* _maximum_exec() */ - -static vx_param_description_t kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} -}; - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _maximum_exec; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( maximum, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/maxpoolwithargmax_cpu.c b/src/tim/vx/internal/src/kernel/cpu/maxpoolwithargmax_cpu.c deleted file mode 100644 index 900451a..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/maxpoolwithargmax_cpu.c +++ /dev/null @@ -1,284 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. - */ -#define _CPU_ARG_NUM (8) -#define _CPU_INPUT_NUM (1) -#define _CPU_OUTPUT_NUM (2) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.maxpoolwithargmax") - -#define FP32_MIN -3.4e38 - -/* - * Kernel params - */ -static vx_param_description_t _maxpoolwithargmax_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} - // Add kernel parameters here -}; -#define _MAXPOOLWITHARGMAX_PARAM_NUM _cnt_of_array( _maxpoolwithargmax_kernel_param_def ) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_maxpoolwithargmax_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - int32_t ksize_x = 0, ksize_y = 0, stride_x = 0, stride_y = 0; - int32_t pad_left = 0, pad_right = 0, pad_top = 0, pad_bottom = 0; - int32_t i = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &ksize_x); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &ksize_y); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &stride_x); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &stride_y); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &pad_left); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &pad_right); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &pad_top); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &pad_bottom); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] =
(float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); - memset( buffer[1], 0, out_elements * sizeof(float) ); - - buffer[2] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); - memset( buffer[2], 0, out_elements * sizeof(float) ); - - { - int32_t dims_num = (int32_t)attr[1]->shape->size; - int32_t batch = dims_num > 3 ? (int32_t)attr[1]->shape->data[3] : 1; - int32_t depth = dims_num > 2 ? (int32_t)attr[1]->shape->data[2] : 1; - int32_t height_o = (int32_t)attr[1]->shape->data[1]; - int32_t width_o = (int32_t)attr[1]->shape->data[0]; - int32_t width = (int32_t)attr[0]->shape->data[0]; - int32_t height = (int32_t)attr[0]->shape->data[1]; - int32_t b = 0, d = 0, j = 0; - int32_t output_base = 0; - int32_t input_base = 0; - - for (b = 0; b < batch; b++) - { - for (d = 0; d < depth; d++) - { - output_base = b * depth * height_o * width_o + d * height_o * width_o; - input_base = b * depth * height * width + d * height * width; - for (j = 0; j < height_o; j++) - { - for (i = 0; i < width_o; i++) - { - int32_t hstart = j * stride_y - pad_top; - int32_t wstart = i * stride_x - pad_left; - int32_t hend = vsi_nn_min(hstart + ksize_y, height); - int32_t wend = vsi_nn_min(wstart + ksize_x, width); - int32_t pool_index = output_base + j * width_o + i; - int32_t h = 0, w = 0; - int32_t index_max = 0; - float value_max = (float)FP32_MIN; - - hstart = vsi_nn_max(hstart, 0); - wstart = vsi_nn_max(wstart, 0); - - for (h = hstart; h < hend; ++ h) - { - for (w = wstart; w < wend; ++ w) - { - int32_t index = input_base + h * width + w; - float data = buffer[0][index]; - - if (data > value_max) - { - value_max = data; - index_max = index; - } - } - } - buffer[1][pool_index] = value_max; - buffer[2][pool_index] = (float)index_max; - } - } - } - } - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - buffer[1], out_elements ); - status |= vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], - buffer[2], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for ( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if ( buffer[i] ) - { - free( buffer[i] ); - } - } - for ( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - return status; -} /* _maxpoolwithargmax_exec() */ - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _maxpoolwithargmax_exec; - kernel->info.parameters = _maxpoolwithargmax_kernel_param_def; - kernel->info.numParams = _MAXPOOLWITHARGMAX_PARAM_NUM; - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_MAXPOOLWITHARGMAX_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - - int32_t ksize_x = 
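/* Editorial sketch, not part of the original file: the window arithmetic used
   by the pooling loop above. For output row j with stride s, kernel k and top
   padding p, the input window is [j*s - p, j*s - p + k), then clamped:
   hstart = max(j*s - p, 0), hend = min(j*s - p + k, height). Example with
   height = 5, k = 3, s = 2, p = 1: output row 0 covers input rows [0, 2),
   row 1 covers [1, 4), row 2 covers [3, 5). The argmax plane stores the flat
   input index of the winner, which is why index_max is built from
   input_base + h * width + w rather than a window-relative position. */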
vsi_nn_kernel_param_get_int32(params, "ksize_x"); - int32_t ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y"); - int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x"); - int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y"); - int32_t pad_left = vsi_nn_kernel_param_get_int32(params, "pad_left"); - int32_t pad_right = vsi_nn_kernel_param_get_int32(params, "pad_right"); - int32_t pad_top = vsi_nn_kernel_param_get_int32(params, "pad_top"); - int32_t pad_bottom = vsi_nn_kernel_param_get_int32(params, "pad_bottom"); - - status = _query_kernel( kernel, inputs, outputs ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - int32_t index = 3; - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _MAXPOOLWITHARGMAX_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_left ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_right ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_top ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_bottom ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _MAXPOOLWITHARGMAX_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[3] ); - vsi_nn_kernel_scalar_release( &node_params[4] ); - vsi_nn_kernel_scalar_release( &node_params[5] ); - vsi_nn_kernel_scalar_release( &node_params[6] ); - vsi_nn_kernel_scalar_release( &node_params[7] ); - vsi_nn_kernel_scalar_release( &node_params[8] ); - vsi_nn_kernel_scalar_release( &node_params[9] ); - vsi_nn_kernel_scalar_release( &node_params[10] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( maxpoolwithargmax, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c b/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c deleted file mode 100644 index 7cb6630..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c +++ /dev/null @@ -1,209 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -#define _CPU_ARG_NUM (0) -#define _CPU_INPUT_NUM (2) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("minimum_sw") - -static vsi_ssize_t _expand_offset - ( - vsi_ssize_t index, - vsi_size_t * shape, vsi_size_t rank, - vsi_size_t * strides, vsi_size_t * out_shape - ) -{ - vsi_size_t i; - vsi_ssize_t offset = 0; - - for ( i = 0; i < rank && index; i ++ ) - { - if (strides[0] == 0) - { - if (i == 0) - { - offset += (index % out_shape[0]); - } - else - { - offset += (vsi_ssize_t)strides[i] * 2 * ( index % out_shape[i] ); - } - } - else if ( shape[i] == out_shape[i] ) - { - offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); - } - index /= out_shape[i]; - } - - return offset; -} - -DEF_KERNEL_EXECUTOR(_minimum_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VX_SUCCESS; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - vsi_size_t out_elements = 0; - vsi_size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - - vsi_nn_kernel_tensor_attr_get_stride( attr[0], stride_size[0] ); - vsi_nn_kernel_tensor_attr_get_stride( attr[1], stride_size[1] ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); - - buffer[2] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); - memset( buffer[2], 0, out_elements * sizeof(float) ); - - for( i = 0; i < out_elements; i ++ ) - { - vsi_ssize_t in0_offset = 0; - vsi_ssize_t in1_offset = 0; - float val1 = 0.f; - float val2 = 0.f; - - in0_offset = _expand_offset( i, attr[0]->shape->data, (vsi_size_t)attr[0]->shape->size, - stride_size[0], attr[2]->shape->data ); - in1_offset = _expand_offset( i, attr[1]->shape->data, (vsi_size_t)attr[1]->shape->size, - stride_size[1], attr[2]->shape->data ); - - val1 = buffer[0][in0_offset]; - val2 = buffer[1][in1_offset]; - - buffer[2][i] = vsi_nn_min( val1, val2 ); - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], - buffer[2], out_elements
); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - vsi_nn_kernel_tensor_attr_release( &attr[i] ); - } - return status; -} /* _minimum_exec() */ - -static vx_param_description_t kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} -}; - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _minimum_exec; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( minimum, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/mod_cpu.c b/src/tim/vx/internal/src/kernel/cpu/mod_cpu.c deleted file mode 100644 index b391edd..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/mod_cpu.c +++ /dev/null @@ -1,247 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. - */ -#define _INPUT_NUM (2) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.mod") - -/* - * Kernel params - */ -static vx_param_description_t _mod_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - // Add kernel parameters here -}; -#define _MOD_PARAM_NUM _cnt_of_array( _mod_kernel_param_def ) - -static vsi_ssize_t _expand_offset - ( - vsi_ssize_t index, - vsi_size_t * shape, vsi_size_t rank, - vsi_size_t * strides, vsi_size_t * out_shape - ) -{ - vsi_size_t i; - vsi_ssize_t offset = 0; - - for( i = 0; i < rank && index; i ++ ) - { - if( shape[i] == out_shape[i] ) - { - offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); - } - index /= out_shape[i]; - } - return offset; -} - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - int32_t isfmod = 0; - vsi_nn_kernel_dtype_e input0_dtype = F16; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float* f32_in_buffer[_INPUT_NUM] = {NULL}; - float* f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t* in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t* out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - - /* prepare data */ - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &isfmod); - for (i = 0; i < _INPUT_NUM; i++) { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create(input[i]); - vsi_nn_kernel_tensor_attr_get_stride(in_attr[i], in_stride_size[i]); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer(input[i], in_attr[i], TRUE); - CHECK_PTR_FAIL_GOTO(f32_in_buffer[i], "Create input0 buffer fail.", final); - } - - input0_dtype = in_attr[0]->dtype; - if (input0_dtype == F16 || input0_dtype == F32 || input0_dtype == BF16) { - isfmod = 1; - } - - for (i = 0; i < _OUTPUT_NUM; i++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create(output[i]); - vsi_nn_kernel_tensor_attr_get_stride(out_attr[i], out_stride_size[i]); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size(out_attr[i]); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float*)malloc(out_bytes[i]); - CHECK_PTR_FAIL_GOTO(f32_out_buffer[i], "Create output buffer fail.", final); - memset(f32_out_buffer[i], 0, out_bytes[i]); - } - - for (i = 0; i < out_elements[0]; i++) - { - vsi_ssize_t in0_offset = 0; - vsi_ssize_t in1_offset = 0; - float in0 = 0; - float in1 = 0; - - in0_offset = _expand_offset( i, in_attr[0]->shape->data,
(vsi_size_t)in_attr[0]->shape->size, - in_stride_size[0], out_attr[0]->shape->data ); - in1_offset = _expand_offset( i, in_attr[1]->shape->data, (vsi_size_t)in_attr[1]->shape->size, - in_stride_size[1], out_attr[0]->shape->data ); - in0 = f32_in_buffer[0][in0_offset]; - in1 = f32_in_buffer[1][in1_offset]; - if (isfmod) - { - f32_out_buffer[0][i] = (float)fmod(in0,in1); - } - else - { - f32_out_buffer[0][i] = in0 - in1 * (float)floor(in0 / in1); - } - } - - /* save data */ - for (i = 0; i < _OUTPUT_NUM; i++) { - status = vsi_nn_kernel_tensor_write_from_float( - output[i], out_attr[i], f32_out_buffer[i], out_elements[i]); - CHECK_STATUS_FAIL_GOTO(status, final); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) { - if (f32_in_buffer[i]) { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - - if (in_attr[i]) { - vsi_nn_kernel_tensor_attr_release(&in_attr[i]); - } - } - - for (i = 0; i < _OUTPUT_NUM; i++) { - if (f32_out_buffer[i]) { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - - if (out_attr[i]) { - vsi_nn_kernel_tensor_attr_release(&out_attr[i]); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _mod_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _mod_kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_MOD_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - int32_t isfmod = vsi_nn_kernel_param_get_int32(params, "isfmod"); - - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _MOD_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &isfmod ); - /* Pass parameters to node. 
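Editorial note, a sketch not present in the original source: the two branches above differ in sign convention. fmod keeps the sign of the dividend, while the floor form keeps the sign of the divisor. For in0 = -7, in1 = 3: fmod(-7, 3) = -1, whereas -7 - 3 * floor(-7 / 3) = -7 - 3 * (-3) = 2. Forcing isfmod = 1 for F16/F32/BF16 inputs appears intended to match ONNX Mod, where only fmod semantics are defined for floating point.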
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _MOD_PARAM_NUM ); - vsi_nn_kernel_scalar_release( &node_params[3] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( mod, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c b/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c deleted file mode 100644 index 431eee7..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c +++ /dev/null @@ -1,306 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2019 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "vsi_nn_error.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta.
- */ -#define _CPU_ARG_NUM (3) -#define _CPU_INPUT_NUM (1) -#define _CPU_OUTPUT_NUM (2) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.moments") - -DEF_KERNEL_EXECUTOR(_moments_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - vsi_size_t i = 0; - int32_t axis_first = 0; - int32_t axis_num = 0; - uint32_t mask = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis_first); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis_num); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_uint32((vsi_nn_kernel_scalar_t)param[5], &mask); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); - memset( buffer[1], 0, out_elements * sizeof(float) ); - - buffer[2] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); - memset( buffer[2], 0, out_elements * sizeof(float) ); - - if(mask == 0) - { - vsi_size_t outerSize = 1; - vsi_size_t axisSize = 1; - vsi_size_t innerSize = 1; - vsi_size_t inner = 0; - vsi_size_t outer = 0; - - for (i = 0; i < (vsi_size_t)axis_first; i++) - { - innerSize *= attr[0]->shape->data[i]; - } - - for(i = 0; i < (vsi_size_t)axis_num; i++) - { - axisSize *= attr[0]->shape->data[axis_first + i]; - } - - for (i = (vsi_size_t)axis_first + axis_num; i < attr[0]->shape->size; i++) - { - outerSize *= attr[0]->shape->data[i]; - } - - for ( outer = 0; outer < outerSize; ++outer) - { - for ( inner = 0; inner < innerSize; ++inner) - { - float sum = .0f; - float sumsq = .0f; - float mean = .0f; - float vari = .0f; - - for (i = 0; i < axisSize; ++i) - { - float value = buffer[0][(outer * axisSize + i) * innerSize + inner]; - sum += value; - sumsq += (value * value); - } - mean = sum / (axisSize); - vari = sumsq / (axisSize) - mean * mean; - buffer[1][outer * innerSize + inner] = (float)mean; - buffer[2][outer * innerSize + inner] = (float)vari; - } - } - } - else - { - vsi_size_t width = attr[0]->shape->data[0]; - vsi_size_t height = attr[0]->shape->size > 1 ? attr[0]->shape->data[1] : 1; - vsi_size_t channel = attr[0]->shape->size > 2 ? 
attr[0]->shape->data[2] : 1; - vsi_size_t batch = attr[0]->shape->size > 3 ? attr[0]->shape->data[3] : 1; - vsi_size_t width_o = attr[1]->shape->data[0]; - vsi_size_t height_o = attr[1]->shape->size > 1 ? attr[1]->shape->data[1] : 1; - vsi_size_t channel_o = attr[1]->shape->size > 2 ? attr[1]->shape->data[2] : 1; - vsi_size_t b = 0, c = 0, h = 0; - vsi_size_t wh_offset = width * height; - vsi_size_t axisSize = width * channel; - vsi_size_t vol = width_o * height_o * channel_o; - - for(b = 0; b < batch; b++) - { - for(h = 0; h < height; h++) - { - float sum = .0f; - float sumsq = .0f; - float mean = .0f; - float vari = .0f; - vsi_size_t h_offset = h * width; - for(c = 0; c < channel; c++) - { - vsi_size_t offset = h_offset + c * wh_offset; - for(i = 0; i < width; i++) - { - float value = buffer[0][i + offset]; - sum += value; - sumsq += (value * value); - } - } - mean = sum / (axisSize); - vari = sumsq / (axisSize) - mean * mean; - buffer[1][b * vol + h] = (float)mean; - buffer[2][b * vol + h] = (float)vari; - } - } - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - buffer[1], out_elements ); - status |= vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], - buffer[2], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - } - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - return status; -} /* _moments_exec() */ -/* - * Kernel params - */ -static vx_param_description_t _moments_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - // Add kernel parameters here -}; -#define _MOMENTS_PARAM_NUM _cnt_of_array( _moments_kernel_param_def ) - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _moments_exec; - kernel->info.parameters = _moments_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _moments_kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t axis_num = 0; - size_t axis_num_temp = 0; - int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", &axis_num_temp); - vsi_bool is_continue_axis = TRUE; - uint32_t mask = 0; - int32_t i = 0; - - axis_num = (int32_t)axis_num_temp; - - for ( i = 1; i < axis_num; i++) - { - if ( axis[i] != (axis[i - 1] + 1) && axis[0] == 0) - { - is_continue_axis = FALSE; - break; - } - } - - if (is_continue_axis == FALSE) - { - for(i = 0; i < axis_num; i++) - { - mask |= (1 << axis[i]); - } - } - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node =
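/* Editorial sketch, not part of the original file: both branches above compute
   moments in one pass using Var[x] = E[x^2] - E[x]^2, i.e. for N elements
   mean = sum / N and vari = sumsq / N - mean * mean. Example over {1, 2, 3, 6}:
   sum = 12, sumsq = 50, mean = 3, vari = 50/4 - 9 = 3.5, matching the two-pass
   definition ((4 + 1 + 0 + 9) / 4). The identity is exact in real arithmetic,
   though the subtraction can lose float precision when mean^2 approaches
   E[x^2]. */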
vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - int32_t axis_first = axis[0]; - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - backend_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &axis_first ); - backend_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num ); - backend_params[5] = vsi_nn_kernel_scalar_create( graph, U32, &mask ); - - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - CHECK_STATUS( status ); - vsi_nn_kernel_scalar_release( &backend_params[3] ); - vsi_nn_kernel_scalar_release( &backend_params[4] ); - vsi_nn_kernel_scalar_release( &backend_params[5] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( moments, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/nms_cpu.c b/src/tim/vx/internal/src/kernel/cpu/nms_cpu.c deleted file mode 100644 index f387d81..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/nms_cpu.c +++ /dev/null @@ -1,440 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta.
- */ -#define _INPUT_NUM (2) -#define _OUTPUT_NUM (3) -#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.nms") - - -/* - * Kernel params - */ -static vx_param_description_t _nms_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - // Add kernel parameters here -}; -#define SCALAR_INPUT_MAX_SIZE (5) -#define SCALAR_INPUT_IOU_THRES (6) -#define SCALAR_INPUT_SCORE_THRES (7) -#define SCALAR_INPUT_SOFT_NMS_SIGMA (8) -#define _NMS_PARAM_NUM _cnt_of_array( _nms_kernel_param_def ) - -typedef struct Candidate_s -{ - int index; - float score; - int suppress_begin_index; -}Candidate; -static void _swap_element - ( - Candidate* list, - uint32_t first, - uint32_t second - ) -{ - Candidate temp; - memcpy(&temp, &list[first], sizeof(Candidate)); - memcpy(&list[first], &list[second], sizeof(Candidate)); - memcpy(&list[second], &temp, sizeof(Candidate)); -} - -static uint32_t _max_element - ( - Candidate* list, - uint32_t len - ) -{ - uint32_t i; - uint32_t max_index = 0; - float max_val = list[0].score; - for ( i = 1; i < len; i++ ) - { - float val = list[i].score; - if ( max_val < val ) - { - max_val = val; - max_index = i; - } - } - - return max_index; -} - -typedef struct box_corner_encoding_s -{ - float y1; - float x1; - float y2; - float x2; -}box_corner_encoding; - -static float _computeIntersectionOverUnion - ( - const float* boxes, - const int32_t i, - const int32_t j - ) -{ - box_corner_encoding box_i = ((box_corner_encoding *)boxes)[i]; - box_corner_encoding box_j = ((box_corner_encoding *)boxes)[j]; - const float box_i_y_min = vsi_nn_min(box_i.y1, box_i.y2); - const float box_i_y_max = vsi_nn_max(box_i.y1, box_i.y2); - const float box_i_x_min = vsi_nn_min(box_i.x1, box_i.x2); - const float box_i_x_max = vsi_nn_max(box_i.x1, box_i.x2); - const float box_j_y_min = vsi_nn_min(box_j.y1, box_j.y2); - const float box_j_y_max = vsi_nn_max(box_j.y1, box_j.y2); - const float box_j_x_min = vsi_nn_min(box_j.x1, box_j.x2); - const float box_j_x_max = vsi_nn_max(box_j.x1, box_j.x2); - - const float area_i = - (box_i_y_max - box_i_y_min) * (box_i_x_max - box_i_x_min); - const float area_j = - (box_j_y_max - box_j_y_min) * (box_j_x_max - box_j_x_min); - const float intersection_ymax = vsi_nn_min(box_i_y_max, box_j_y_max); - const float intersection_xmax = vsi_nn_min(box_i_x_max, box_j_x_max); - const float intersection_ymin = vsi_nn_max(box_i_y_min, box_j_y_min); - const float intersection_xmin = vsi_nn_max(box_i_x_min, box_j_x_min); - const float intersection_area = - vsi_nn_max(intersection_ymax - intersection_ymin, 0.0f) * - vsi_nn_max(intersection_xmax - intersection_xmin, 0.0f); - - if (area_i <= 0 || area_j <= 0) - { - return 0.0f; - } - - return intersection_area / (area_i + area_j - intersection_area); -} - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VX_SUCCESS; - vsi_nn_kernel_tensor_t tensors[_INPUT_NUM] = { NULL }; -
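/* Editorial sketch, not part of the original file: a worked example of
   _computeIntersectionOverUnion above. Boxes are corner-encoded (y1, x1, y2,
   x2) and min/max-normalized first, so corner order does not matter. For
   box_i = (0, 0, 2, 2) and box_j = (1, 1, 3, 3): area_i = area_j = 4, the
   intersection is max(2 - 1, 0) * max(2 - 1, 0) = 1, the union is
   4 + 4 - 1 = 7, so IoU = 1/7, about 0.143. Degenerate boxes with non-positive
   area return 0 and therefore never suppress anything. */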
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float * buffer[_INPUT_NUM] = { NULL }; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_size_t stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_nn_kernel_tensor_attr_t * attr[_INPUT_NUM] = { NULL }; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - int32_t i = 0; - int32_t num_boxes = 0; - float* boxes = NULL; - float* scores = NULL; - float* selected_indices = NULL; - float* selected_scores = NULL; - float* num_selected_indices = NULL; - Candidate * candidate = NULL; - int32_t select_size = 0; - int32_t max_output_size = 0; - int32_t select_start = 0; - int32_t select_len = 0; - float iou_threshold = 0.f; - float score_threshold = 0.f; - float soft_nms_sigma = 0.f; - float scale = 0; - int32_t num_outputs = 0; - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_MAX_SIZE], - &max_output_size); - status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_IOU_THRES], - &iou_threshold); - status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_SCORE_THRES], - &score_threshold); - status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_SOFT_NMS_SIGMA], - &soft_nms_sigma); - CHECK_STATUS_FAIL_GOTO(status, final ); - - for ( i = 0; i < _INPUT_NUM; i++) - { - tensors[i] = (vsi_nn_kernel_tensor_t)param[i]; - attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] ); - - vsi_nn_kernel_tensor_attr_get_stride( attr[i], stride_size[i] ); - buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[i], "Create input buffer fail.", final ); - } - - for ( i = 0; i < _OUTPUT_NUM; i++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - f32_out_buffer[i] = (float *)malloc( out_elements[i] * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_elements[i] * sizeof(float) ); - } - - num_boxes = (int32_t)attr[0]->shape->data[1]; - boxes = buffer[0]; - scores = buffer[1]; - selected_indices = f32_out_buffer[0]; - selected_scores = f32_out_buffer[1]; - num_selected_indices = f32_out_buffer[2]; - - candidate = (Candidate*)malloc(num_boxes * sizeof(Candidate)); - CHECK_PTR_FAIL_GOTO( candidate, "Create select buffer fail.", final ); - memset(candidate, 0, num_boxes * sizeof(Candidate)); - - for (i = 0; i < num_boxes; ++i) - { - if (scores[i] > score_threshold) - { - candidate[select_size].index = i; - candidate[select_size].score = scores[i]; - candidate[select_size].suppress_begin_index = 0; - select_size++; - } - } - - num_outputs = vsi_nn_min(select_size, max_output_size); - - if (num_outputs == 0) - { - num_selected_indices[0] = 0; - } - - if (soft_nms_sigma > 0.0f) - { - scale = -0.5f / soft_nms_sigma; - } - - select_len = 0; - while (select_len < num_outputs && select_start < select_size) - { - int32_t j = 0; - float original_score = 0; - vsi_bool should_hard_suppress = FALSE; - - // find max score and swap to the front. 
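/* Editorial sketch, not part of the original file: the soft-NMS decay applied
   in the loop below. With scale = -0.5f / soft_nms_sigma precomputed above,
   each surviving candidate's score is multiplied by exp(scale * iou * iou),
   the Gaussian penalty of Soft-NMS (Bodla et al., 2017). Example: sigma = 0.5
   and iou = 0.6 give score *= exp(-0.36), about 0.70. Hard suppression
   (iou >= iou_threshold) is still checked first, so the Gaussian path only
   attenuates partial overlaps. */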
- int32_t max_index = _max_element( &candidate[select_start], select_size - select_start); - - if (max_index != select_size - select_start - 1) - { - _swap_element(&(candidate[select_start]), max_index, 0); - } - - original_score = candidate[select_start].score; - // Calculate IoU of the rest, swap to the end (discard) if needed. - for ( j = select_len - 1; j >= candidate[select_start].suppress_begin_index; j-- ) - { - int32_t idx = (int32_t)selected_indices[j]; - float iou = _computeIntersectionOverUnion(boxes, candidate[select_start].index, idx); - - // First decide whether to perform hard suppression. - if (iou >= iou_threshold) - { - should_hard_suppress = TRUE; - break; - } - - // Suppress score if NMS sigma > 0. - if (soft_nms_sigma > 0.0) - { - candidate[select_start].score = - candidate[select_start].score * (float)exp(scale * iou * iou); - } - - if (candidate[select_start].score <= score_threshold) - break; - } - - candidate[select_start].suppress_begin_index = select_len; - if (!should_hard_suppress) - { - if (candidate[select_start].score == original_score) - { - // Suppression has not occurred, so select next_candidate. - selected_indices[select_len] = (float)candidate[select_start].index; - selected_scores[select_len] = candidate[select_start].score; - ++ select_len; - } - if ( candidate[select_start].score > score_threshold) - { - // Soft suppression might have occurred and current score is still - // greater than score_threshold; add next_candidate back onto priority - // queue. - candidate[select_start].suppress_begin_index = select_len; - } - } - - select_start ++; - } - - num_selected_indices[0] = (float)select_len; - - for ( i = select_len; i < max_output_size; i++) - { - selected_indices[i] = 0; - selected_scores[i] = 0; - } - - /* save data */ - for ( i = 0; i < _OUTPUT_NUM; i++ ) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - vsi_nn_safe_free(candidate); - for( i = 0; i < _INPUT_NUM; i ++ ) - { - if ( buffer[i] ) - { - free( buffer[i] ); - } - vsi_nn_kernel_tensor_attr_release( &attr[i] ); - } - - for ( i = 0; i < _OUTPUT_NUM; i++ ) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _nms_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _nms_kernel_param_def ); - status = VSI_SUCCESS; - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_NMS_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - int32_t max_output_size = vsi_nn_kernel_param_get_int32(params, "max_output_size"); - float iou_threshold = vsi_nn_kernel_param_get_float32(params, "iou_threshold"); - float score_threshold =
vsi_nn_kernel_param_get_float32(params, "score_threshold"); - float soft_nms_sigma = vsi_nn_kernel_param_get_float32(params, "soft_nms_sigma"); - - status = _query_kernel( kernel, inputs, outputs ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _NMS_PARAM_NUM, - inputs, input_num, outputs, output_num ); - /* Pass parameters to node. */ - node_params[SCALAR_INPUT_MAX_SIZE] = vsi_nn_kernel_scalar_create( - graph, I32, &max_output_size ); - node_params[SCALAR_INPUT_IOU_THRES] = vsi_nn_kernel_scalar_create( - graph, F32, &iou_threshold ); - node_params[SCALAR_INPUT_SCORE_THRES] = vsi_nn_kernel_scalar_create( - graph, F32, &score_threshold ); - node_params[SCALAR_INPUT_SOFT_NMS_SIGMA] = vsi_nn_kernel_scalar_create( - graph, F32, &soft_nms_sigma ); - status = vsi_nn_kernel_node_pass_param( node, node_params, _NMS_PARAM_NUM ); - - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MAX_SIZE] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_IOU_THRES] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCORE_THRES] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SOFT_NMS_SIGMA] ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( nms, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/one_hot_cpu.c b/src/tim/vx/internal/src/kernel/cpu/one_hot_cpu.c deleted file mode 100644 index 5508499..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/one_hot_cpu.c +++ /dev/null @@ -1,251 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) - #define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.one_hot") - - -/* - * Kernel params - */ -static vx_param_description_t _one_hot_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -#define INPUT_SCALAR_DEPTH (2) -#define INPUT_SCALAR_ON_VALUE (3) -#define INPUT_SCALAR_OFF_VALUE (4) -#define INPUT_SCALAR_AXIS (5) -#define _ONE_HOT_PARAM_NUM _cnt_of_array( _one_hot_kernel_param_def ) - - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_IO_NUM] = { NULL }; - float * buffer[_IO_NUM] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_IO_NUM] = { NULL }; - vsi_size_t i = 0; - int32_t j = 0, m = 0; - vsi_size_t k = 0; - int32_t index = 0; - int32_t depth = 0; - float on_value = 0; - float off_value = 0; - int32_t axis = 0; - vsi_size_t prefix_dim_size = 1; - vsi_size_t suffix_dim_size = 0; - int32_t num_elements = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &depth); - status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[3], &on_value); - status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &off_value); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis); - CHECK_STATUS_FAIL_GOTO(status, final ); - - num_elements = (int32_t)vsi_nn_kernel_tensor_attr_get_size( attr[0] ); - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); - buffer[1] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); - memset( buffer[1], 0, out_elements * sizeof(float) ); - - axis = axis == -1 ? (int32_t)attr[0]->shape->size : (int32_t)attr[0]->shape->size - axis; - - for (m = 0; m < axis; m++) - { - prefix_dim_size *= attr[0]->shape->data[m]; - } - - suffix_dim_size = num_elements / prefix_dim_size; - - for (i = 0; i < prefix_dim_size; i++) - { - for (j = 0; j < depth; j++) - { - for (k = 0; k < suffix_dim_size; k++) - { - int32_t value = (int32_t)buffer[0][i * suffix_dim_size + k]; - buffer[1][index ++] = value == j ? 
on_value : off_value; - } - } - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - buffer[1], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); -final: -#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } - SAFE_FREE_TENSOR_ATTR(attr[0]); - SAFE_FREE_TENSOR_ATTR(attr[1]); -#undef SAFE_FREE_TENSOR_ATTR - for ( i = 0; i < _IO_NUM; i ++ ) - { - if ( buffer[i] ) - { - free( buffer[i] ); - buffer[i] = NULL; - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _one_hot_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _one_hot_kernel_param_def ); - status = VSI_SUCCESS; - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_ONE_HOT_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t depth = vsi_nn_kernel_param_get_int32( params, "depth" ); - float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" ); - float off_value = vsi_nn_kernel_param_get_float32( params, "off_value" ); - int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); - - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _ONE_HOT_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[INPUT_SCALAR_DEPTH] = vsi_nn_kernel_scalar_create( - graph, I32, &depth ); - node_params[INPUT_SCALAR_ON_VALUE] = vsi_nn_kernel_scalar_create( - graph, F32, &on_value ); - node_params[INPUT_SCALAR_OFF_VALUE] = vsi_nn_kernel_scalar_create( - graph, F32, &off_value ); - node_params[INPUT_SCALAR_AXIS] = vsi_nn_kernel_scalar_create( - graph, I32, &axis ); - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _ONE_HOT_PARAM_NUM ); - CHECK_STATUS_FAIL_GOTO( status, OnError ); - } - } -OnError: - if (node_params[INPUT_SCALAR_DEPTH]) - { - vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_DEPTH] ); - } - - if (node_params[INPUT_SCALAR_ON_VALUE]) - { - vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_ON_VALUE] ); - } - - if (node_params[INPUT_SCALAR_OFF_VALUE]) - { - vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_OFF_VALUE] ); - } - - if (node_params[INPUT_SCALAR_AXIS]) - { - vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_AXIS] ); - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( one_hot, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/poolwithargmax_cpu.c b/src/tim/vx/internal/src/kernel/cpu/poolwithargmax_cpu.c deleted file mode 100644 index 19a6e85..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/poolwithargmax_cpu.c +++ /dev/null @@ -1,335 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (2) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.poolwithargmax") - - -/* - * Kernel params - */ -static vx_param_description_t _poolwithargmax_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} -}; -#define _POOLWITHARGMAX_PARAM_NUM _cnt_of_array( _poolwithargmax_kernel_param_def ) - -#define SCALAR_KSZIE_X (3) -#define SCALAR_KSZIE_Y (4) -#define SCALAR_STRIDE_X (5) -#define SCALAR_STRIDE_Y (6) -#define SCALAR_PAD_X (7) -#define SCALAR_PAD_Y (8) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - vsi_size_t i, j, b, p; - vsi_size_t batch, depth_v, height_o, width_o, height, width; - int32_t ksize_x = 0; - int32_t ksize_y = 0; - int32_t stride_x = 0; - int32_t stride_y = 0; - int32_t pad_x = 0; - int32_t pad_y = 0; - vsi_size_t output_base = 0; - vsi_size_t input_base = 0; - vsi_ssize_t max_index = 0; - vsi_nn_kernel_dtype_e out1_dtype; - vsi_bool is_relative_coord = FALSE; - - - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_KSZIE_X], &ksize_x); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_KSZIE_Y], &ksize_y); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_STRIDE_X], &stride_x); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_STRIDE_Y], &stride_y); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_PAD_X], &pad_x); - status |= 
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_PAD_Y], &pad_y); - - CHECK_STATUS_FAIL_GOTO(status, final ); - - batch = out_attr[0]->shape->size > 3 ? out_attr[0]->shape->data[3] : 1; - depth_v = out_attr[0]->shape->size > 2 ? out_attr[0]->shape->data[2] : 1; - height_o = out_attr[0]->shape->data[1]; - width_o = out_attr[0]->shape->data[0]; - width = in_attr[0]->shape->data[0]; - height = in_attr[0]->shape->data[1]; - - out1_dtype = out_attr[1]->dtype; - - if ((I8 == out1_dtype) || (U8 == out1_dtype) || (I16 == out1_dtype)) - { - is_relative_coord = TRUE; - } - - for(b = 0; b < batch; b++) - { - for (p = 0; p < depth_v; p ++) - { - output_base = b * depth_v * height_o * width_o + p * height_o * width_o; - input_base = b * depth_v * height * width + p * height * width; - for (j = 0; j < height_o; j ++) - { - for (i = 0; i < width_o; i ++) - { - vsi_ssize_t hstart = j * stride_y - pad_y; - vsi_ssize_t wstart = i * stride_x - pad_x; - vsi_size_t hoffset = 0; - vsi_size_t woffset = 0; - vsi_size_t hend = vsi_nn_min(hstart + ksize_y, (vsi_ssize_t)height); - vsi_size_t wend = vsi_nn_min(wstart + ksize_x, (vsi_ssize_t)width); - vsi_size_t pool_index = 0; - vsi_size_t h, w = 0; - vsi_size_t cur_index = 0; - float d_f32 = 0.0f; - - if (hstart < 0) - { - hoffset = -hstart; - } - - if (wstart < 0) - { - woffset = -wstart; - } - - hstart = vsi_nn_max(hstart, 0); - wstart = vsi_nn_max(wstart, 0); - - pool_index = output_base + j * width_o + i; - max_index = is_relative_coord ? 0 : (input_base + hstart * width + wstart); - d_f32 = f32_in_buffer[0][input_base + hstart * width + wstart]; - for (h = hstart; h < hend; ++ h) - { - cur_index = (h - hstart + hoffset) * ksize_x + woffset; - for (w = wstart; w < wend; ++ w) - { - vsi_ssize_t index = input_base + h * width + w; - float d; - - d = f32_in_buffer[0][index]; - if (d > d_f32) - { - d_f32 = d; - max_index = is_relative_coord ? 
cur_index : index; - } - cur_index++; - } - } - f32_out_buffer[0][pool_index] = d_f32; - f32_out_buffer[1][pool_index] = (float)max_index; - } - } - } - } - out_attr[1]->quant = VSI_NN_KERNEL_QUANT_NONE; - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _poolwithargmax_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _poolwithargmax_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_POOLWITHARGMAX_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t ksize_x = 0; - int32_t ksize_y = 0; - int32_t stride_x = 0; - int32_t stride_y = 0; - int32_t pad_x = 0; - int32_t pad_y = 0; - - ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x"); - ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y"); - stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x"); - stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y"); - pad_x = vsi_nn_kernel_param_get_int32(params, "pad_x"); - pad_y = vsi_nn_kernel_param_get_int32(params, "pad_y"); - - status = _query_kernel( kernel, inputs, outputs ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _POOLWITHARGMAX_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_KSZIE_X] = vsi_nn_kernel_scalar_create( - graph, I32, &ksize_x ); - node_params[SCALAR_KSZIE_Y] = vsi_nn_kernel_scalar_create( - graph, I32, &ksize_y ); - node_params[SCALAR_STRIDE_X] = vsi_nn_kernel_scalar_create( - graph, I32, &stride_x ); - node_params[SCALAR_STRIDE_Y] = vsi_nn_kernel_scalar_create( - graph, I32, &stride_y ); - node_params[SCALAR_PAD_X] = vsi_nn_kernel_scalar_create( - graph, I32, &pad_x ); - node_params[SCALAR_PAD_Y] = vsi_nn_kernel_scalar_create( - graph, I32, &pad_y ); - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _POOLWITHARGMAX_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_KSZIE_X] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_KSZIE_Y] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_STRIDE_X] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_STRIDE_Y] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_PAD_X] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_PAD_Y] ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( poolwithargmax, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c deleted file mode 100644 index 39d53dd..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c +++ /dev/null @@ -1,200 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -#define _CPU_ARG_NUM (0) -#define _CPU_INPUT_NUM (2) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("pow_sw") - -static vsi_ssize_t _expand_offset - ( - vsi_ssize_t index, - vsi_size_t * shape, vsi_size_t rank, - vsi_size_t * strides, vsi_size_t * out_shape - ) -{ - vsi_size_t i = 0; - vsi_ssize_t offset = 0; - - for( i = 0; i < rank && index; i ++ ) - { - if( shape[i] == out_shape[i] ) - { - offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); - } - index /= out_shape[i]; - } - return offset; -} - -DEF_KERNEL_EXECUTOR(_pow_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VX_SUCCESS; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - vsi_size_t out_elements = 0; - vsi_size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - - vsi_nn_kernel_tensor_attr_get_stride( attr[0], stride_size[0] ); - vsi_nn_kernel_tensor_attr_get_stride( attr[1], stride_size[1] ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); - - buffer[2] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); - memset( buffer[2], 0, out_elements * sizeof(float) ); - - for( i = 0; i < out_elements; i ++ ) - { - vsi_ssize_t in0_offset = 0; - vsi_ssize_t in1_offset = 0; - float val1 = 0.f; - float val2 = 0.f; - - in0_offset = _expand_offset( i, attr[0]->shape->data, (vsi_size_t)attr[0]->shape->size, - stride_size[0], attr[2]->shape->data ); - in1_offset = _expand_offset( i, attr[1]->shape->data, (vsi_size_t)attr[1]->shape->size, - stride_size[1], attr[2]->shape->data ); - - val1 = buffer[0][in0_offset]; - val2 = buffer[1][in1_offset]; - - buffer[2][i] = (float)pow( val1, val2 ); - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], - buffer[2], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - return 
status; -} /* _pow_exec() */ - -static vx_param_description_t kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} -}; - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _pow_exec; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( pow, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c deleted file mode 100644 index bca6300..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c +++ /dev/null @@ -1,371 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -#define _CPU_ARG_NUM (10) -#define _CPU_INPUT_NUM (1) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.pre_process_bgra_sw") - -#define DESCALE(x) (((x) + (1<<19)) >> 20) - -DEF_KERNEL_EXECUTOR(_pre_process_bgra_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - float * outBuffer = NULL; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i = 0; - int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0; - float rMean = 0, gMean = 0, bMean = 0, var = 0; - int32_t order = 0, trans = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); - - i = 2; - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xRatio); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yRatio); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xOffset); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yOffset); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &rMean); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &gMean); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &bMean); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &var); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &order); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &trans); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); - memset( buffer[1], 0, out_elements * sizeof(float) ); - - if(trans) - { - outBuffer = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( outBuffer, "Create output buffer fail.", final ); - memset( outBuffer, 0, out_elements * sizeof(float) ); - } - - { - int32_t 
elementSize = 4; - int32_t rline1[2], rline2[2]; - int32_t gline1[2], gline2[2]; - int32_t bline1[2], bline2[2]; - int32_t dx = 0, dy = 0, dz = 0; - int32_t src_stride = (int32_t)attr[0]->shape->data[0]; - int32_t src_width = (int32_t)(src_stride / elementSize); - int32_t src_height = (int32_t)attr[0]->shape->data[1]; - int32_t dst_width = (int32_t)(trans ? attr[1]->shape->data[1] : attr[1]->shape->data[0]); - int32_t dst_height = (int32_t)(trans ? attr[1]->shape->data[2] : attr[1]->shape->data[1]); - int32_t stride = (int32_t)(dst_width * dst_height); - int32_t bOffset = 0; - int32_t gOffset = 1 * stride; - int32_t rOffset = 2 * stride; - uint8_t R = 0, G = 0, B = 0; - - if(order) - { - bOffset = 2 * stride; - rOffset = 0; - } - - for ( dz = 0; dz < 1; dz ++) - { - for ( dy = 0; dy < (int32_t)dst_height; dy ++) - { - for ( dx = 0; dx < (int32_t)dst_width; dx ++) - { - int32_t source_index = 0; - int32_t output_index = dx + dy * dst_width; - int32_t dstR_idx = output_index + rOffset; - int32_t dstG_idx = output_index + gOffset; - int32_t dstB_idx = output_index + bOffset; - float finalVal = 0; - - if(xRatio != (1 << 15) || yRatio != (1 << 15)) - { - int32_t fx = (dx * xRatio + (xRatio >> 1)) - (1 << 14); - int32_t sx = fx & 0xffff8000; // Floor - int32_t fy = 0, sy = 0; - int32_t temp1 = 0, temp2 = 0; - - fx -= sx; - sx = sx >> 15; - - sx = sx < 0 ? 0 : sx; - sx = sx > src_width ? src_width - 1: sx; - - fx = (fx +(1 << 4)) >> 5; - - // for y - fy = (dy * yRatio + (yRatio >> 1)) - (1<< 14); - sy = fy & 0xffff8000; // Floor - fy -= sy; - sy = sy >> 15; - - sy = sy < 0 ? 0 : sy; - fy = fy < 0 ? 0 : fy; - - fy = (fy + (1<< 4)) >> 5; - - sx += xOffset; - sy += yOffset; - source_index = (sx + sy * src_width + dz * src_width * src_height) * elementSize; - - bline1[0] = (int32_t)buffer[0][source_index]; - bline1[1] = (int32_t)buffer[0][source_index + elementSize]; - bline2[0] = (int32_t)buffer[0][source_index + src_stride]; - bline2[1] = (int32_t)buffer[0][source_index + src_stride + elementSize]; - - gline1[0] = (int32_t)buffer[0][source_index + 1]; - gline1[1] = (int32_t)buffer[0][source_index + elementSize + 1]; - gline2[0] = (int32_t)buffer[0][source_index + src_stride + 1]; - gline2[1] = (int32_t)buffer[0][source_index + src_stride + elementSize + 1]; - - rline1[0] = (int32_t)buffer[0][source_index + 2]; - rline1[1] = (int32_t)buffer[0][source_index + elementSize + 2]; - rline2[0] = (int32_t)buffer[0][source_index + src_stride + 2]; - rline2[1] = (int32_t)buffer[0][source_index + src_stride + elementSize + 2]; - - // B - temp1 = fx * (bline1[1] - bline1[0]) + (bline1[0] << 10); - temp2 = fx * (bline2[1] - bline2[0]) + (bline2[0] << 10); - temp1 = fy * (temp2 - temp1) + (temp1 << 10); - B = (uint8_t)(DESCALE(temp1)); - finalVal = (B - bMean) * var; - buffer[1][dstB_idx] = finalVal; - - // R - temp1 = fx * (rline1[1] - rline1[0]) + (rline1[0] << 10); - temp2 = fx * (rline2[1] - rline2[0]) + (rline2[0] << 10); - temp1 = fy * (temp2 - temp1) + (temp1 << 10); - R = (uint8_t)(DESCALE(temp1)); - finalVal = (R - rMean) * var; - buffer[1][dstR_idx] = finalVal; - - // G - temp1 = fx * (gline1[1] - gline1[0]) + (gline1[0] << 10); - temp2 = fx * (gline2[1] - gline2[0]) + (gline2[0] << 10); - temp1 = fy * (temp2 - temp1) + (temp1 << 10); - G = (uint8_t)(DESCALE(temp1)); - finalVal = (G - gMean) * var; - buffer[1][dstG_idx] = finalVal; - } - else //copy - { - int32_t offset = xOffset + yOffset * src_width; - source_index = (dx + dy * src_width + offset) * elementSize; - - finalVal = 
(buffer[0][source_index] - bMean) * var; - buffer[1][dstB_idx] = finalVal; - - finalVal = (buffer[0][source_index + 1] - gMean) * var; - buffer[1][dstG_idx] = finalVal; - - finalVal = (buffer[0][source_index + 2] - rMean) * var; - buffer[1][dstR_idx] = finalVal; - } - } - } - } - } - - if(trans) - { - vsi_size_t shape[] = {attr[1]->shape->data[0], attr[1]->shape->data[1], attr[1]->shape->data[2], 1}; - vsi_size_t perm[] = {1, 2, 0, 3}; - vsi_nn_Transpose((uint8_t*)outBuffer, (uint8_t*)buffer[1], - shape, (uint32_t)attr[1]->shape->size, perm, VSI_NN_TYPE_FLOAT32); - - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - outBuffer, out_elements ); - } - else - { - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - buffer[1], out_elements ); - } - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - if(outBuffer) - { - free(outBuffer); - } - - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - return status; -} /* _pre_process_bgra_exec() */ - -static vx_param_description_t kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _pre_process_bgra_exec; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - uint32_t index = 2; - int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); - int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); - int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); - int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); - float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); - float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); - float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); - float bgra_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); - int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); - int32_t trans = vsi_nn_kernel_param_get_int32( params, 
"enable_perm" ); - - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &bgra_scale ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - CHECK_STATUS( status ); - vsi_nn_kernel_scalar_release( &backend_params[2] ); - vsi_nn_kernel_scalar_release( &backend_params[3] ); - vsi_nn_kernel_scalar_release( &backend_params[4] ); - vsi_nn_kernel_scalar_release( &backend_params[5] ); - vsi_nn_kernel_scalar_release( &backend_params[6] ); - vsi_nn_kernel_scalar_release( &backend_params[7] ); - vsi_nn_kernel_scalar_release( &backend_params[8] ); - vsi_nn_kernel_scalar_release( &backend_params[9] ); - vsi_nn_kernel_scalar_release( &backend_params[10] ); - vsi_nn_kernel_scalar_release( &backend_params[11] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( pre_process_bgra, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c deleted file mode 100644 index f7d4248..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c +++ /dev/null @@ -1,270 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -#define _CPU_ARG_NUM (6) -#define _CPU_INPUT_NUM (1) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.pre_process_gray_sw") - -#define DESCALE(x) (((x) + (1<<19)) >> 20) - -DEF_KERNEL_EXECUTOR(_pre_process_gray_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i = 0; - int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0; - float mean = 0, scale = 1; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); - - i = 2; - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xRatio); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yRatio); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xOffset); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yOffset); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &mean); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &scale); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); - memset( buffer[1], 0, out_elements * sizeof(float) ); - - { - int32_t line1[2], line2[2]; - int32_t dx = 0, dy = 0, dz = 0; - int32_t src_width = (int32_t)attr[0]->shape->data[0]; - int32_t src_height = (int32_t)attr[0]->shape->data[1]; - int32_t dst_width = (int32_t)attr[1]->shape->data[0]; - int32_t dst_height = (int32_t)attr[1]->shape->data[1]; - uint8_t result = 0; - - for ( dz = 0; dz < 1; dz ++) - { - for ( dy = 0; dy < (int32_t)dst_height; dy ++) - { - for ( dx = 0; dx < (int32_t)dst_width; dx ++) - { - int32_t source_index = 0; - int32_t output_index = dx + dy * dst_width; - float finalVal = 0.0f; - - if(xRatio != (1 << 15) || yRatio != (1 << 15)) - { - int32_t fx = (dx * xRatio + (xRatio >> 1)) - (1 << 14); - int32_t sx = fx & 0xffff8000; // Floor - int32_t fy = 0, sy = 0; - int32_t temp1 = 0; - int32_t temp2 = 0; - - fx -= sx; - sx = sx >> 15; - - sx = sx < 0 ? 
0 : sx; - sx = sx > src_width ? src_width - 1: sx; - - fx = (fx +(1 << 4)) >> 5; - - // for y - fy = (dy * yRatio + (yRatio >> 1)) - (1<< 14); - sy = fy & 0xffff8000; // Floor - fy -= sy; - sy = sy >> 15; - - sy = sy < 0 ? 0 : sy; - fy = fy < 0 ? 0 : fy; - - fy = (fy + (1<< 4)) >> 5; - - sx += xOffset; - sy += yOffset; - source_index = (sx + sy * src_width + dz * src_width * src_height); - - line1[0] = (int32_t)buffer[0][source_index]; - line1[1] = (int32_t)buffer[0][source_index + 1]; - line2[0] = (int32_t)buffer[0][source_index + src_width]; - line2[1] = (int32_t)buffer[0][source_index + src_width + 1]; - - temp1 = fx * (line1[1] - line1[0]) + (line1[0] << 10); - temp2 = fx * (line2[1] - line2[0]) + (line2[0] << 10); - temp1 = fy * (temp2 - temp1) + (temp1 << 10); - result = (uint8_t)(DESCALE(temp1)); - finalVal = (result - mean) * scale; - buffer[1][output_index] = finalVal; - } - else - { - int32_t offset = xOffset + yOffset * src_width; - source_index = dx + dy * src_width + offset; - finalVal = (buffer[0][source_index] - mean) * scale; - buffer[1][output_index] = finalVal; - } - } - } - } - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - buffer[1], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - return status; -} /* _pre_process_gray_exec() */ - -static vx_param_description_t kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _pre_process_gray_exec; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - uint32_t index = 2; - int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); - int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); - int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); - int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); - float mean = vsi_nn_kernel_param_get_float32( params, "mean" ); - float scale = vsi_nn_kernel_param_get_float32( params, "scale" ); - - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, 
_CPU_OUTPUT_NUM ); - - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &mean ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - CHECK_STATUS( status ); - vsi_nn_kernel_scalar_release( &backend_params[2] ); - vsi_nn_kernel_scalar_release( &backend_params[3] ); - vsi_nn_kernel_scalar_release( &backend_params[4] ); - vsi_nn_kernel_scalar_release( &backend_params[5] ); - vsi_nn_kernel_scalar_release( &backend_params[6] ); - vsi_nn_kernel_scalar_release( &backend_params[7] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( pre_process_gray, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c deleted file mode 100644 index f9c47f9..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c +++ /dev/null @@ -1,344 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -#define _CPU_ARG_NUM (10) -#define _CPU_INPUT_NUM (2) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.pre_process_nv12_sw") - -#define DESCALE(x) (((x) + (1<<19)) >> 20) - -DEF_KERNEL_EXECUTOR(_pre_process_nv12_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - float * outBuffer = NULL; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i = 0; - int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0; - float rMean = 0, gMean = 0, bMean = 0, var = 0; - int32_t order = 0, trans = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); - - i = 3; - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xRatio); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yRatio); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xOffset); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yOffset); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &rMean); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &gMean); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &bMean); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &var); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &order); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &trans); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); - - buffer[2] = (float *)malloc( out_elements * sizeof(float) ); - 
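/* The per-pixel YUV-to-RGB conversion applied by the deleted NV12 kernel
 * below, shown in isolation. NV12 stores a full-resolution Y plane plus one
 * interleaved UV pair per 2x2 pixel block, which is why the kernel samples
 * chroma at (srcx / 2) * 2; the coefficients here are exactly the ones it
 * uses. This is an illustrative sketch, not code from the deleted file, and
 * the input values below are made up. */
#include <stdio.h>

static void nv12_pixel_to_rgb(float y, float u, float v,
                              float *r, float *g, float *b)
{
    float d = u - 128.0f;   /* centered chroma U */
    float e = v - 128.0f;   /* centered chroma V */
    *b = y + 1.7790f * d;
    *g = y - 0.3455f * d - 0.7169f * e;
    *r = y + 1.4065f * e;
    /* Clamping to the output dtype's range (vsi_clamp in the kernel) is
     * omitted here for brevity. */
}

int main(void)
{
    float r, g, b;
    nv12_pixel_to_rgb(128.0f, 128.0f, 128.0f, &r, &g, &b);
    printf("neutral gray -> R=%.1f G=%.1f B=%.1f\n", r, g, b); /* 128 128 128 */
    nv12_pixel_to_rgb(128.0f, 128.0f, 255.0f, &r, &g, &b);
    printf("strong V     -> R=%.1f G=%.1f B=%.1f\n", r, g, b); /* red-shifted */
    return 0;
}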
CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); - memset( buffer[2], 0, out_elements * sizeof(float) ); - - if(trans) - { - outBuffer = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( outBuffer, "Create output buffer fail.", final ); - memset( outBuffer, 0, out_elements * sizeof(float) ); - } - - { - int32_t dx, dy, dz; - int32_t src_width = (int32_t)attr[0]->shape->data[0]; - int32_t dst_width = (int32_t)(trans ? attr[2]->shape->data[1] : attr[2]->shape->data[0]); - int32_t dst_height = (int32_t)(trans ? attr[2]->shape->data[2] : attr[2]->shape->data[1]); - int32_t stride = (int32_t)(dst_width * dst_height); - int32_t rOffset = 0; - int32_t gOffset = 1 * stride; - int32_t bOffset = 2 * stride; - float D, E; - float R, G, B; - float min = 0; - float max = 255; - float* src_y_slice = NULL; - float* src_uv_yScanline = NULL; - - uint32_t roi_width = (xRatio * dst_width) >> 15; - uint32_t roi_height = (yRatio * dst_height) >> 15; - uint32_t xrIntFloat_16 = (roi_width << 16) / dst_width + 1; - uint32_t yrIntFloat_16 = (roi_height << 16) / dst_height + 1; - uint32_t srcy = 0, srcx = 0; - - if(attr[2]->dtype == I8) - { - min = -128; - max = 127; - } - else if(attr[2]->dtype == I16 || attr[2]->dtype == F16) - { - min = -65536; - max = 65535; - } - - if(order) - { - rOffset = 2 * stride; - bOffset = 0; - } - - for ( dz = 0; dz < 1; dz ++) - { - for ( dy = 0; dy < (int32_t)dst_height; dy ++) - { - srcy = (((uint32_t)dy * yrIntFloat_16) >> 16) + yOffset; - src_y_slice = buffer[0] + (srcy) * src_width; - src_uv_yScanline = buffer[1] + (srcy / 2) * src_width; - - for ( dx = 0; dx < (int32_t)dst_width; dx ++) - { - float finalVal = 0; - int32_t output_index = 0; - int32_t dstR_idx = 0, dstG_idx = 0, dstB_idx = 0; - float tmpY = 0.0f; - float tmpU = 0.0f; - float tmpV = 0.0f; - - srcx = (((uint32_t)dx * xrIntFloat_16) >> 16) + xOffset; - tmpY = src_y_slice[srcx]; - tmpU = src_uv_yScanline[(srcx / 2) * 2]; - tmpV = src_uv_yScanline[(srcx / 2) * 2 + 1]; - - D = (tmpU - 128); - E = (tmpV - 128); - - // B - B = (float)vsi_clamp((tmpY + (1.7790 * D)), min, max); - //G - G = (float)vsi_clamp((tmpY - 0.3455 * D - 0.7169 * E), min, max); - //R - R = (float)vsi_clamp((tmpY + 1.4065 * E), min, max); - - output_index = dx + dy * dst_width; - - dstR_idx = output_index + rOffset; - dstG_idx = output_index + gOffset; - dstB_idx = output_index + bOffset; - - finalVal = (B - bMean) * var; - buffer[2][dstB_idx] = finalVal; - - finalVal = (G - gMean) * var; - buffer[2][dstG_idx] = finalVal; - - finalVal = (R - rMean) * var; - buffer[2][dstR_idx] = finalVal; - } - } - } - } - - if(trans) - { - vsi_size_t shape[] = {attr[2]->shape->data[0], attr[2]->shape->data[1], attr[2]->shape->data[2], 1}; - vsi_size_t perm[] = {1, 2, 0, 3}; - vsi_nn_Transpose((uint8_t*)outBuffer, (uint8_t*)buffer[2], - shape, (uint32_t)attr[2]->shape->size, perm, VSI_NN_TYPE_FLOAT32); - - status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], - outBuffer, out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - else - { - status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], - buffer[2], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - if(outBuffer) - { - free(outBuffer); - } - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - return status; -} /* _pre_process_nv12_exec() */ - -static vx_param_description_t kernel_param_def[] = -{ - 
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _pre_process_nv12_exec; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - uint32_t index = 3; - int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); - int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); - int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); - int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); - float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); - float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); - float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); - float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); - int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); - int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); - - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); - /* Pass parameters to node. 
*/
- status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
- CHECK_STATUS( status );
- vsi_nn_kernel_scalar_release( &backend_params[3] );
- vsi_nn_kernel_scalar_release( &backend_params[4] );
- vsi_nn_kernel_scalar_release( &backend_params[5] );
- vsi_nn_kernel_scalar_release( &backend_params[6] );
- vsi_nn_kernel_scalar_release( &backend_params[7] );
- vsi_nn_kernel_scalar_release( &backend_params[8] );
- vsi_nn_kernel_scalar_release( &backend_params[9] );
- vsi_nn_kernel_scalar_release( &backend_params[10] );
- vsi_nn_kernel_scalar_release( &backend_params[11] );
- vsi_nn_kernel_scalar_release( &backend_params[12] );
- }
- else
- {
- status = VSI_FAILURE;
- }
- }
- return node;
-} /* _setup() */
-
-__END_DECLS
-
-REGISTER_BACKEND_CPU( pre_process_nv12, _setup )
diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb888_planar_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb888_planar_cpu.c
deleted file mode 100644
index 845c167..0000000
--- a/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb888_planar_cpu.c
+++ /dev/null
@@ -1,309 +0,0 @@
-/****************************************************************************
-*
-* Copyright (c) 2020 Vivante Corporation
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-* DEALINGS IN THE SOFTWARE.
-*
-*****************************************************************************/
-
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include "vsi_nn_types.h"
-#include "vsi_nn_tensor.h"
-#include "vsi_nn_graph.h"
-#include "vsi_nn_log.h"
-#include "vsi_nn_error.h"
-#include "vsi_nn_prv.h"
-#include "vsi_nn_tensor_util.h"
-#include "utils/vsi_nn_util.h"
-#include "kernel/vsi_nn_kernel.h"
-
-__BEGIN_DECLS
-
-/*
- * Define kernel meta.
- */ -#define _CPU_ARG_NUM (8) -#define _CPU_INPUT_NUM (3) -#define _CPU_OUTPUT_NUM (3) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.pre_process_rgb888_planar") - -#define DESCALE(x) (((x) + (1<<19)) >> 20) -/* - * Kernel params - */ -static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def ) - - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i = 0; - int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0; - float mean[3] = {0}, scale = 1; - vsi_bool is_rgb888 = tensors[1] == NULL; - - for (i = 0; i < _CPU_IO_NUM; i++) - { - tensors[i] = (vsi_nn_kernel_tensor_t)param[i]; - if (tensors[i]) - { - attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] ); - CHECK_PTR_FAIL_GOTO( attr[i], "Create tensor attr buffer fail.", final ); - } - } - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); - - i = 6; - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xRatio); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yRatio); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xOffset); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yOffset); - status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &mean[0]); - status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &mean[1]); - status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &mean[2]); - status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &scale); - CHECK_STATUS_FAIL_GOTO(status, final ); - - for (i = 0; i < 3; i++) - { - if (tensors[i]) - { - buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[i], "Create input0 buffer fail.", final ); - } - - buffer[i + 3] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[i + 3], "Create output buffer fail.", final ); - memset( buffer[i + 3], 0, out_elements * sizeof(float) ); - } - - { - int32_t line1[2], line2[2]; - int32_t dx = 0, dy = 0, idx = 0; - int32_t src_width = 
(int32_t)attr[0]->shape->data[0]; - int32_t src_height = (int32_t)attr[0]->shape->data[1]; - int32_t dst_width = (int32_t)attr[3]->shape->data[0]; - int32_t dst_height = (int32_t)attr[3]->shape->data[1]; - uint8_t result = 0; - int32_t offset = 0; - int32_t index = 0; - - for ( idx = 0; idx < 3; idx ++) - { - offset = is_rgb888 ? idx * src_width * src_height : 0; - index = is_rgb888 ? 0 : idx; - for ( dy = 0; dy < (int32_t)dst_height; dy ++) - { - for ( dx = 0; dx < (int32_t)dst_width; dx ++) - { - int32_t source_index = 0; - int32_t output_index = dx + dy * dst_width; - float finalVal = 0.0f; - - if(xRatio != (1 << 15) || yRatio != (1 << 15)) - { - int32_t fx = (dx * xRatio + (xRatio >> 1)) - (1 << 14); - int32_t sx = fx & 0xffff8000; // Floor - int32_t fy = 0, sy = 0; - int32_t temp1 = 0; - int32_t temp2 = 0; - - fx -= sx; - sx = sx >> 15; - - sx = sx < 0 ? 0 : sx; - sx = sx > src_width ? src_width - 1: sx; - - fx = (fx +(1 << 4)) >> 5; - - // for y - fy = (dy * yRatio + (yRatio >> 1)) - (1<< 14); - sy = fy & 0xffff8000; // Floor - fy -= sy; - sy = sy >> 15; - - sy = sy < 0 ? 0 : sy; - fy = fy < 0 ? 0 : fy; - - fy = (fy + (1<< 4)) >> 5; - - sx += xOffset; - sy += yOffset; - source_index = (sx + sy * src_width); - - line1[0] = (int32_t)buffer[index][source_index + offset]; - line1[1] = (int32_t)buffer[index][source_index + 1 + offset]; - line2[0] = (int32_t)buffer[index][source_index + src_width + offset]; - line2[1] = (int32_t)buffer[index][source_index + src_width + 1 + offset]; - - temp1 = fx * (line1[1] - line1[0]) + (line1[0] << 10); - temp2 = fx * (line2[1] - line2[0]) + (line2[0] << 10); - temp1 = fy * (temp2 - temp1) + (temp1 << 10); - result = (uint8_t)(DESCALE(temp1)); - finalVal = (result - mean[idx]) * scale; - buffer[idx + 3][output_index] = finalVal; - } - else - { - int32_t ofset = xOffset + yOffset * src_width; - source_index = dx + dy * src_width + ofset + offset; - finalVal = (buffer[index][source_index] - mean[idx]) * scale; - buffer[idx + 3][output_index] = finalVal; - } - } - } - } - } - for (i = 3; i < _CPU_IO_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( tensors[i], attr[i], - buffer[i], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for ( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if ( buffer[i] ) - { - free( buffer[i] ); - } - if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_SUCCESS; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _pre_process_rgb888_planar_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def ); - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_PRE_PROCESS_RGB888_PLANAR_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - if ( VSI_SUCCESS == status) - { - uint32_t index = 6; - int32_t scale_x = 
vsi_nn_kernel_param_get_int32( params, "scale_x" ); - int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); - int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); - int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); - float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); - float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); - float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); - float scale = vsi_nn_kernel_param_get_float32( params, "scale" ); - - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM ); - vsi_nn_kernel_scalar_release( &node_params[6] ); - vsi_nn_kernel_scalar_release( &node_params[7] ); - vsi_nn_kernel_scalar_release( &node_params[8] ); - vsi_nn_kernel_scalar_release( &node_params[9] ); - vsi_nn_kernel_scalar_release( &node_params[10] ); - vsi_nn_kernel_scalar_release( &node_params[11] ); - vsi_nn_kernel_scalar_release( &node_params[12] ); - vsi_nn_kernel_scalar_release( &node_params[13] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( pre_process_rgb888_planar, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c deleted file mode 100644 index 16068b6..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c +++ /dev/null @@ -1,370 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
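The rgb888_planar kernel above and the packed RGB kernel deleted next share one fixed-point resampling scheme: the incoming scale_x/scale_y ratios are Q15, the integer part selects the source tap (planar taps sit 1 float apart, packed taps 3 floats apart), and the fraction is reduced to a Q10 weight via (fx + (1 << 4)) >> 5 before a bilinear blend; DESCALE then rounds the Q20 accumulator back to 8 bits. Restated as a self-contained helper (illustrative sketch, not part of the deleted source):

#include <stdint.h>

#define DESCALE(x) (((x) + (1 << 19)) >> 20)

/* p00/p01 are the two horizontal taps on the upper row, p10/p11 on the lower
 * row; fx and fy are Q10 fractional weights in [0, 1024]. */
static uint8_t bilinear_q10(int32_t p00, int32_t p01,
                            int32_t p10, int32_t p11,
                            int32_t fx, int32_t fy)
{
    int32_t top = fx * (p01 - p00) + (p00 << 10); /* horizontal lerp -> Q10 */
    int32_t bot = fx * (p11 - p10) + (p10 << 10);
    int32_t mid = fy * (bot - top) + (top << 10); /* vertical lerp -> Q20   */
    return (uint8_t)DESCALE(mid);                 /* round Q20 to integer   */
}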
-*
-*****************************************************************************/
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include "vsi_nn_types.h"
-#include "vsi_nn_tensor.h"
-#include "vsi_nn_graph.h"
-#include "vsi_nn_log.h"
-#include "vsi_nn_prv.h"
-#include "vsi_nn_error.h"
-#include "kernel/vsi_nn_kernel.h"
-
-__BEGIN_DECLS
-
-#define _CPU_ARG_NUM (10)
-#define _CPU_INPUT_NUM (1)
-#define _CPU_OUTPUT_NUM (1)
-#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
-#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
-#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.pre_process_rgb_sw")
-
-#define DESCALE(x) (((x) + (1<<19)) >> 20)
-
-DEF_KERNEL_EXECUTOR(_pre_process_rgb_exec)
- (
- vsi_nn_kernel_node_t node,
- const vsi_nn_kernel_node_param_t * param,
- size_t param_size
- )
-{
- vsi_status status = VX_FAILURE;
- vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
- float * buffer[_CPU_IO_NUM] = { NULL };
- float * outBuffer = NULL;
- size_t out_elements = 0;
- vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
- uint32_t i = 0;
- int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0;
- float rMean = 0, gMean = 0, bMean = 0, var = 0;
- int32_t order = 0, trans = 0;
-
- tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
- tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
-
- attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
- CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
- attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
- CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
-
- out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
-
- i = 2;
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xRatio);
- CHECK_STATUS_FAIL_GOTO(status, final );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yRatio);
- CHECK_STATUS_FAIL_GOTO(status, final );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xOffset);
- CHECK_STATUS_FAIL_GOTO(status, final );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yOffset);
- CHECK_STATUS_FAIL_GOTO(status, final );
- status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &rMean);
- CHECK_STATUS_FAIL_GOTO(status, final );
- status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &gMean);
- CHECK_STATUS_FAIL_GOTO(status, final );
- status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &bMean);
- CHECK_STATUS_FAIL_GOTO(status, final );
- status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &var);
- CHECK_STATUS_FAIL_GOTO(status, final );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &order);
- CHECK_STATUS_FAIL_GOTO(status, final );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &trans);
- CHECK_STATUS_FAIL_GOTO(status, final );
-
- buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
- CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
-
- buffer[1] = (float *)malloc( out_elements * sizeof(float) );
- CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final );
- memset( buffer[1], 0, out_elements * sizeof(float) );
-
- if(trans)
- {
- outBuffer = (float *)malloc( out_elements * sizeof(float) );
- CHECK_PTR_FAIL_GOTO( outBuffer, "Create output buffer fail.", final );
- memset( outBuffer, 0, out_elements * sizeof(float) );
- }
-
- {
- int32_t rline1[2],
rline2[2]; - int32_t gline1[2], gline2[2]; - int32_t bline1[2], bline2[2]; - int32_t dx = 0, dy = 0, dz = 0; - int32_t src_stride = (int32_t)attr[0]->shape->data[0]; - int32_t src_width = (int32_t)(src_stride / 3); - int32_t src_height = (int32_t)attr[0]->shape->data[1]; - int32_t dst_width = (int32_t)(trans ? attr[1]->shape->data[1] : attr[1]->shape->data[0]); - int32_t dst_height = (int32_t)(trans ? attr[1]->shape->data[2] : attr[1]->shape->data[1]); - int32_t stride = (int32_t)(dst_width * dst_height); - int32_t rOffset = 0; - int32_t gOffset = 1 * stride; - int32_t bOffset = 2 * stride; - uint8_t R = 0, G = 0, B = 0; - - if(order) - { - rOffset = 2 * stride; - bOffset = 0; - } - - for ( dz = 0; dz < 1; dz ++) - { - for ( dy = 0; dy < (int32_t)dst_height; dy ++) - { - for ( dx = 0; dx < (int32_t)dst_width; dx ++) - { - int32_t source_index = 0; - int32_t output_index = dx + dy * dst_width; - int32_t dstR_idx = output_index + rOffset; - int32_t dstG_idx = output_index + gOffset; - int32_t dstB_idx = output_index + bOffset; - float finalVal = 0; - - if(xRatio != (1 << 15) || yRatio != (1 << 15)) - { - int32_t fx = (dx * xRatio + (xRatio >> 1)) - (1 << 14); - int32_t sx = fx & 0xffff8000; // Floor - int32_t fy = 0, sy = 0; - int32_t temp1 = 0, temp2 = 0; - - fx -= sx; - sx = sx >> 15; - - sx = sx < 0 ? 0 : sx; - sx = sx > src_width ? src_width - 1: sx; - - fx = (fx +(1 << 4)) >> 5; - - // for y - fy = (dy * yRatio + (yRatio >> 1)) - (1<< 14); - sy = fy & 0xffff8000; // Floor - fy -= sy; - sy = sy >> 15; - - sy = sy < 0 ? 0 : sy; - fy = fy < 0 ? 0 : fy; - - fy = (fy + (1<< 4)) >> 5; - - sx += xOffset; - sy += yOffset; - source_index = (sx + sy * src_width + dz * src_width * src_height) * 3; - - rline1[0] = (int32_t)buffer[0][source_index]; - rline1[1] = (int32_t)buffer[0][source_index + 3]; - rline2[0] = (int32_t)buffer[0][source_index + src_stride]; - rline2[1] = (int32_t)buffer[0][source_index + src_stride + 3]; - - gline1[0] = (int32_t)buffer[0][source_index + 1]; - gline1[1] = (int32_t)buffer[0][source_index + 4]; - gline2[0] = (int32_t)buffer[0][source_index + src_stride + 1]; - gline2[1] = (int32_t)buffer[0][source_index + src_stride + 4]; - - bline1[0] = (int32_t)buffer[0][source_index + 2]; - bline1[1] = (int32_t)buffer[0][source_index + 5]; - bline2[0] = (int32_t)buffer[0][source_index + src_stride + 2]; - bline2[1] = (int32_t)buffer[0][source_index + src_stride + 5]; - - // R - temp1 = fx * (rline1[1] - rline1[0]) + (rline1[0] << 10); - temp2 = fx * (rline2[1] - rline2[0]) + (rline2[0] << 10); - temp1 = fy * (temp2 - temp1) + (temp1 << 10); - R = (uint8_t)(DESCALE(temp1)); - finalVal = (R - rMean) * var; - buffer[1][dstR_idx] = finalVal; - - //G - temp1 = fx * (gline1[1] - gline1[0]) + (gline1[0] << 10); - temp2 = fx * (gline2[1] - gline2[0]) + (gline2[0] << 10); - temp1 = fy * (temp2 - temp1) + (temp1 << 10); - G = (uint8_t)(DESCALE(temp1)); - finalVal = (G - gMean) * var; - buffer[1][dstG_idx] = finalVal; - - //B - temp1 = fx * (bline1[1] - bline1[0]) + (bline1[0] << 10); - temp2 = fx * (bline2[1] - bline2[0]) + (bline2[0] << 10); - temp1 = fy * (temp2 - temp1) + (temp1 << 10); - B = (uint8_t)(DESCALE(temp1)); - finalVal = (B - bMean) * var; - buffer[1][dstB_idx] = finalVal; - } - else //copy - { - int32_t offset = xOffset + yOffset * src_width; - source_index = (dx + dy * src_width + offset) * 3; - - finalVal = (buffer[0][source_index] - rMean) * var; - buffer[1][dstR_idx] = finalVal; - - finalVal = (buffer[0][source_index + 1] - gMean) * var; - buffer[1][dstG_idx] = finalVal; 
- - finalVal = (buffer[0][source_index + 2] - bMean) * var; - buffer[1][dstB_idx] = finalVal; - } - } - } - } - } - - if(trans) - { - vsi_size_t shape[] = {attr[1]->shape->data[0], attr[1]->shape->data[1], attr[1]->shape->data[2], 1}; - vsi_size_t perm[] = {1, 2, 0, 3}; - vsi_nn_Transpose((uint8_t*)outBuffer, (uint8_t*)buffer[1], - shape, (uint32_t)attr[1]->shape->size, perm, VSI_NN_TYPE_FLOAT32); - - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - outBuffer, out_elements ); - } - else - { - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - buffer[1], out_elements ); - } - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - if(outBuffer) - { - free(outBuffer); - } - - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - return status; -} /* _pre_process_rgb_exec() */ - -static vx_param_description_t kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _pre_process_rgb_exec; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - uint32_t index = 2; - int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); - int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); - int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); - int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); - float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); - float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); - float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); - float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); - int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); - int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); - - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - - 
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - CHECK_STATUS( status ); - vsi_nn_kernel_scalar_release( &backend_params[2] ); - vsi_nn_kernel_scalar_release( &backend_params[3] ); - vsi_nn_kernel_scalar_release( &backend_params[4] ); - vsi_nn_kernel_scalar_release( &backend_params[5] ); - vsi_nn_kernel_scalar_release( &backend_params[6] ); - vsi_nn_kernel_scalar_release( &backend_params[7] ); - vsi_nn_kernel_scalar_release( &backend_params[8] ); - vsi_nn_kernel_scalar_release( &backend_params[9] ); - vsi_nn_kernel_scalar_release( &backend_params[10] ); - vsi_nn_kernel_scalar_release( &backend_params[11] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( pre_process_rgb, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c deleted file mode 100644 index aa814f2..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c +++ /dev/null @@ -1,419 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
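Every _setup() in these deleted files follows the same scalar-argument lifecycle visible just above: wrap each host value in a kernel scalar, hand the whole parameter array to the node in one call, then release the local scalar references (the node appears to hold its own reference once vsi_nn_kernel_node_pass_param has run, which is why the immediate releases are safe). A minimal sketch of that pattern, condensed to a single argument; the helper name and signature are assumptions for illustration, not part of the deleted source:

#include "kernel/vsi_nn_kernel.h"

/* Create one I32 scalar at params[index], pass the (already populated)
 * parameter array to the node, then drop the local reference. */
static vsi_status pass_scalar_arg( vsi_nn_graph_t * graph,
                                   vsi_nn_kernel_node_t node,
                                   vsi_nn_kernel_node_param_t * params,
                                   uint32_t index, size_t param_num,
                                   int32_t value )
{
    vsi_status status;
    params[index] = vsi_nn_kernel_scalar_create( graph, I32, &value );
    status = vsi_nn_kernel_node_pass_param( node, params, param_num );
    vsi_nn_kernel_scalar_release( &params[index] );
    return status;
}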
-*
-*****************************************************************************/
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include "vsi_nn_types.h"
-#include "vsi_nn_tensor.h"
-#include "vsi_nn_graph.h"
-#include "vsi_nn_log.h"
-#include "vsi_nn_prv.h"
-#include "vsi_nn_error.h"
-#include "kernel/vsi_nn_kernel.h"
-
-__BEGIN_DECLS
-
-#define _CPU_ARG_NUM (10)
-#define _CPU_INPUT_NUM (3)
-#define _CPU_OUTPUT_NUM (1)
-#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
-#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
-#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.pre_process_yuv420_sw")
-
-#define DESCALE(x) (((x) + (1<<19)) >> 20)
-
-DEF_KERNEL_EXECUTOR(_pre_process_yuv420_exec)
- (
- vsi_nn_kernel_node_t node,
- const vsi_nn_kernel_node_param_t * param,
- size_t param_size
- )
-{
- vsi_status status = VX_FAILURE;
- vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
- float * buffer[_CPU_IO_NUM] = { NULL };
- float * outBuffer = NULL;
- size_t out_elements = 0;
- vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
- uint32_t i = 0;
- int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0;
- float rMean = 0, gMean = 0, bMean = 0, var = 0;
- int32_t order = 0, trans = 0;
-
- tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
- tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
- tensors[2] = (vsi_nn_kernel_tensor_t)param[2];
- tensors[3] = (vsi_nn_kernel_tensor_t)param[3];
-
- attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
- CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
- attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
- CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
- attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
- CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
- attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] );
- CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final );
-
- out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] );
-
- i = 4;
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xRatio);
- CHECK_STATUS_FAIL_GOTO(status, final );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yRatio);
- CHECK_STATUS_FAIL_GOTO(status, final );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xOffset);
- CHECK_STATUS_FAIL_GOTO(status, final );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yOffset);
- CHECK_STATUS_FAIL_GOTO(status, final );
- status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &rMean);
- CHECK_STATUS_FAIL_GOTO(status, final );
- status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &gMean);
- CHECK_STATUS_FAIL_GOTO(status, final );
- status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &bMean);
- CHECK_STATUS_FAIL_GOTO(status, final );
- status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &var);
- CHECK_STATUS_FAIL_GOTO(status, final );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &order);
- CHECK_STATUS_FAIL_GOTO(status, final );
- status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &trans);
- CHECK_STATUS_FAIL_GOTO(status, final );
-
- buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
- CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
-
- buffer[1] =
(float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); - - buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create input2 buffer fail.", final ); - - buffer[3] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); - memset( buffer[3], 0, out_elements * sizeof(float) ); - - if(trans) - { - outBuffer = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( outBuffer, "Create output buffer fail.", final ); - memset( outBuffer, 0, out_elements * sizeof(float) ); - } - - { - uint8_t rline1[2], rline2[2]; - uint8_t gline1[2], gline2[2]; - uint8_t bline1[2], bline2[2]; - int32_t dx, dy, dz; - int32_t src_width = (int32_t)attr[0]->shape->data[0]; - int32_t src_height = (int32_t)attr[0]->shape->data[1]; - int32_t subWidth = src_width >> 1; - int32_t subHeight = src_height >> 1; - int32_t dst_width = (int32_t)(trans ? attr[3]->shape->data[1] : attr[3]->shape->data[0]); - int32_t dst_height = (int32_t)(trans ? attr[3]->shape->data[2] : attr[3]->shape->data[1]); - int32_t stride = dst_width * dst_height; - int32_t rOffset = 0; - int32_t gOffset = 1 * stride; - int32_t bOffset = 2 * stride; - int32_t subIdx = 0; - int32_t C, D, E; - uint8_t R, G, B; - int32_t min = 0; - int32_t max = 255; - - if(order) - { - rOffset = 2 * stride; - bOffset = 0; - } - - for ( dz = 0; dz < 1; dz ++) - { - for ( dy = 0; dy < (int32_t)dst_height; dy ++) - { - for ( dx = 0; dx < (int32_t)dst_width; dx ++) - { - int32_t source_index = 0; - int32_t output_index = dx + dy * dst_width; - int32_t dstR_idx = output_index + rOffset; - int32_t dstG_idx = output_index + gOffset; - int32_t dstB_idx = output_index + bOffset; - float finalVal = 0; - - if(xRatio != (1 << 15) || yRatio != (1 << 15)) - { - int32_t fx = (dx * xRatio + (xRatio >> 1)) - (1 << 14); - int32_t sx = fx & 0xffff8000; // Floor - int32_t fy = 0, sy = 0; - int32_t temp1 = 0, temp2 = 0; - - fx -= sx; - sx = sx >> 15; - - sx = sx < 0 ? 0 : sx; - sx = sx > src_width ? src_width - 1: sx; - - fx = (fx +(1 << 4)) >> 5; - - // for y - fy = (dy * yRatio + (yRatio >> 1)) - (1<< 14); - sy = fy & 0xffff8000; // Floor - fy -= sy; - sy = sy >> 15; - - sy = sy < 0 ? 0 : sy; - fy = fy < 0 ? 
0 : fy; - - fy = (fy + (1<< 4)) >> 5; - - sx += xOffset; - sy += yOffset; - source_index = (sx + sy * src_width + dz * src_width * src_height + 0); - subIdx = ((sx >> 1) + (sy >> 1) * subWidth + dz * subWidth * subHeight + 0); - - /*C = ySrc[source_index] - 16; - D = uSrc[subIdx] - 128; - E = vSrc[subIdx] - 128;*/ - C = (int)buffer[0][source_index] - 16; - D = (int)buffer[1][subIdx] - 128; - E = (int)buffer[2][subIdx] - 128; - - rline1[0] = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); - gline1[0] = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); - bline1[0] = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); - - // right - subIdx = (((sx + 1) >> 1) + (sy >> 1) * subWidth + dz * subWidth * subHeight); - C = (int)buffer[0][source_index + 1] - 16; - D = (int)buffer[1][subIdx] - 128; - E = (int)buffer[2][subIdx] - 128; - - rline1[1] = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); - gline1[1] = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); - bline1[1] = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); - - // below - subIdx = (((sx + 0) >> 1) + ((sy + 1) >> 1) * subWidth + dz * subWidth * subHeight); - C = (int)buffer[0][source_index + src_width] - 16; - D = (int)buffer[1][subIdx] - 128; - E = (int)buffer[2][subIdx] - 128; - - rline2[0] = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); - gline2[0] = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); - bline2[0] = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); - - // below right - //C = ySrc[source_index + src_width + 1] - 16; - subIdx = (((sx + 1) >> 1) + ((sy + 1) >> 1) * subWidth + dz * subWidth * subHeight); - C = (int)buffer[0][source_index + src_width + 1] - 16; - D = (int)buffer[1][subIdx] - 128; - E = (int)buffer[2][subIdx] - 128; - - rline2[1] = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); - gline2[1] = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); - bline2[1] = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); - - //B - temp1 = fx * (bline1[1] - bline1[0]) + (bline1[0] << 10); - temp2 = fx * (bline2[1] - bline2[0]) + (bline2[0] << 10); - temp1 = fy * (temp2 - temp1) + (temp1 << 10); - B = (uint8_t)(DESCALE(temp1)); - finalVal = (B - bMean) * var; - buffer[3][dstB_idx] = finalVal; - - //G - temp1 = fx * (gline1[1] - gline1[0]) + (gline1[0] << 10); - temp2 = fx * (gline2[1] - gline2[0]) + (gline2[0] << 10); - temp1 = fy * (temp2 - temp1) + (temp1 << 10); - - G = (uint8_t)(DESCALE(temp1)); - finalVal = (G - gMean) * var; - buffer[3][dstG_idx] = finalVal; - - // R - temp1 = fx * (rline1[1] - rline1[0]) + (rline1[0] << 10); - temp2 = fx * (rline2[1] - rline2[0]) + (rline2[0] << 10); - temp1 = fy * (temp2 - temp1) + (temp1 << 10); - R = (uint8_t)(DESCALE(temp1)); - finalVal = (R - rMean) * var; - buffer[3][dstR_idx] = finalVal; - } - else - { - // do conversion - C = (int)buffer[0][source_index] - 16; - D = (int)buffer[1][subIdx] - 128; - E = (int)buffer[2][subIdx] - 128; - - R = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); - G = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); - B = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); - - buffer[3][dstB_idx] = (B - bMean) * var; - buffer[3][dstG_idx] = (G - gMean) * var; - buffer[3][dstR_idx] = (R - rMean) * var; - } - } - } - } - } - - if(trans) - { - vsi_size_t shape[] = {attr[3]->shape->data[0], attr[3]->shape->data[1], 
attr[3]->shape->data[2], 1}; - vsi_size_t perm[] = {1, 2, 0, 3}; - vsi_nn_Transpose((uint8_t*)outBuffer, (uint8_t*)buffer[3], - shape, (uint32_t)attr[3]->shape->size, perm, VSI_NN_TYPE_FLOAT32); - - status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], - outBuffer, out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - else - { - status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], - buffer[3], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - if(outBuffer) - { - free(outBuffer); - } - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - return status; -} /* _pre_process_yuv420_exec() */ - -static vx_param_description_t kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _pre_process_yuv420_exec; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - uint32_t index = 4; - int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); - int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); - int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); - int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); - float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); - float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); - float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); - float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); - int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); - int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); - - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - - backend_params[index++] = 
vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
- backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y );
- backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left );
- backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top );
- backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean );
- backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean );
- backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
- backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale );
- backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
- backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans );
- /* Pass parameters to node. */
- status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
- CHECK_STATUS( status );
- vsi_nn_kernel_scalar_release( &backend_params[4] );
- vsi_nn_kernel_scalar_release( &backend_params[5] );
- vsi_nn_kernel_scalar_release( &backend_params[6] );
- vsi_nn_kernel_scalar_release( &backend_params[7] );
- vsi_nn_kernel_scalar_release( &backend_params[8] );
- vsi_nn_kernel_scalar_release( &backend_params[9] );
- vsi_nn_kernel_scalar_release( &backend_params[10] );
- vsi_nn_kernel_scalar_release( &backend_params[11] );
- vsi_nn_kernel_scalar_release( &backend_params[12] );
- vsi_nn_kernel_scalar_release( &backend_params[13] );
- }
- else
- {
- status = VSI_FAILURE;
- }
- }
- return node;
-} /* _setup() */
-
-__END_DECLS
-
-REGISTER_BACKEND_CPU( pre_process_yuv420, _setup )
diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv422_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv422_cpu.c
deleted file mode 100644
index 189ef8f..0000000
--- a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv422_cpu.c
+++ /dev/null
@@ -1,405 +0,0 @@
-/****************************************************************************
-*
-* Copyright (c) 2020 Vivante Corporation
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in
-* all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-* DEALINGS IN THE SOFTWARE.
-*
-*****************************************************************************/
-
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include "vsi_nn_types.h"
-#include "vsi_nn_tensor.h"
-#include "vsi_nn_graph.h"
-#include "vsi_nn_log.h"
-#include "vsi_nn_error.h"
-#include "vsi_nn_prv.h"
-#include "vsi_nn_tensor_util.h"
-#include "utils/vsi_nn_util.h"
-#include "kernel/vsi_nn_kernel.h"
-
-__BEGIN_DECLS
-
-/*
- * Define kernel meta.
- */ -#define _CPU_ARG_NUM (11) -#define _CPU_INPUT_NUM (1) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.pre_process_yuv422_sw") - -#define DESCALE(x) (((x) + (1<<19)) >> 20) - -static vx_param_description_t kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - - -DEF_KERNEL_EXECUTOR(_pre_process_yuv422_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - float * outBuffer = NULL; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i = 0; - int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0; - float rMean = 0, gMean = 0, bMean = 0, var = 0; - int32_t order = 0, trans = 0, yuv422_type = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); - - i = 2; - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xRatio); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yRatio); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xOffset); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yOffset); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &rMean); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &gMean); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &bMean); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &var); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &order); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &trans); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = 
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yuv422_type); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); - memset( buffer[1], 0, out_elements * sizeof(float) ); - - if(trans) - { - outBuffer = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( outBuffer, "Create output buffer fail.", final ); - memset( outBuffer, 0, out_elements * sizeof(float) ); - } - - { - int32_t dx, dy, dz; - int32_t src_width = (int32_t)attr[0]->shape->data[0]; - int32_t dst_width = (int32_t)(trans ? attr[1]->shape->data[1] : attr[1]->shape->data[0]); - int32_t dst_height = (int32_t)(trans ? attr[1]->shape->data[1] : attr[1]->shape->data[1]); - int32_t stride = (int32_t)(dst_width * dst_height); - int32_t rOffset = 0; - int32_t gOffset = 1 * stride; - int32_t bOffset = 2 * stride; - float D0, D1, E0, E1; - float R0, G0, B0, R1, G1, B1; - float min = 0; - float max = 255; - float* src_y_slice = NULL; - - uint32_t roi_width = (xRatio * dst_width) >> 15; - uint32_t roi_height = (yRatio * dst_height) >> 15; - uint32_t xrIntFloat_16 = (roi_width << 16) / dst_width + 1; - uint32_t yrIntFloat_16 = (roi_height << 16) / dst_height + 1; - uint32_t srcy = 0, srcx = 0; - - if(attr[1]->dtype == I8) - { - min = -128; - max = 127; - } - else if(attr[1]->dtype == I16 || attr[1]->dtype == F16) - { - min = -32768; - max = 32767; - } - - if(order) - { - rOffset = 2 * stride; - bOffset = 0; - } - - for ( dz = 0; dz < 1; dz ++) - { - for ( dy = 0; dy < (int32_t)dst_height; dy++) - { - srcy = (((uint32_t)dy * yrIntFloat_16) >> 16) + yOffset; - src_y_slice = buffer[0] + (srcy) * src_width; - for ( dx = 0; dx < (int32_t)dst_width; dx += 2) - { - int32_t output_index = 0; - int32_t dstR_idx = 0, dstG_idx = 0, dstB_idx = 0; - float tmpY0 = 0.0f; - float tmpY1 = 0.0f; - float tmpU0 = 0.0f; - float tmpU1 = 0.0f; - float tmpV0 = 0.0f; - float tmpV1 = 0.0f; - - srcx = ((((uint32_t)dx * xrIntFloat_16) >> 16) + xOffset) * 2; - - if (xrIntFloat_16 >> 16 == 1) - { - if (yuv422_type == 1) - { - tmpY0 = src_y_slice[srcx + 1]; - tmpU0 = src_y_slice[srcx]; - tmpY1 = src_y_slice[srcx + 3]; - tmpV0 = src_y_slice[srcx + 2]; - tmpU1 = tmpU0; - tmpV1 = tmpV0; - } - else - { - tmpY0 = src_y_slice[srcx]; - tmpU0 = src_y_slice[srcx + 1]; - tmpY1 = src_y_slice[srcx + 2]; - tmpV0 = src_y_slice[srcx + 3]; - tmpU1 = tmpU0; - tmpV1 = tmpV0; - } - } - else - { - if (yuv422_type == 1) - { - tmpY0 = src_y_slice[srcx + 1]; - tmpU0 = src_y_slice[(srcx / 4) * 4]; - tmpV0 = src_y_slice[(srcx / 4) * 4 + 2]; - srcx = (((uint32_t)(dx + 1) * xrIntFloat_16) >> 16) + xOffset; - srcx = srcx * 2; - tmpY1 = src_y_slice[srcx + 1]; - tmpU1 = src_y_slice[(srcx / 4) * 4]; - tmpV1 = src_y_slice[(srcx / 4) * 4 + 2]; - } - else - { - tmpY0 = src_y_slice[srcx]; - tmpU0 = src_y_slice[(srcx / 4) * 4 + 1]; - tmpV0 = src_y_slice[(srcx / 4) * 4 + 3]; - srcx = (((uint32_t)(dx + 1) * xrIntFloat_16) >> 16) + xOffset; - srcx = srcx * 2; - tmpY1 = src_y_slice[srcx]; - tmpU1 = src_y_slice[(srcx / 4) * 4 + 1]; - tmpV1 = src_y_slice[(srcx / 4) * 4 + 3]; - } - } - - D0 = (tmpU0 - 128); - E0 = (tmpV0 - 128); - D1 = (tmpU1 - 128); - E1 = (tmpV1 - 128); - - B0 = (float)vsi_clamp((tmpY0 + (1.7790 * D0)), min, max); - G0 = (float)vsi_clamp((tmpY0 - 0.3455 * D0 - 0.7169 * 
E0), min, max); - R0 = (float)vsi_clamp((tmpY0 + 1.4065 * E0), min, max); - - B1 = (float)vsi_clamp((tmpY1 + (1.7790 * D1)), min, max); - G1 = (float)vsi_clamp((tmpY1 - 0.3455 * D1 - 0.7169 * E1), min, max); - R1 = (float)vsi_clamp((tmpY1 + 1.4065 * E1), min, max); - - output_index = dx + dy * dst_width; - - dstR_idx = output_index + rOffset; - dstG_idx = output_index + gOffset; - dstB_idx = output_index + bOffset; - - buffer[1][dstB_idx] = (B0 - bMean) * var; - buffer[1][dstG_idx] = (G0 - gMean) * var; - buffer[1][dstR_idx] = (R0 - rMean) * var; - - dstR_idx += 1; - dstG_idx += 1; - dstB_idx += 1; - - buffer[1][dstB_idx] = (B1 - bMean) * var; - buffer[1][dstG_idx] = (G1 - gMean) * var; - buffer[1][dstR_idx] = (R1 - rMean) * var; - } - } - } - } - - if(trans) - { - vsi_size_t shape[] = {attr[1]->shape->data[0], attr[1]->shape->data[1], attr[1]->shape->data[2], 1}; - vsi_size_t perm[] = {1, 2, 0, 3}; - vsi_nn_Transpose((uint8_t*)outBuffer, (uint8_t*)buffer[1], - shape, (uint32_t)attr[1]->shape->size, perm, VSI_NN_TYPE_FLOAT32); - - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - outBuffer, out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - else - { - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - buffer[1], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - if(outBuffer) - { - free(outBuffer); - } - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - return status; -} /* _pre_process_yuv422_exec() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _pre_process_yuv422_exec; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_CPU_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - status = _query_kernel( kernel, inputs, outputs); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - uint32_t index = 2; - int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); - int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); - int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); - int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); - float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); - float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); - float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); - float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); - int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); - int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); - int32_t yuv422_type = vsi_nn_kernel_param_get_int32( params, "yuv422_type" ); - - /* Set inputs and outputs */ - 
vsi_nn_kernel_node_pack_io( node_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &yuv422_type ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _CPU_PARAM_NUM ); - CHECK_STATUS( status ); - vsi_nn_kernel_scalar_release( &node_params[2] ); - vsi_nn_kernel_scalar_release( &node_params[3] ); - vsi_nn_kernel_scalar_release( &node_params[4] ); - vsi_nn_kernel_scalar_release( &node_params[5] ); - vsi_nn_kernel_scalar_release( &node_params[6] ); - vsi_nn_kernel_scalar_release( &node_params[7] ); - vsi_nn_kernel_scalar_release( &node_params[8] ); - vsi_nn_kernel_scalar_release( &node_params[9] ); - vsi_nn_kernel_scalar_release( &node_params[10] ); - vsi_nn_kernel_scalar_release( &node_params[11] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( pre_process_yuv422, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c deleted file mode 100644 index 007d9c8..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c +++ /dev/null @@ -1,413 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
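
Editor's note on the pre_process_yuv422 kernel deleted above: each 4-byte YUV422 macropixel carries two output pixels, packed as Y0 U Y1 V (YUYV) or U Y0 V Y1 (UYVY, selected by yuv422_type == 1), and the conversion uses the float coefficients R = Y + 1.4065*(V-128), G = Y - 0.3455*(U-128) - 0.7169*(V-128), B = Y + 1.7790*(U-128) before the (value - mean) * scale normalization. A minimal standalone sketch of that per-macropixel math, assuming 8-bit input; the function and type names here are illustrative, only the coefficients and clamp order come from the deleted source:

    #include <stdint.h>

    typedef struct { float r, g, b; } rgbf_t;

    /* Convert one 4-byte YUV422 macropixel into two RGB pixels.
       yuv422_type == 1 selects UYVY byte order, anything else YUYV. */
    static void yuv422_macropixel_to_rgb(const uint8_t p[4], int yuv422_type,
                                         rgbf_t out[2])
    {
        float y[2], u, v;
        int i;
        if (yuv422_type == 1) { u = p[0]; y[0] = p[1]; v = p[2]; y[1] = p[3]; }
        else                  { y[0] = p[0]; u = p[1]; y[1] = p[2]; v = p[3]; }
        for (i = 0; i < 2; i++)
        {
            float d = u - 128.0f, e = v - 128.0f;
            float r = y[i] + 1.4065f * e;                   /* red from V */
            float g = y[i] - 0.3455f * d - 0.7169f * e;     /* green from U and V */
            float b = y[i] + 1.7790f * d;                   /* blue from U */
            out[i].r = r < 0.0f ? 0.0f : (r > 255.0f ? 255.0f : r);
            out[i].g = g < 0.0f ? 0.0f : (g > 255.0f ? 255.0f : g);
            out[i].b = b < 0.0f ? 0.0f : (b > 255.0f ? 255.0f : b);
        }
    }

In the kernel itself the clamp bounds widen to the signed range when the output tensor is I8 or I16, and each channel is then written out as (value - mean) * var into planar R/G/B slices.
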
-* -*****************************************************************************/ - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -#define _CPU_ARG_NUM (10) -#define _CPU_INPUT_NUM (3) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.pre_process_yuv444_sw") - -#define DESCALE(x) (((x) + (1<<19)) >> 20) - -DEF_KERNEL_EXECUTOR(_pre_process_yuv444_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - float * outBuffer = NULL; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i = 0; - int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0; - float rMean = 0, gMean = 0, bMean = 0, var = 0; - int32_t order = 0, trans = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); - CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); - - i = 4; - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xRatio); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yRatio); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xOffset); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yOffset); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &rMean); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &gMean); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &bMean); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &var); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &order); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &trans); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = 
(float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); - - buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create input2 buffer fail.", final ); - - buffer[3] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); - memset( buffer[3], 0, out_elements * sizeof(float) ); - - if(trans) - { - outBuffer = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( outBuffer, "Create output buffer fail.", final ); - memset( outBuffer, 0, out_elements * sizeof(float) ); - } - - { - uint8_t rline1[2], rline2[2]; - uint8_t gline1[2], gline2[2]; - uint8_t bline1[2], bline2[2]; - int32_t dx, dy, dz; - int32_t src_width = (int32_t)attr[0]->shape->data[0]; - int32_t src_height = (int32_t)attr[0]->shape->data[1]; - int32_t dst_width = (int32_t)(trans ? attr[3]->shape->data[1] : attr[3]->shape->data[0]); - int32_t dst_height = (int32_t)(trans ? attr[3]->shape->data[2] : attr[3]->shape->data[1]); - int32_t stride = dst_width * dst_height; - int32_t rOffset = 0; - int32_t gOffset = 1 * stride; - int32_t bOffset = 2 * stride; - int32_t C, D, E; - uint8_t R, G, B; - int32_t min = 0; - int32_t max = 255; - - if(order) - { - rOffset = 2 * stride; - bOffset = 0; - } - - for ( dz = 0; dz < 1; dz ++) - { - for ( dy = 0; dy < (int32_t)dst_height; dy ++) - { - for ( dx = 0; dx < (int32_t)dst_width; dx ++) - { - int32_t source_index = 0; - int32_t output_index = dx + dy * dst_width; - int32_t dstR_idx = output_index + rOffset; - int32_t dstG_idx = output_index + gOffset; - int32_t dstB_idx = output_index + bOffset; - float finalVal = 0; - - if(xRatio != (1 << 15) || yRatio != (1 << 15)) - { - int32_t fx = (dx * xRatio + (xRatio >> 1)) - (1 << 14); - int32_t sx = fx & 0xffff8000; // Floor - int32_t fy = 0, sy = 0; - int32_t temp1 = 0, temp2 = 0; - - fx -= sx; - sx = sx >> 15; - - sx = sx < 0 ? 0 : sx; - sx = sx > src_width ? src_width - 1: sx; - - fx = (fx +(1 << 4)) >> 5; - - // for y - fy = (dy * yRatio + (yRatio >> 1)) - (1<< 14); - sy = fy & 0xffff8000; // Floor - fy -= sy; - sy = sy >> 15; - - sy = sy < 0 ? 0 : sy; - fy = fy < 0 ? 
0 : fy; - - fy = (fy + (1<< 4)) >> 5; - - sx += xOffset; - sy += yOffset; - source_index = (sx + sy * src_width + dz * src_width * src_height + 0); - - /*C = ySrc[source_index] - 16; - D = uSrc[subIdx] - 128; - E = vSrc[subIdx] - 128;*/ - C = (int)buffer[0][source_index] - 16; - D = (int)buffer[1][source_index] - 128; - E = (int)buffer[2][source_index] - 128; - - rline1[0] = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); - gline1[0] = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); - bline1[0] = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); - - // right - C = (int)buffer[0][source_index + 1] - 16; - D = (int)buffer[1][source_index + 1] - 128; - E = (int)buffer[2][source_index + 1] - 128; - - rline1[1] = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); - gline1[1] = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); - bline1[1] = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); - - // below - C = (int)buffer[0][source_index + src_width] - 16; - D = (int)buffer[1][source_index + src_width] - 128; - E = (int)buffer[2][source_index + src_width] - 128; - - rline2[0] = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); - gline2[0] = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); - bline2[0] = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); - - // below right - //C = ySrc[source_index + src_width + 1] - 16; - C = (int)buffer[0][source_index + src_width + 1] - 16; - D = (int)buffer[1][source_index + src_width + 1] - 128; - E = (int)buffer[2][source_index + src_width + 1] - 128; - - rline2[1] = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); - gline2[1] = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); - bline2[1] = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); - - //B - temp1 = fx * (bline1[1] - bline1[0]) + (bline1[0] << 10); - temp2 = fx * (bline2[1] - bline2[0]) + (bline2[0] << 10); - temp1 = fy * (temp2 - temp1) + (temp1 << 10); - B = (uint8_t)(DESCALE(temp1)); - finalVal = (B - bMean) * var; - buffer[3][dstB_idx] = finalVal; - - //G - temp1 = fx * (gline1[1] - gline1[0]) + (gline1[0] << 10); - temp2 = fx * (gline2[1] - gline2[0]) + (gline2[0] << 10); - temp1 = fy * (temp2 - temp1) + (temp1 << 10); - - G = (uint8_t)(DESCALE(temp1)); - finalVal = (G - gMean) * var; - buffer[3][dstG_idx] = finalVal; - - // R - temp1 = fx * (rline1[1] - rline1[0]) + (rline1[0] << 10); - temp2 = fx * (rline2[1] - rline2[0]) + (rline2[0] << 10); - temp1 = fy * (temp2 - temp1) + (temp1 << 10); - R = (uint8_t)(DESCALE(temp1)); - finalVal = (R - rMean) * var; - buffer[3][dstR_idx] = finalVal; - } - else - { - // do conversion - C = (int)buffer[0][source_index] - 16; - D = (int)buffer[1][source_index] - 128; - E = (int)buffer[2][source_index] - 128; - - R = (uint8_t)vsi_clamp((298 * C + 409 * E + 128) >> 8, min, max); - G = (uint8_t)vsi_clamp((298 * C - 100* D - 208 * E + 128) >> 8, min, max); - B = (uint8_t)vsi_clamp((298 * C + 516 * D + 128) >> 8, min, max); - - buffer[3][dstB_idx] = (B - bMean) * var; - buffer[3][dstG_idx] = (G - gMean) * var; - buffer[3][dstR_idx] = (R - rMean) * var; - } - } - } - } - } - - if(trans) - { - vsi_size_t shape[] = {attr[3]->shape->data[0], attr[3]->shape->data[1], attr[3]->shape->data[2], 1}; - vsi_size_t perm[] = {1, 2, 0, 3}; - vsi_nn_Transpose((uint8_t*)outBuffer, (uint8_t*)buffer[3], - shape, (uint32_t)attr[3]->shape->size, perm, VSI_NN_TYPE_FLOAT32); - - status = 
vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], - outBuffer, out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - else - { - status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], - buffer[3], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - if(outBuffer) - { - free(outBuffer); - } - - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - return status; -} /* _pre_process_yuv444_exec() */ - -static vx_param_description_t kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _pre_process_yuv444_exec; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - uint32_t index = 4; - int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); - int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); - int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); - int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); - float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); - float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); - float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); - float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); - int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); - int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); - - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); - 
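
The bilinear path in the yuv444 compute function above works entirely in fixed point: xRatio/yRatio are Q15 scale factors, the fractional parts are reduced to Q10 weights via (frac + (1<<4)) >> 5, the two horizontal lerps keep the rows in Q10, the vertical lerp raises the result to Q20, and DESCALE rounds back down to an integer. A standalone restatement of that arithmetic; the helper name is mine, the formula is copied from the loop above:

    #include <stdint.h>

    #define DESCALE(x) (((x) + (1 << 19)) >> 20)   /* round Q20 back to Q0 */

    /* a,b = top-left/top-right neighbours, c,d = bottom-left/bottom-right.
       fx, fy are Q10 interpolation weights in [0, 1024]. */
    static uint8_t bilerp_q10(int32_t a, int32_t b, int32_t c, int32_t d,
                              int32_t fx, int32_t fy)
    {
        int32_t top = fx * (b - a) + (a << 10);     /* Q10 horizontal lerp */
        int32_t bot = fx * (d - c) + (c << 10);     /* Q10 horizontal lerp */
        return (uint8_t)DESCALE(fy * (bot - top) + (top << 10)); /* Q20 vertical */
    }
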
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - CHECK_STATUS( status ); - vsi_nn_kernel_scalar_release( &backend_params[4] ); - vsi_nn_kernel_scalar_release( &backend_params[5] ); - vsi_nn_kernel_scalar_release( &backend_params[6] ); - vsi_nn_kernel_scalar_release( &backend_params[7] ); - vsi_nn_kernel_scalar_release( &backend_params[8] ); - vsi_nn_kernel_scalar_release( &backend_params[9] ); - vsi_nn_kernel_scalar_release( &backend_params[10] ); - vsi_nn_kernel_scalar_release( &backend_params[11] ); - vsi_nn_kernel_scalar_release( &backend_params[12] ); - vsi_nn_kernel_scalar_release( &backend_params[13] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( pre_process_yuv444, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c b/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c deleted file mode 100644 index 7209c9a..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c +++ /dev/null @@ -1,206 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
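
Both pre-process kernels above follow the same parameter-passing protocol: pack the tensor I/O into the leading slots, create one scalar per kernel argument, hand the whole array to vsi_nn_kernel_node_pass_param(), then release the local scalar handles. The immediate releases suggest the node retains its own references after pass_param; that is inferred from this patch, not from documented API behavior. Worth noting: the yuv422 variant creates eleven scalars (indices 2 through 12) but releases only indices 2 through 11, so its yuv422_type scalar handle is never released, while the yuv444 variant releases its full range (4 through 13). A loop over the argument range, shown here as an illustrative refactor rather than code from the patch, avoids that kind of off-by-one:

    /* The first _CPU_IO_NUM slots hold tensors; the scalars occupy
       [_CPU_IO_NUM, _CPU_PARAM_NUM) and can be released in one loop. */
    status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
    CHECK_STATUS( status );
    for( index = _CPU_IO_NUM; index < _CPU_PARAM_NUM; index ++ )
    {
        vsi_nn_kernel_scalar_release( &backend_params[index] );
    }
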
-* -*****************************************************************************/ - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -#define _CPU_ARG_NUM (0) -#define _CPU_INPUT_NUM (2) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("prelu_sw") - -static vsi_ssize_t _expand_offset - ( - vsi_ssize_t index, - vsi_size_t * shape, vsi_size_t rank, - vsi_size_t * strides, vsi_size_t * out_shape - ) -{ - vsi_size_t i; - vsi_ssize_t offset = 0; - - for( i = 0; i < rank && index; i ++ ) - { - if( shape[i] == out_shape[i] ) - { - offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); - } - index /= out_shape[i]; - } - return offset; -} - -DEF_KERNEL_EXECUTOR(_prelu_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VX_SUCCESS; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - vsi_size_t out_elements = 0; - vsi_size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - - vsi_nn_shape_get_stride( attr[0]->shape->data, (vsi_size_t)attr[0]->shape->size, stride_size[0] ); - vsi_nn_shape_get_stride( attr[1]->shape->data, (vsi_size_t)attr[1]->shape->size, stride_size[1] ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); - - buffer[2] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); - memset( buffer[2], 0, out_elements * sizeof(float) ); - - for( i = 0; i < out_elements; i ++ ) - { - vsi_ssize_t in0_offset = 0; - vsi_ssize_t in1_offset = 0; - float val1 = 0.f; - float val2 = 0.f; - - in0_offset = _expand_offset( i, attr[0]->shape->data, (vsi_size_t)attr[0]->shape->size, - stride_size[0], attr[2]->shape->data ); - in1_offset = _expand_offset( i, attr[1]->shape->data, (vsi_size_t)attr[1]->shape->size, - stride_size[1], attr[2]->shape->data ); - - val1 = buffer[0][in0_offset]; - val2 = buffer[1][in1_offset]; - - - buffer[2][i] = val1 >= 0 ? 
val1 : val1 * val2; - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], - buffer[2], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - vsi_nn_kernel_tensor_attr_release( &attr[i] ); - } - return status; -} /* _prelu_exec() */ - -static vx_param_description_t kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} -}; - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _prelu_exec; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t is_per_channel_alpha = 0; - - is_per_channel_alpha = vsi_nn_kernel_param_get_int32(params, "is_per_channel_alpha"); - - if (is_per_channel_alpha) - { - return NULL; - } - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( prelu, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c b/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c deleted file mode 100644 index 3bd40d6..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c +++ /dev/null @@ -1,260 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
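
Editor's note on the prelu kernel deleted above: _expand_offset maps a flat index in the broadcast output back into an input whose per-dimension extents either match the output's or are 1. ovxlib shapes are fastest-dimension-first, so the flat index is peeled one dimension at a time; a broadcast dimension fails the shape[i] == out_shape[i] test and contributes nothing to the offset. A standalone restatement with standard types (the names are mine, the logic is the function's):

    #include <stddef.h>
    #include <stdint.h>

    /* shape/strides describe the (possibly broadcast) input,
       out_shape the full output; both are fastest-dimension-first. */
    static int64_t expand_offset(int64_t index,
                                 const size_t *shape, size_t rank,
                                 const size_t *strides, const size_t *out_shape)
    {
        int64_t offset = 0;
        size_t i;
        for (i = 0; i < rank && index; i++)
        {
            if (shape[i] == out_shape[i])   /* extent matches: real coordinate */
            {
                offset += (int64_t)strides[i] * (index % out_shape[i]);
            }
            index /= out_shape[i];          /* broadcast extent 1 adds nothing */
        }
        return offset;
    }

With an input of shape {W, H, C} and a per-channel alpha of shape {1, 1, C}, every (x, y) position resolves to the same alpha element, which is exactly how the per-element val1 >= 0 ? val1 : val1 * val2 above picks its slope.
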
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include <float.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "kernel/vsi_nn_kernel.h" -#include "kernel/vsi_nn_kernel_eltwise.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. - */ -#define _CPU_ARG_NUM (0) -#define _CPU_INPUT_NUM (2) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("random_multinomial_sw") - -/* - * Kernel params - */ -static vx_param_description_t kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - // Add kernel parameters here -}; -#define _PARAM_NUM _cnt_of_array( kernel_param_def ) - -/* - * Kernel function - */ -static int upper_bound(float* a, int n, float x) { - int l = 0; - int h = n; - while (l < h) { - int mid = (l + h) / 2; - if (x >= a[mid]) { - l = mid + 1; - } else { - h = mid; - } - } - return l; -} /* upper_bound() */ - -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - vsi_size_t out_elements = 0; - vsi_size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t *random_integer = NULL; - float *random_float = NULL; - float *cdf = NULL; - uint32_t i = 0; - uint32_t n = 0; - uint32_t batch = 0; - uint32_t class_size = 0; - int32_t sample_num = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - - sample_num = (int32_t)attr[2]->shape->data[0]; - batch = (int32_t)attr[0]->shape->data[1]; - class_size = (int32_t)attr[0]->shape->data[0]; - - vsi_nn_kernel_tensor_attr_get_stride( attr[0], stride_size[0] ); - vsi_nn_kernel_tensor_attr_get_stride( attr[1], stride_size[1] ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); - - buffer[2] = (float *)malloc( 
out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); - memset( buffer[2], 0, out_elements * sizeof(float) ); - - random_integer = (uint32_t *)malloc(out_elements * sizeof(uint32_t)); - CHECK_PTR_FAIL_GOTO( random_integer, "Create buffer fail.", final ); - random_float = (float *)malloc(out_elements * sizeof(float)); - CHECK_PTR_FAIL_GOTO( random_float, "Create buffer fail.", final ); - cdf = (float *)malloc(class_size * sizeof(float)); - CHECK_PTR_FAIL_GOTO( cdf, "Create buffer fail.", final ); - - vsi_nn_random_init_for_philox_4x32_10((uint32_t)(buffer[1][0]), - (uint32_t)(buffer[1][1])); - vsi_nn_random_generate_by_philox_4x32_10(random_integer, (uint32_t)out_elements); - vsi_nn_random_uniform_transform(random_integer, - random_float, (uint32_t)out_elements); - - for (n = 0; n < batch; n++) - { - uint32_t c = 0; - float batch_max = -FLT_MAX; - float total = 0; - for(c = 0; c < class_size; c++) - { - uint32_t index = n * class_size + c; - batch_max = vsi_nn_max(batch_max, buffer[0][index]); - } - - for(c = 0; c < class_size; c++) - { - uint32_t index = n * class_size + c; - total += (float)(exp(buffer[0][index] - batch_max)); - cdf[c] = total; - } - - for(c = 0; c < (uint32_t)sample_num; c++) - { - uint32_t index = n * sample_num + c; - float target = random_float[index] * total; - uint32_t out_class = upper_bound(cdf, class_size, target); - buffer[2][index] = (float)out_class; - } - } - status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], - buffer[2], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - vsi_nn_kernel_tensor_attr_release( &attr[i] ); - } - - if (cdf) - { - free(cdf); - cdf = NULL; - } - if (random_integer) - { - free(random_integer); - random_integer = NULL; - } - if (random_float) - { - free(random_float); - random_float = NULL; - } - - return status; -} /* _compute() */ - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _PARAM_NUM, - inputs, input_num, outputs, output_num ); - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _PARAM_NUM ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( random_multinomial, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/reduceall_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reduceall_internal_cpu.c deleted file mode 100644 index 5999b8c..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/reduceall_internal_cpu.c +++ /dev/null @@ -1,236 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.reduceall_internal") - - -/* - * Kernel params - */ -static vx_param_description_t _reduceall_internal_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _REDUCEALL_INTERNAL_PARAM_NUM _cnt_of_array( _reduceall_internal_kernel_param_def ) - -#define SCALAR_INPUT_AXIS (2) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - int32_t axis = 0; - vsi_ssize_t outerSize = 1; - vsi_ssize_t axisSize = 1; - vsi_ssize_t innerSize = 1; - vsi_ssize_t inner = 0; - vsi_ssize_t outer = 0; - int32_t all_result = 0; - - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_AXIS], &axis); - CHECK_STATUS_FAIL_GOTO(status, final ); - - for (i = 0; i < (uint32_t)axis; i++) - { - innerSize *= in_attr[0]->shape->data[i]; - } - - axisSize = in_attr[0]->shape->data[axis]; - - for (i = (uint32_t)axis + 1; i < in_attr[0]->shape->size; i++) - { - outerSize *= in_attr[0]->shape->data[i]; - } - - for ( outer = 0; outer < outerSize; ++outer) - { - for ( inner = 0; inner < innerSize; ++inner) - { - all_result = (!!(f32_in_buffer[0][outer * axisSize * innerSize + inner])); - for (i = 1; i < (uint32_t)axisSize; ++i) - { - int32_t value = (!!(f32_in_buffer[0][(outer * axisSize + i) * innerSize + inner])); - all_result = all_result && value; - } - f32_out_buffer[0][outer * innerSize + inner] = (float)all_result; - } - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } 
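
The loop structure above generalizes to the whole reduce*_internal family that follows (the reduceany, reducemax and reducemin deletions below reuse exactly this traversal and differ only in the accumulator): with fastest-dimension-first shapes, innerSize is the product of extents below the reduced axis, outerSize the product above it, and element (outer, k, inner) lives at flat index (outer * axisSize + k) * innerSize + inner. A self-contained sketch of the AND-reduction; the names are mine:

    #include <stddef.h>

    /* in holds inner * axisSize * outer floats laid out fastest-dim-first;
       out holds inner * outer floats with the axis removed. */
    static void reduce_all_axis(const float *in, float *out,
                                size_t inner, size_t axisSize, size_t outer)
    {
        size_t o, n, k;
        for (o = 0; o < outer; o++)
        {
            for (n = 0; n < inner; n++)
            {
                /* seed with the truth value of the first element on the axis */
                int acc = in[o * axisSize * inner + n] != 0.0f;
                for (k = 1; k < axisSize; k++)
                {
                    acc = acc && (in[(o * axisSize + k) * inner + n] != 0.0f);
                }
                out[o * inner + n] = (float)acc;
            }
        }
    }

Swapping && for ||, vsi_nn_max, or vsi_nn_min (and seeding from the raw first element rather than its boolean coercion) yields the other three kernels in this patch.
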
- } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _reduceall_internal_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _reduceall_internal_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_REDUCEALL_INTERNAL_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t axis = 0; - - axis = vsi_nn_kernel_param_get_int32(params, "axis"); - status = _query_kernel( kernel, inputs, outputs ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _REDUCEALL_INTERNAL_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( - graph, I32, &axis ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _REDUCEALL_INTERNAL_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( reduceall_internal, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/reduceany_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reduceany_internal_cpu.c deleted file mode 100644 index 39a2ff4..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/reduceany_internal_cpu.c +++ /dev/null @@ -1,236 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
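
Stepping back to the random_multinomial kernel removed earlier in this patch: per batch it subtracts the row maximum before exponentiating (the usual overflow guard for softmax), accumulates the unnormalized probabilities into an inclusive prefix-sum cdf, and converts each uniform sample u into a class via upper_bound, i.e. the first index whose cdf entry exceeds u * total. A self-contained sketch of one draw; the names are mine, and the kernel itself obtains its uniforms from the vsi_nn_random_* Philox 4x32-10 helpers seeded by the second input tensor:

    #include <stdlib.h>
    #include <math.h>

    /* Draw one class index from `logits` given a uniform sample u in [0,1).
       Returns -1 on allocation failure. */
    static int sample_class(const float *logits, int class_size, float u)
    {
        float max_logit = logits[0], total = 0.0f;
        float *cdf = (float *)malloc(sizeof(float) * (size_t)class_size);
        int lo = 0, hi = class_size, c;
        if (cdf == NULL) return -1;
        for (c = 1; c < class_size; c++)
        {
            if (logits[c] > max_logit) max_logit = logits[c];
        }
        for (c = 0; c < class_size; c++)
        {
            total += (float)exp(logits[c] - max_logit);  /* unnormalized softmax */
            cdf[c] = total;                              /* inclusive prefix sum */
        }
        while (lo < hi)      /* upper_bound: first index with u * total < cdf */
        {
            int mid = (lo + hi) / 2;
            if (u * total >= cdf[mid]) lo = mid + 1; else hi = mid;
        }
        free(cdf);
        return lo;
    }

Because u < 1 guarantees u * total < cdf[class_size - 1], the search always lands on a valid class without normalizing the distribution.
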
-* -*****************************************************************************/ - - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. - */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.reduceany_internal") - - -/* - * Kernel params - */ -static vx_param_description_t _reduceany_internal_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _REDUCEANY_INTERNAL_PARAM_NUM _cnt_of_array( _reduceany_internal_kernel_param_def ) - -#define SCALAR_INPUT_AXIS (2) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - int32_t axis = 0; - vsi_ssize_t outerSize = 1; - vsi_ssize_t axisSize = 1; - vsi_ssize_t innerSize = 1; - vsi_ssize_t inner = 0; - vsi_ssize_t outer = 0; - int32_t any_result = 0; - - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_AXIS], &axis); - CHECK_STATUS_FAIL_GOTO(status, final ); - - for (i = 0; i < (uint32_t)axis; i++) - { - innerSize *= in_attr[0]->shape->data[i]; - } - - axisSize = in_attr[0]->shape->data[axis]; - - for (i = (uint32_t)axis + 1; i < in_attr[0]->shape->size; i++) - { - outerSize *= in_attr[0]->shape->data[i]; - } - - for ( outer = 0; outer < outerSize; ++outer) - { - for ( inner = 0; inner < innerSize; ++inner) - { - any_result = (!!(f32_in_buffer[0][outer * axisSize * innerSize + inner])); - for (i = 1; i < (uint32_t)axisSize; ++i) - { - int32_t value = (!!(f32_in_buffer[0][(outer * axisSize + i) * innerSize + inner])); - any_result = any_result || value; - } - f32_out_buffer[0][outer * innerSize + inner] = (float)any_result; - } - } - - /* 
save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _reduceany_internal_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _reduceany_internal_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_REDUCEANY_INTERNAL_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t axis = 0; - - axis = vsi_nn_kernel_param_get_int32(params, "axis"); - status = _query_kernel( kernel, inputs, outputs ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _REDUCEANY_INTERNAL_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( - graph, I32, &axis ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _REDUCEANY_INTERNAL_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( reduceany_internal, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/reducemax_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reducemax_internal_cpu.c deleted file mode 100644 index c1f688c..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/reducemax_internal_cpu.c +++ /dev/null @@ -1,236 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2019 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. - */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.reducemax_internal") - - -/* - * Kernel params - */ -static vx_param_description_t _reducemax_internal_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} -}; -#define _REDUCEMAX_INTERNAL_PARAM_NUM _cnt_of_array( _reducemax_internal_kernel_param_def ) - -#define SCALAR_INPUT_AXIS (2) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - int32_t axis = 0; - vsi_ssize_t outerSize = 1; - vsi_ssize_t axisSize = 1; - vsi_ssize_t innerSize = 1; - vsi_ssize_t inner = 0; - vsi_ssize_t outer = 0; - float maxValue = 0.0f; - - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_AXIS], &axis); - CHECK_STATUS_FAIL_GOTO(status, final ); - - for (i = 0; i < (uint32_t)axis; i++) - { - innerSize *= in_attr[0]->shape->data[i]; - } - - axisSize = in_attr[0]->shape->data[axis]; - - for (i = (uint32_t)axis + 1; i < in_attr[0]->shape->size; i++) - { - 
outerSize *= in_attr[0]->shape->data[i]; - } - - for ( outer = 0; outer < outerSize; ++outer) - { - for ( inner = 0; inner < innerSize; ++inner) - { - maxValue = f32_in_buffer[0][outer * axisSize * innerSize + inner]; - for (i = 1; i < (uint32_t)axisSize; ++i) - { - float value = f32_in_buffer[0][(outer * axisSize + i) * innerSize + inner]; - maxValue = vsi_nn_max(maxValue, value); - } - f32_out_buffer[0][outer * innerSize + inner] = (float)maxValue; - } - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _reducemax_internal_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _reducemax_internal_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_REDUCEMAX_INTERNAL_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t axis = 0; - - axis = vsi_nn_kernel_param_get_int32(params, "axis"); - status = _query_kernel( kernel, inputs, outputs ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _REDUCEMAX_INTERNAL_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( - graph, I32, &axis ); - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _REDUCEMAX_INTERNAL_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( reducemax_internal, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/reducemin_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reducemin_internal_cpu.c deleted file mode 100644 index 3151853..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/reducemin_internal_cpu.c +++ /dev/null @@ -1,237 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
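- *
- * All three reduce_* kernels removed in this change flatten the input as
- * [inner, axis, outer]: dims below `axis` multiply into innerSize, dims
- * above it into outerSize. Worked example (illustration, not original
- * source): shape {4, 3, 2} with axis = 1 gives innerSize = 4, axisSize = 3,
- * outerSize = 2; element (outer, i, inner) sits at flat offset
- * (outer * axisSize + i) * innerSize + inner, and the reduced value lands
- * at outer * innerSize + inner.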
- */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.reducemin_internal") - - -/* - * Kernel params - */ -static vx_param_description_t _reducemin_internal_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} -}; -#define _REDUCEMIN_INTERNAL_PARAM_NUM _cnt_of_array( _reducemin_internal_kernel_param_def ) - -#define SCALAR_INPUT_AXIS (2) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - int32_t axis = 0; - vsi_ssize_t outerSize = 1; - vsi_ssize_t axisSize = 1; - vsi_ssize_t innerSize = 1; - vsi_ssize_t inner = 0; - vsi_ssize_t outer = 0; - float minValue = 0.0f; - - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_AXIS], &axis); - CHECK_STATUS_FAIL_GOTO(status, final ); - - for (i = 0; i < (uint32_t)axis; i++) - { - innerSize *= in_attr[0]->shape->data[i]; - } - - axisSize = in_attr[0]->shape->data[axis]; - - for (i = (uint32_t)axis + 1; i < in_attr[0]->shape->size; i++) - { - outerSize *= in_attr[0]->shape->data[i]; - } - - for ( outer = 0; outer < outerSize; ++outer) - { - for ( inner = 0; inner < innerSize; ++inner) - { - minValue = f32_in_buffer[0][outer * axisSize * innerSize + inner]; - for (i = 1; i < (uint32_t)axisSize; ++i) - { - float value = f32_in_buffer[0][(outer * axisSize + i) * innerSize + inner]; - minValue = vsi_nn_min(minValue, value); - } - f32_out_buffer[0][outer * innerSize + inner] = (float)minValue; - } - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 
0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _reducemin_internal_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _reducemin_internal_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_REDUCEMIN_INTERNAL_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t axis = 0; - - axis = vsi_nn_kernel_param_get_int32(params, "axis"); - status = _query_kernel( kernel, inputs, outputs ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _REDUCEMIN_INTERNAL_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( - graph, I32, &axis ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _REDUCEMIN_INTERNAL_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( reducemin_internal, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/reduceprod_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reduceprod_internal_cpu.c deleted file mode 100644 index 64b87c8..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/reduceprod_internal_cpu.c +++ /dev/null @@ -1,235 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. - */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.reduceprod_internal") - - -/* - * Kernel params - */ -static vx_param_description_t _reduceprod_internal_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _REDUCEPROD_INTERNAL_PARAM_NUM _cnt_of_array( _reduceprod_internal_kernel_param_def ) - -#define SCALAR_INPUT_AXIS (2) -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - int32_t axis = 0; - vsi_ssize_t outerSize = 1; - vsi_ssize_t axisSize = 1; - vsi_ssize_t innerSize = 1; - vsi_ssize_t inner = 0; - vsi_ssize_t outer = 0; - float prodValue = 0.0f; - - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_AXIS], &axis); - CHECK_STATUS_FAIL_GOTO(status, final ); - - for (i = 0; i < (uint32_t)axis; i++) - { - innerSize *= in_attr[0]->shape->data[i]; - } - - axisSize = in_attr[0]->shape->data[axis]; - - for (i = (uint32_t)axis + 1; i < in_attr[0]->shape->size; i++) - { - outerSize *= in_attr[0]->shape->data[i]; - } - - for ( outer = 0; outer < outerSize; ++outer) - { - for ( inner = 0; inner < innerSize; ++inner) - { - prodValue = f32_in_buffer[0][outer * axisSize * innerSize + inner]; - for (i = 1; i < (uint32_t)axisSize; ++i) - { - float value = f32_in_buffer[0][(outer * axisSize + i) * innerSize + inner]; - prodValue = prodValue * value; - } - f32_out_buffer[0][outer * innerSize + inner] = (float)prodValue; - } - } - - /* save data */ - 
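-    /* vsi_nn_kernel_tensor_write_from_float() converts the float work buffer
-       back to the dtype (and quantization, if any) recorded in out_attr[i];
-       out_elements[i] was taken from the same attributes above. */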
for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _reduceprod_internal_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _reduceprod_internal_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_REDUCEPROD_INTERNAL_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t axis = 0; - - axis = vsi_nn_kernel_param_get_int32(params, "axis"); - status = _query_kernel( kernel, inputs, outputs ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _REDUCEPROD_INTERNAL_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( - graph, I32, &axis ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _REDUCEPROD_INTERNAL_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( reduceprod_internal, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/relu_keras_cpu.c b/src/tim/vx/internal/src/kernel/cpu/relu_keras_cpu.c deleted file mode 100644 index 3c4630d..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/relu_keras_cpu.c +++ /dev/null @@ -1,226 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. - */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.relu_keras") - - -/* - * Kernel params - */ -static vx_param_description_t _relu_keras_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _RELU_KERAS_PARAM_NUM _cnt_of_array( _relu_keras_kernel_param_def ) - -#define SCALAR_ALPHA (2) -#define SCALAR_MAX_VALUE (3) -#define SCALAR_THRESHOLD (4) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - float alpha = 0.0f; - float max_value = 0.0f; - float threshold = 0.0f; - - /* prepare data */ - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_ALPHA], &(alpha)); - vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MAX_VALUE], &(max_value)); - vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_THRESHOLD], &(threshold)); - - 
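-    /* Keras-style ReLU as computed below: clamp to max_value first, then
-       apply the threshold with slope alpha (summary of this loop, not
-       original text):
-           f(x) = max_value                 if x >= max_value
-                = x                         if threshold <= x < max_value
-                = alpha * (x - threshold)   otherwise
-       e.g. alpha = 0, threshold = 0, max_value = 6 reduces to ReLU6. */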
for (i = 0; i < out_elements[0]; i++) - { - float data = f32_in_buffer[0][i]; - - data = data >= max_value ? max_value : data; - data = data < threshold ? alpha * (data - threshold) : data; - f32_out_buffer[0][i] = data; - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _relu_keras_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _relu_keras_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_RELU_KERAS_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); - float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" ); - float threshold = vsi_nn_kernel_param_get_float32( params, "threshold" ); - - status = _query_kernel( kernel, inputs, outputs ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _RELU_KERAS_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_ALPHA] = vsi_nn_kernel_scalar_create( graph, F32, &alpha ); - node_params[SCALAR_MAX_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &max_value ); - node_params[SCALAR_THRESHOLD] = vsi_nn_kernel_scalar_create( graph, F32, &threshold ); - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _RELU_KERAS_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALPHA] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_MAX_VALUE] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_THRESHOLD] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( relu_keras, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/repeat_cpu.c b/src/tim/vx/internal/src/kernel/cpu/repeat_cpu.c deleted file mode 100644 index 3021604..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/repeat_cpu.c +++ /dev/null @@ -1,274 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2019 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "vsi_nn_error.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
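- *
- * repeat copies slices of input 0 along `axis`, where input 1 holds the
- * per-index repeat counts and param 3 is the axis scalar. For instance
- * (illustration of the axis == 2 branch below): with counts {2, 0, 1} per
- * channel, channel 0's w*h plane is written twice, channel 1 is dropped,
- * and channel 2 is written once, all packed back to back in the output.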
- */ -#define _CPU_ARG_NUM (1) -#define _CPU_INPUT_NUM (2) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.repeat") - -DEF_KERNEL_EXECUTOR(_repeat_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - vsi_ssize_t i = 0, j = 0, b = 0, c = 0; - int32_t axis = 0; - vsi_ssize_t outerSize = 1; - vsi_ssize_t outIdx = 0; - vsi_ssize_t width = 0, height = 0, channel = 0, batch = 0; - vsi_ssize_t spatial = 0, vol = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input0 buffer fail.", final ); - - buffer[2] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); - memset( buffer[2], 0, out_elements * sizeof(float) ); - - width = attr[0]->shape->data[0]; - height = attr[0]->shape->data[1]; - channel = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; - batch = attr[0]->shape->size > 3 ? 
attr[0]->shape->data[3] : 1;
-    spatial = width * height;
-    vol = spatial * channel;
-
-    for(i = 1; i < (int32_t)attr[0]->shape->size; i++)
-    {
-        outerSize *= attr[0]->shape->data[i];
-    }
-
-    if (axis == 0 && outerSize == 1)
-    {
-        for(i = 0; i < width; i++)
-        {
-            float data = buffer[0][i];
-            int32_t len = (int32_t)buffer[1][i];
-            for(j = 0; j < len; j++)
-            {
-                buffer[2][outIdx++] = data;
-            }
-        }
-    }
-    else if (axis == 0)
-    {
-        for(b = 0; b < batch; b++)
-        {
-            for(c = 0; c < channel; c++)
-            {
-                for(i = 0; i < height; i++)
-                {
-                    vsi_ssize_t len = (int32_t)buffer[1][i];
-                    vsi_ssize_t offset = i * width + c * spatial + b * vol;
-                    for(j = 0; j < len; j++)
-                    {
-                        memcpy(buffer[2] + outIdx, buffer[0] + offset, sizeof(float) * width);
-                        outIdx += width;
-                    }
-                }
-            }
-        }
-    }
-    else if (axis == 1)
-    {
-        for(b = 0; b < batch; b++)
-        {
-            for(c = 0; c < channel; c++)
-            {
-                for(i = 0; i < height; i++)
-                {
-                    vsi_ssize_t offset = i * width + c * spatial + b * vol;
-                    for(j = 0; j < width; j++)
-                    {
-                        vsi_ssize_t len = (vsi_ssize_t)buffer[1][j];
-                        float data = buffer[0][offset + j];
-                        vsi_ssize_t k = 0;
-                        for(k = 0; k < len; k++)
-                        {
-                            buffer[2][outIdx++] = data;
-                        }
-                    }
-                }
-            }
-        }
-    }
-    else if (axis == 2)
-    {
-        for(b = 0; b < batch; b++)
-        {
-            for(c = 0; c < channel; c++)
-            {
-                vsi_ssize_t len = (vsi_ssize_t)buffer[1][c];
-                vsi_ssize_t offset = c * spatial + b * vol;
-
-                for(j = 0; j < len; j++)
-                {
-                    memcpy(buffer[2] + outIdx, buffer[0] + offset, sizeof(float) * spatial);
-                    outIdx += spatial;
-                }
-            }
-        }
-    }
-    else
-    {
-        VSILOGE("axis is not supported");
-        status = VSI_FAILURE;
-        goto final;
-    }
-
-    status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2],
-            buffer[2], out_elements );
-    CHECK_STATUS_FAIL_GOTO( status, final );
-
-final:
-    for( i = 0; i < _CPU_IO_NUM; i ++ )
-    {
-        if( buffer[i] )
-        {
-            free( buffer[i] );
-        }
-    }
-    for( i = 0; i < _CPU_IO_NUM; i ++ )
-    {
-        if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
-    }
-    return status;
-} /* _repeat_exec() */
-/*
- * Kernel params
- */
-static vx_param_description_t _repeat_kernel_param_def[] =
-{
-    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
-    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
-    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
-    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
-    // Add kernel parameters here
-};
-#define _REPEAT_PARAM_NUM _cnt_of_array( _repeat_kernel_param_def )
-
-static vsi_status _query_kernel
-    (
-    vsi_nn_tensor_t* const* const inputs,
-    vsi_nn_tensor_t* const* const outputs,
-    vsi_nn_kernel_t* kernel
-    )
-{
-    snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
-    kernel->info.function = _repeat_exec;
-    kernel->info.parameters = _repeat_kernel_param_def;
-    kernel->info.numParams = _cnt_of_array( _repeat_kernel_param_def );
-
-    return VSI_SUCCESS;
-} /* _query_kernel() */
-
-static vsi_nn_kernel_node_t _setup
-    (
-    vsi_nn_graph_t * graph,
-    vsi_nn_tensor_t ** inputs,
-    size_t input_num,
-    vsi_nn_tensor_t ** outputs,
-    size_t output_num,
-    const vsi_nn_kernel_param_t * params,
-    vsi_nn_kernel_t * kernel
-    )
-{
-    vsi_status status = VSI_FAILURE;
-    vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
-    vsi_nn_kernel_node_t node = NULL;
-    int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
-
-    status = _query_kernel( inputs, outputs, kernel );
-    if( VSI_SUCCESS == status)
-    {
-        node = vsi_nn_kernel_create_node( graph, kernel );
-        if( node )
-        {
-            /* Set inputs and outputs */
-            vsi_nn_kernel_node_pack_io(
backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - backend_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &axis ); - - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - CHECK_STATUS( status ); - vsi_nn_kernel_scalar_release( &backend_params[3] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( repeat, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_1d_bilinear_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_1d_bilinear_cpu.c deleted file mode 100644 index ed1eff5..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/resize_1d_bilinear_cpu.c +++ /dev/null @@ -1,268 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
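- *
- * 1-D bilinear resize: each output column w maps back to a fractional
- * source coordinate input_w, and the two neighbours are blended (summary
- * of the loop below): out[w] = in[w0] * (1 - frac) + in[w1] * frac, where
- * frac = input_w - w0, w0 = (int)input_w, and w1 = min(w0 + 1, width - 1),
- * forced to 0 when input_w is negative.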
- */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.resize_1d_bilinear") - - -/* - * Kernel params - */ -static vx_param_description_t _resize_1d_bilinear_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _RESIZE_1D_BILINEAR_PARAM_NUM _cnt_of_array( _resize_1d_bilinear_kernel_param_def ) - -#define SCALAR_ALIGN_CORNERS (2) -#define SCALAR_HALF_PIXEL (3) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i = 0; - int32_t align_corners = 0; - int32_t half_pixel_centers = 0; - float width_scale = 1.0f; - vsi_size_t input_width = 0, output_width = 0; - uint32_t w = 0, out = 0; - vsi_size_t output_dims = 0; - float data00 = .0f, data01 = .0f, interpolation = .0f; - vsi_size_t index = 0; - vsi_size_t outer = 0; - - /* prepare data */ - for (i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for (i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners)); - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_HALF_PIXEL], &(half_pixel_centers)); - input_width = in_attr[0]->shape->data[0]; - output_width = out_attr[0]->shape->data[0]; - output_dims = (vsi_size_t)out_attr[0]->shape->size; - - if (align_corners && output_width > 1) - { - width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(output_width - 1); - } - else - { - width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)output_width; - } - - outer = 1; - - for (i = 1; i < output_dims; i++) - { - outer = outer * out_attr[0]->shape->data[i]; - } - - for (out = 0; out < outer; out++) - { - vsi_ssize_t input_base = out * input_width; - vsi_ssize_t output_base = out * output_width; - for (w = 0; w < output_width; w ++) - { - vx_float32 input_w; - vsi_ssize_t w0; - vsi_ssize_t w1; - if (half_pixel_centers) - { - input_w = ((vx_float32)w + 0.5f) * width_scale - 0.5f; 
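-                    /* half-pixel mapping: sample at pixel centres,
-                       src = (dst + 0.5) * scale - 0.5, which keeps the two
-                       grids aligned at their centres. */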
- } - else - { - input_w = w * width_scale; - } - w0 = (vsi_ssize_t)input_w; - w1 = input_w < 0 ? 0 : vsi_nn_min(w0 + 1, (vsi_ssize_t)(input_width - 1)); - index = input_base + w0; - data00 = f32_in_buffer[0][index]; - index = input_base + w1; - data01 = f32_in_buffer[0][index]; - - interpolation = data00 * (1 - (input_w - w0)) + - data01 * (input_w - w0); - index = output_base + w; - f32_out_buffer[0][index] = interpolation; - } - } - - - /* save data */ - for (i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for (i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _resize_1d_bilinear_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _resize_1d_bilinear_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_RESIZE_1D_BILINEAR_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); - int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); - - status = _query_kernel( kernel, inputs, outputs ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _RESIZE_1D_BILINEAR_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); - node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_1D_BILINEAR_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( resize_1d_bilinear, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_1d_nearest_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_1d_nearest_cpu.c deleted file mode 100644 index 195353d..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/resize_1d_nearest_cpu.c +++ /dev/null @@ -1,268 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
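- *
- * 1-D nearest-neighbour resize: each output column reads one source
- * element. Index rule used below: input_w = (w + 0.5) * scale with
- * half-pixel centres (note: no -0.5 here, unlike the bilinear kernel),
- * else w * scale; then in_x = min(round(input_w), width - 1) when
- * align_corners is set, otherwise min(floor(input_w), width - 1).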
- */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.resize_1d_nearest") - - -/* - * Kernel params - */ -static vx_param_description_t _resize_1d_nearest_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _RESIZE_1D_NEAREST_PARAM_NUM _cnt_of_array( _resize_1d_nearest_kernel_param_def ) - -#define SCALAR_ALIGN_CORNERS (2) -#define SCALAR_HALF_PIXEL (3) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i = 0; - int32_t align_corners = 0; - int32_t half_pixel_centers = 0; - float width_scale = 1.0f; - vsi_size_t input_width = 0, output_width = 0; - vsi_size_t w = 0, out = 0; - vsi_size_t output_dims = 0; - vsi_size_t outer = 0; - /* prepare data */ - for (i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for (i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners)); - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_HALF_PIXEL], &(half_pixel_centers)); - input_width = in_attr[0]->shape->data[0]; - output_width = out_attr[0]->shape->data[0]; - output_dims = (uint32_t)out_attr[0]->shape->size; - - if (align_corners && output_width > 1) - { - width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(output_width - 1); - } - else - { - width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)output_width; - } - - outer = 1; - - for (i = 1; i < output_dims; i++) - { - outer = outer * out_attr[0]->shape->data[i]; - } - - for (out = 0; out < outer; out++) - { - vsi_ssize_t input_base = out * input_width; - vsi_ssize_t output_base = out * output_width; - - for (w = 0; w < output_width; w ++) - { - float input_w; - vsi_size_t in_x; - vsi_ssize_t in_index; - vsi_ssize_t out_index; - - if (half_pixel_centers) - { - input_w = ((float)w + 0.5f) * width_scale; - } - else - { - input_w = w * width_scale; - } - if (align_corners) 
- { - in_x = vsi_nn_min((vsi_size_t)simple_round(input_w), input_width - 1); - } - else - { - in_x = vsi_nn_min((vsi_size_t)floorf(input_w), input_width - 1); - } - in_index = in_x + input_base; - out_index = w + output_base; - f32_out_buffer[0][out_index] = f32_in_buffer[0][in_index]; - } - } - - /* save data */ - for (i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - - for (i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _resize_1d_nearest_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _resize_1d_nearest_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_RESIZE_1D_NEAREST_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); - int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); - - status = _query_kernel( kernel, inputs, outputs ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _RESIZE_1D_NEAREST_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); - node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_1D_NEAREST_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( resize_1d_nearest, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c deleted file mode 100644 index 6b7a3d9..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c +++ /dev/null @@ -1,311 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
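- *
- * 2-D bilinear resize over W,H,C,N-ordered shapes (W innermost). The scale
- * per axis is (in - 1) / (out - 1) with align_corners (out > 1), else
- * in / out. The blend computed below, written out (illustration only):
- *   out = d00*(1-fh)*(1-fw) + d10*fh*(1-fw) + d01*(1-fh)*fw + d11*fh*fw
- * with fh = input_h - h0, fw = input_w - w0, and d10, d01 the (h1, w0),
- * (h0, w1) neighbours.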
- */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.resize_bilinear") - - -/* - * Kernel params - */ -static vx_param_description_t _resize_bilinear_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _RESIZE_BILINEAR_PARAM_NUM _cnt_of_array( _resize_bilinear_kernel_param_def ) - -#define SCALAR_ALIGN_CORNERS (2) -#define SCALAR_HALF_PIXEL (3) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - int32_t align_corners; - int32_t half_pixel_centers; - float width_scale; - float height_scale; - vsi_size_t input_width, output_width, input_height, output_height; - vsi_size_t b = 0, d = 0, w = 0, h = 0; - vsi_size_t output_depth, input_depth; - vsi_size_t output_batch; - vsi_size_t output_dims, input_dims; - float data00 = .0f, data01 = .0f, data10 = .0f, data11 = .0f, interpolation = .0f; - vsi_size_t input_width_orig; - vsi_size_t output_width_orig; - vsi_size_t index; - - /* prepare data */ - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners)); - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_HALF_PIXEL], &(half_pixel_centers)); - input_width = in_attr[0]->shape->data[0]; - input_height = in_attr[0]->shape->data[1]; - output_width = out_attr[0]->shape->data[0]; - output_height = out_attr[0]->shape->data[1]; - output_dims = (vsi_size_t)out_attr[0]->shape->size; - output_depth = output_dims > 2 ? out_attr[0]->shape->data[2] : 1; - output_batch = output_dims > 3 ? out_attr[0]->shape->data[3] : 1; - input_dims = (vsi_size_t)in_attr[0]->shape->size; - input_depth = input_dims > 2 ? 
in_attr[0]->shape->data[2] : 1; - input_width_orig = input_width; - output_width_orig = output_width; - - if (align_corners && output_width > 1) - { - width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(output_width - 1); - } - else - { - width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)output_width; - } - - if (align_corners && output_height > 1) - { - height_scale = ((vx_float32)(input_height - 1) * 1.0f) / (vx_float32)(output_height - 1); - } - else - { - height_scale = ((vx_float32)input_height * 1.0f) / (vx_float32)output_height; - } - - for (b = 0; b < output_batch; b ++) - { - for (d = 0; d < output_depth; d ++) - { - vsi_ssize_t input_base = b * input_depth * input_width_orig * input_height \ - + d * input_width_orig * input_height; - vsi_ssize_t output_base = b * output_depth * output_width_orig * output_height \ - + d * output_width_orig * output_height; - - for (h = 0; h < output_height; h ++) - { - vx_float32 input_h = h * height_scale; - vsi_size_t h0; - vsi_size_t h1; - - if (half_pixel_centers) - { - input_h = ((vx_float32)h + 0.5f) * height_scale - 0.5f; - } - else - { - input_h = h * height_scale; - } - h0 = (vsi_size_t)input_h; - h1 = input_h < 0 ? 0 : vsi_nn_min(h0 + 1, input_height - 1); - for (w = 0; w < output_width; w ++) - { - vx_float32 input_w; - vsi_ssize_t w0; - vsi_ssize_t w1; - if (half_pixel_centers) - { - input_w = ((vx_float32)w + 0.5f) * width_scale - 0.5f; - } - else - { - input_w = w * width_scale; - } - w0 = (vsi_ssize_t)input_w; - w1 = input_w < 0 ? 0 : vsi_nn_min(w0 + 1, (vsi_ssize_t)(input_width - 1)); - index = input_base + h0 * input_width_orig + w0; - data00 = f32_in_buffer[0][index]; - index = input_base + h0 * input_width_orig + w1; - data01 = f32_in_buffer[0][index]; - index = input_base + h1 * input_width_orig + w0; - data10 = f32_in_buffer[0][index]; - index = input_base + h1 * input_width_orig + w1; - data11 = f32_in_buffer[0][index]; - - interpolation = data00 * (1 - (input_h - h0)) * (1 - (input_w - w0)) + - data10 * (input_h - h0) * (1 - (input_w - w0)) + - data01 * (1 - (input_h - h0)) * (input_w - w0) + - data11 * (input_h - h0) * (input_w - w0); - index = output_base + h * output_width_orig + w; - f32_out_buffer[0][index] = interpolation; - } - } - } - } - - - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _resize_bilinear_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _resize_bilinear_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - 
vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_RESIZE_BILINEAR_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); - int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); - - status = _query_kernel( kernel, inputs, outputs ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _RESIZE_BILINEAR_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); - node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_BILINEAR_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( resize_bilinear, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_nhwc_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_nhwc_cpu.c deleted file mode 100644 index 61690c3..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_nhwc_cpu.c +++ /dev/null @@ -1,307 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2021 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.resize_bilinear_nhwc") - - -/* - * Kernel params - */ -static vx_param_description_t _resize_bilinear_nhwc_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _RESIZE_BILINEAR_NHWC_PARAM_NUM _cnt_of_array( _resize_bilinear_nhwc_kernel_param_def ) - -#define SCALAR_ALIGN_CORNERS (2) -#define SCALAR_HALF_PIXEL (3) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - int32_t align_corners; - int32_t half_pixel_centers; - float width_scale; - float height_scale; - vsi_size_t input_width, output_width, input_height, output_height; - vsi_size_t b = 0, d = 0, w = 0, h = 0; - vsi_size_t output_depth, input_depth; - vsi_size_t output_batch; - vsi_size_t output_dims; - float data00 = .0f, data01 = .0f, data10 = .0f, data11 = .0f, interpolation = .0f; - vsi_size_t input_width_orig; - vsi_size_t output_width_orig; - vsi_size_t index; - - /* prepare data */ - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners)); - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_HALF_PIXEL], &(half_pixel_centers)); - input_width = in_attr[0]->shape->data[1]; - input_height = in_attr[0]->shape->data[2]; - output_width = out_attr[0]->shape->data[1]; - output_height = out_attr[0]->shape->data[2]; - output_dims = (vsi_size_t)out_attr[0]->shape->size; - output_depth = out_attr[0]->shape->data[0]; - output_batch = output_dims > 3 ? 
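/* Editorial note, added; not part of the original file. Unlike the layout
 * used by the kernel above, this NHWC variant keeps channels innermost, so
 * pixel (w, h, d) of one batch is addressed as
 *   base + h * width * depth + w * depth + d
 * which is exactly the indexing used in the interpolation loops below. */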
out_attr[0]->shape->data[3] : 1; - input_depth = in_attr[0]->shape->data[0]; - input_width_orig = input_width; - output_width_orig = output_width; - - if (align_corners && output_width > 1) - { - width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(output_width - 1); - } - else - { - width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)output_width; - } - - if (align_corners && output_height > 1) - { - height_scale = ((vx_float32)(input_height - 1) * 1.0f) / (vx_float32)(output_height - 1); - } - else - { - height_scale = ((vx_float32)input_height * 1.0f) / (vx_float32)output_height; - } - - for (b = 0; b < output_batch; b ++) - { - vsi_ssize_t input_base = b * input_depth * input_width_orig * input_height; - vsi_ssize_t output_base = b * output_depth * output_width_orig * output_height; - - for (h = 0; h < output_height; h++) - { - vx_float32 input_h = h * height_scale; - vsi_size_t h0; - vsi_size_t h1; - - if (half_pixel_centers) - { - input_h = ((vx_float32)h + 0.5f) * height_scale - 0.5f; - } - else - { - input_h = h * height_scale; - } - h0 = (vsi_size_t)input_h; - h1 = input_h < 0 ? 0 : vsi_nn_min(h0 + 1, input_height - 1); - for (w = 0; w < output_width; w++) - { - vx_float32 input_w; - vsi_ssize_t w0; - vsi_ssize_t w1; - if (half_pixel_centers) - { - input_w = ((vx_float32)w + 0.5f) * width_scale - 0.5f; - } - else - { - input_w = w * width_scale; - } - w0 = (vsi_ssize_t)input_w; - w1 = input_w < 0 ? 0 : vsi_nn_min(w0 + 1, (vsi_ssize_t)(input_width - 1)); - - for (d = 0; d < output_depth; d++) - { - index = input_base + h0 * input_width_orig * input_depth + w0 * input_depth + d; - data00 = f32_in_buffer[0][index]; - index = input_base + h0 * input_width_orig * input_depth + w1 * input_depth + d; - data01 = f32_in_buffer[0][index]; - index = input_base + h1 * input_width_orig * input_depth + w0 * input_depth + d; - data10 = f32_in_buffer[0][index]; - index = input_base + h1 * input_width_orig * input_depth + w1 * input_depth + d; - data11 = f32_in_buffer[0][index]; - - interpolation = data00 * (1 - (input_h - h0)) * (1 - (input_w - w0)) + - data10 * (input_h - h0) * (1 - (input_w - w0)) + - data01 * (1 - (input_h - h0)) * (input_w - w0) + - data11 * (input_h - h0) * (input_w - w0); - index = output_base + h * output_width_orig * output_depth + w * output_depth + d; - f32_out_buffer[0][index] = interpolation; - } - } - } - } - - /* save data */ - for (i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for (i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _resize_bilinear_nhwc_kernel_param_def; - kernel->info.numParams = _cnt_of_array( 
_resize_bilinear_nhwc_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_RESIZE_BILINEAR_NHWC_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); - int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); - - status = _query_kernel( kernel, inputs, outputs ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); - node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( resize_bilinear_nhwc, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_nearest_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_nearest_cpu.c deleted file mode 100644 index d74f6cb..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/resize_nearest_cpu.c +++ /dev/null @@ -1,313 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.resize_nearest") - - -/* - * Kernel params - */ -static vx_param_description_t _resize_nearest_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _RESIZE_NEAREST_PARAM_NUM _cnt_of_array( _resize_nearest_kernel_param_def ) - -#define SCALAR_ALIGN_CORNERS (2) -#define SCALAR_HALF_PIXEL (3) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - int32_t align_corners; - int32_t half_pixel_centers; - float width_scale; - float height_scale; - vsi_size_t input_width, output_width, input_height, output_height; - vsi_size_t b = 0, d = 0, w = 0, h = 0; - vsi_size_t output_depth, input_depth; - vsi_size_t output_batch; - vsi_size_t output_dims, input_dims; - vsi_size_t input_width_orig; - vsi_size_t output_width_orig; - - /* prepare data */ - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners)); - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_HALF_PIXEL], &(half_pixel_centers)); - input_width = in_attr[0]->shape->data[0]; - input_height = in_attr[0]->shape->data[1]; - output_width = out_attr[0]->shape->data[0]; - output_height = out_attr[0]->shape->data[1]; - output_dims = (vsi_size_t)out_attr[0]->shape->size; - output_depth = output_dims > 2 ? out_attr[0]->shape->data[2] : 1; - output_batch = output_dims > 3 ? out_attr[0]->shape->data[3] : 1; - input_dims = (vsi_size_t)in_attr[0]->shape->size; - input_depth = input_dims > 2 ? 
in_attr[0]->shape->data[2] : 1; - input_width_orig = input_width; - output_width_orig = output_width; - - if (align_corners && output_width > 1) - { - width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(output_width - 1); - } - else - { - width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)output_width; - } - - if (align_corners && output_height > 1) - { - height_scale = ((vx_float32)(input_height - 1) * 1.0f) / (vx_float32)(output_height - 1); - } - else - { - height_scale = ((vx_float32)input_height * 1.0f) / (vx_float32)output_height; - } - - for (b = 0; b < output_batch; b ++) - { - for (d = 0; d < output_depth; d ++) - { - vsi_ssize_t input_base = b * input_depth * input_width_orig * input_height \ - + d * input_width_orig * input_height; - vsi_ssize_t output_base = b * output_depth * output_width_orig * output_height \ - + d * output_width_orig * output_height; - - for (h = 0; h < output_height; h ++) - { - float input_h; - vsi_size_t in_y; - - if (half_pixel_centers) - { - input_h = ((float)h + 0.5f) * height_scale; - } - else - { - input_h = h * height_scale; - } - if (align_corners) - { - in_y = vsi_nn_min((vsi_size_t)simple_round(input_h), input_height - 1); - } - else - { - in_y = vsi_nn_min((vsi_size_t)floorf(input_h), input_height - 1); - } - - for (w = 0; w < output_width; w ++) - { - float input_w; - vsi_size_t in_x; - vsi_ssize_t in_index; - vsi_ssize_t out_index; - - if (half_pixel_centers) - { - input_w = ((float)w + 0.5f) * width_scale; - } - else - { - input_w = w * width_scale; - } - if (align_corners) - { - in_x = vsi_nn_min((vsi_size_t)simple_round(input_w), input_width - 1); - } - else - { - in_x = vsi_nn_min((vsi_size_t)floorf(input_w), input_width - 1); - } - in_index = in_x + in_y * input_width_orig + input_base; - out_index = w + h * output_width_orig + output_base; - f32_out_buffer[0][out_index] = f32_in_buffer[0][in_index]; - } - } - } - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _resize_nearest_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _resize_nearest_kernel_param_def ); - status = VSI_SUCCESS; - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_RESIZE_NEAREST_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - 
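/* Editorial note, added; not part of the original file. The executor above
 * reduces to the following per-axis mapping (x shown; y is analogous):
 *   input_w = half_pixel_centers ? (w + 0.5f) * width_scale : w * width_scale;
 *   in_x    = align_corners ? round(input_w) : floor(input_w);
 *   in_x    = min(in_x, input_width - 1);   // clamp to the source extent
 */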
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); - int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); - - status = _query_kernel( kernel, inputs, outputs ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _RESIZE_NEAREST_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); - node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_NEAREST_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( resize_nearest, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c b/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c deleted file mode 100644 index 071e5e7..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c +++ /dev/null @@ -1,384 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (3) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.roi_align") - - -/* - * Kernel params - */ -static vx_param_description_t _roi_align_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _ROI_ALIGN_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def ) -#define SCALAR_X_RATIO (4) -#define SCALAR_Y_RATIO (5) -#define SCALAR_X_SAMPLE (6) -#define SCALAR_Y_SAMPLE (7) - -/* - * Kernel function - */ -static float _compute_region_coordinate(int32_t p, float bin_size, float roi_anchor, float max_value) -{ - const float region_start = p * bin_size + roi_anchor; - - return region_start; -} - -static float _roi_align_1x1(float *input_ptr, - int32_t width, - int32_t height, - float region_start_x, - float bin_size_x, - int32_t grid_size_x, - float region_end_x, - float region_start_y, - float bin_size_y, - int32_t grid_size_y, - float region_end_y) -{ - float avg = 0; - int32_t iy = 0; - int32_t ix = 0; - // Iterate through the aligned pooling region - for (iy = 0; iy < grid_size_y; ++iy) - { - for (ix = 0; ix < grid_size_x; ++ix) - { - // Align the window in the middle of every bin - float y = region_start_y + - ((float)iy + 0.5f) * bin_size_y / (float)(grid_size_y); - float x = region_start_x + - ((float)ix + 0.5f) * bin_size_x / (float)(grid_size_x); - - // Interpolation in the [0,0] [0,1] [1,0] [1,1] square - const int32_t y_low = vsi_nn_min((int32_t)y, height - 1); - const int32_t x_low = vsi_nn_min((int32_t)x, width - 1); - const int32_t y_high = vsi_nn_min(y_low + 1, height - 1); - const int32_t x_high = vsi_nn_min(x_low + 1, width - 1); - - float ly = y - y_low; - float lx = x - x_low; - float hy = 1.0f - ly; - float hx = 1.0f - lx; - - float w1 = hy * hx; - float w2 = hy * lx; - float w3 = ly * hx; - float w4 = ly * lx; - - const float data1 = *(input_ptr + y_low * width + x_low); - const float data2 = *(input_ptr + y_low * width + x_high); - const float data3 = *(input_ptr + y_high * width + x_low); - const float data4 = *(input_ptr + y_high * width + x_high); - - /* onnx: inverse elements are out of feature map boundary */ - if (x > width || x < -1 || y > height || y < -1) continue; - - x = x_low >= width - 1 ? x_low : x; - y = y_low >= height - 1 ? 
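/* Editorial note, added; not part of the original file. The w1..w4 factors
 * above are the standard bilinear weights: with ly = y - y_low,
 * lx = x - x_low, hy = 1 - ly and hx = 1 - lx, the interpolated sample is
 *   hy*hx*data1 + hy*lx*data2 + ly*hx*data3 + ly*lx*data4
 * i.e. each of the four neighbours is weighted by the area of the opposite
 * sub-rectangle; sample points outside the feature map are skipped first. */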
y_low : y; - - ly = y - y_low; - lx = x - x_low; - hy = 1.0f - ly; - hx = 1.0f - lx; - - w1 = hy * hx; - w2 = hy * lx; - w3 = ly * hx; - w4 = ly * lx; - - avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; - } - } - - avg /= grid_size_x * grid_size_y; - - return avg; -} - -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i = 0; - float width_scale = 0.0f; - float height_scale = 0.0f; - float width_ratio = 0.0f; - float height_ratio = 0.0f; - int32_t width_sample_num = 0; - int32_t height_sample_num = 0; - uint32_t n = 0; - vsi_size_t num_rois = 0; - vsi_ssize_t inHeight = 0; - vsi_ssize_t inWidth = 0; - vsi_ssize_t inDepth = 0; - vsi_ssize_t outHeight = 0; - vsi_ssize_t outWidth = 0; - uint32_t kRoiDim = 4; - uint32_t out_index = 0; - - /* prepare data */ - for (i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for (i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_X_RATIO], &(width_ratio)); - vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_Y_RATIO], &(height_ratio)); - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_X_SAMPLE], &(width_sample_num)); - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_Y_SAMPLE], &(height_sample_num)); - - width_scale = 1.0f / width_ratio; - height_scale = 1.0f / height_ratio; - num_rois = in_attr[1]->shape->data[1]; - - inWidth = in_attr[0]->shape->data[0]; - inHeight = in_attr[0]->shape->data[1]; - inDepth = in_attr[0]->shape->data[2]; - outWidth = out_attr[0]->shape->data[0]; - outHeight = out_attr[0]->shape->data[1]; - - for (n = 0; n < num_rois; n++) - { - uint32_t batchId = (uint32_t)f32_in_buffer[2][n]; - float qx1 = f32_in_buffer[1][n * kRoiDim]; - float qy1 = f32_in_buffer[1][n * kRoiDim + 1]; - float qx2 = f32_in_buffer[1][n * kRoiDim + 2]; - float qy2 = f32_in_buffer[1][n * kRoiDim + 3]; - - float x1 = qx1; - float x2 = qx2; - float y1 = qy1; - float y2 = qy2; - float roi_anchor_x = x1 * width_scale; - float roi_anchor_y = y1 * height_scale; - float roi_dims_x = vsi_nn_max((x2 - x1) * width_scale, 1.0f); - float roi_dims_y = vsi_nn_max((y2 - y1) * 
height_scale, 1.0f); - float bin_size_x = roi_dims_x / outWidth; - float bin_size_y = roi_dims_y / outHeight; - - vsi_ssize_t batch_base_index = batchId * inHeight * inWidth * inDepth; - int32_t ch = 0; - int32_t py = 0; - int32_t px = 0; - - for (ch = 0; ch < inDepth; ch++) - { - for (py = 0; py < outHeight; py++) - { - for (px = 0; px < outWidth; px++) - { - float region_start_x = _compute_region_coordinate(px, bin_size_x, - roi_anchor_x, (float)inWidth); - float region_start_y = _compute_region_coordinate(py, bin_size_y, - roi_anchor_y, (float)inHeight); - float region_end_x = _compute_region_coordinate(px + 1, bin_size_x, - roi_anchor_x, (float)inWidth); - float region_end_y = _compute_region_coordinate(py + 1, bin_size_y, - roi_anchor_y, (float)inHeight); - - int32_t roi_bin_grid_x = (width_sample_num > 0) ? width_sample_num : (int32_t)(ceil(bin_size_x)); - int32_t roi_bin_grid_y = (height_sample_num > 0) ? height_sample_num : (int32_t)(ceil(bin_size_y)); - - float *input_ptr = &f32_in_buffer[0][batch_base_index + ch * inWidth * inHeight]; - float out_val = 0; - - out_val = _roi_align_1x1( - input_ptr, (int32_t)inWidth, (int32_t)inHeight, region_start_x, bin_size_x, - roi_bin_grid_x, region_end_x, region_start_y, bin_size_y, - roi_bin_grid_y, region_end_y); - - f32_out_buffer[0][out_index++] = out_val; - } - } - } - } - - /* save data */ - for (i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - - for (i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _roi_align_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _roi_align_kernel_param_def ); - status = VSI_SUCCESS; - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_ROI_ALIGN_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - float width_ratio = vsi_nn_kernel_param_get_float32( params, "width_ratio" ); - float height_ratio = vsi_nn_kernel_param_get_float32( params, "height_ratio" ); - int32_t width_sample_num = vsi_nn_kernel_param_get_int32( params, "width_sample_num" ); - int32_t height_sample_num = vsi_nn_kernel_param_get_int32( params, "height_sample_num" ); - - status = _query_kernel( kernel, inputs, outputs ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _ROI_ALIGN_PARAM_NUM, - 
inputs, input_num, outputs, output_num ); - node_params[SCALAR_X_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &width_ratio ); - node_params[SCALAR_Y_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &height_ratio ); - node_params[SCALAR_X_SAMPLE] = vsi_nn_kernel_scalar_create( graph, I32, &width_sample_num ); - node_params[SCALAR_Y_SAMPLE] = vsi_nn_kernel_scalar_create( graph, I32, &height_sample_num ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _ROI_ALIGN_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_X_RATIO] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_Y_RATIO] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_X_SAMPLE] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_Y_SAMPLE] ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( roi_align, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/scatter_elements_cpu.c b/src/tim/vx/internal/src/kernel/cpu/scatter_elements_cpu.c deleted file mode 100644 index b3cfbbc..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/scatter_elements_cpu.c +++ /dev/null @@ -1,258 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _ARG_NUM (2) -#define _INPUT_NUM (3) -#define _OUTPUT_NUM (1) -#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _CPU_PARAM_NUM (_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.scatter_elements") - - -/* - * Kernel params - */ -static vx_param_description_t _scatter_elements_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - // Add kernel parameters here -}; -#define _SCATTER_ELEMENTS_PARAM_NUM _cnt_of_array( _scatter_elements_kernel_param_def ) - - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[3] = { NULL }; - int32_t* buffer_idx = NULL; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - vsi_size_t a = 0; - vsi_size_t o = 0; - vsi_size_t i = 0; - vsi_size_t outer_size[2] = {1, 1}; - vsi_size_t inner_size[2] = {1, 1}; - vsi_size_t axis_size[2] = {1, 1}; - int32_t axis = 0; - int32_t reduction = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; - tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); - CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &reduction); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer_idx = (int32_t*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE ); - CHECK_PTR_FAIL_GOTO( buffer_idx, "Create input1 buffer fail.", final ); - - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input2 buffer fail.", final ); - - buffer[2] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); - memcpy( buffer[2], buffer[0], out_elements * sizeof(float) ); - - axis_size[0] = attr[0]->shape->data[axis]; - axis_size[1] = attr[1]->shape->data[axis]; - for (i = 0; i < (vsi_size_t)axis; ++i) - { - inner_size[0] *= attr[0]->shape->data[i]; - inner_size[1] *= attr[1]->shape->data[i]; - } - - for (i = axis + 1; i < attr[1]->shape->size; ++i) - { - outer_size[0] *= attr[0]->shape->data[i]; -
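/* Editorial note, added; not part of the original file. With
 * inner = prod(dims below axis) and outer = prod(dims above axis), element
 * (o, a, i) of a tensor flattened this way lives at offset
 *   (o * axis_size + a) * inner + i
 * so the scatter loop below keeps o and i fixed and only swaps a for the
 * value read from the indices tensor. */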
outer_size[1] *= attr[1]->shape->data[i]; - } - - for (o = 0; o < outer_size[1]; o++) - { - for (a = 0; a < axis_size[1]; a++) - { - for (i = 0; i < inner_size[1]; i++) - { - vsi_ssize_t index = 0; - vsi_size_t index0 = (o * axis_size[1] + a) * inner_size[1] + i; - vsi_size_t index1 = 1; - - index = (vsi_ssize_t)buffer_idx[index0]; - index1 = (o * axis_size[0] + index) * inner_size[0] + i; - - switch (reduction) - { - case VSI_NN_REDUCTION_TYPE_NONE: - buffer[2][index1] = buffer[1][index0]; - break; - case VSI_NN_REDUCTION_TYPE_ADD: - buffer[2][index1] += buffer[1][index0]; - break; - case VSI_NN_REDUCTION_TYPE_MUL: - buffer[2][index1] *= buffer[1][index0]; - break; - default: - break; - } - - - } - } - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], - buffer[2], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); -final: - if ( buffer_idx ) - { - free( buffer_idx ); - } - for ( i = 0; i < 3; i ++ ) - { - if ( buffer[i] ) - { - free( buffer[i] ); - } - } - for ( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_SUCCESS; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _scatter_elements_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _scatter_elements_kernel_param_def ); - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_SCATTER_ELEMENTS_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); - int32_t reduction = vsi_nn_kernel_param_get_int32( params, "reduction" ); - - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _SCATTER_ELEMENTS_PARAM_NUM, - inputs, input_num, outputs, output_num ); - /* Pass parameters to node. 
*/ - node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &axis ); - node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &reduction ); - status = vsi_nn_kernel_node_pass_param( node, node_params, _SCATTER_ELEMENTS_PARAM_NUM ); - vsi_nn_kernel_scalar_release( &node_params[4] ); - vsi_nn_kernel_scalar_release( &node_params[5] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( scatter_elements, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c b/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c deleted file mode 100644 index 030487a..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c +++ /dev/null @@ -1,247 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2019 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "vsi_nn_error.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _CPU_ARG_NUM (3) -#define _CPU_INPUT_NUM (2) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.scatter_nd") - -DEF_KERNEL_EXECUTOR(_scatter_nd_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - uint32_t * para_buffer[1] = { NULL }; - float * buffer[2] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - int32_t i = 0, j = 0; - int32_t block_size = 1, indices_num = 1; - int32_t coord_dim = 1; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; // idx int - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; // update - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; // output - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); - - para_buffer[0] = (uint32_t*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], FALSE ); - CHECK_PTR_FAIL_GOTO( para_buffer[0], "Create input0 buffer fail.", final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input1 buffer fail.", final ); - - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &(block_size)); - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &(coord_dim)); - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &(indices_num)); - - buffer[1] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); - memset( buffer[1], 0, out_elements * sizeof(float) ); - - if(coord_dim <= 3) - { - vsi_ssize_t stride[3] = {0, 0, 0}; - vsi_ssize_t new_shape[3] = {1, 1, 1}; - vsi_ssize_t merge_dim = (vsi_ssize_t)attr[2]->shape->size - coord_dim + 1; - - for(i = 0; i < merge_dim; ++i) - { - new_shape[0] *= attr[2]->shape->data[i]; - } - stride[0] = new_shape[0] / block_size; - - for(i = 1; i < coord_dim; ++i) - { - new_shape[i] = attr[2]->shape->data[merge_dim + i - 1]; - - stride[i] = stride[i - 1] * new_shape[i]; - } - - for(i = 0; i < indices_num; i++) - { - uint32_t in_index = i * block_size; - vsi_size_t out_index = 0; - uint32_t coord[3] = {0}; - int32_t byd_flg = 0; - - for(j = 0; j < coord_dim; j++) - { - coord[j] = para_buffer[0][i * coord_dim + coord_dim - j - 1]; - if(coord[j] >= (uint32_t)new_shape[j]) - { - byd_flg = 1; - break; - } - } - if(byd_flg) - { - continue; - } - - out_index = (coord[2] * stride[1] + coord[1] * stride[0] + coord[0]) * block_size; - for(j = 0; j < block_size; j++) - { - buffer[1][out_index + j] += buffer[0][in_index + j]; - } - } - } - else - { - status = VSI_FAILURE; - CHECK_STATUS_FAIL_GOTO( status, final ); - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], - buffer[1], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - if( para_buffer[0] ) - { - free( para_buffer[0] ); - } - for( i = 0; i < 2; i ++ ) - { - if( buffer[i] ) - { 
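/* Editorial note, added; not part of the original file. In the scatter loop
 * above, a coordinate (c0, c1, c2) read from the indices tensor selects the
 * output block at (c2 * stride[1] + c1 * stride[0] + c0) * block_size, after
 * the innermost dims have been merged into new_shape[0]; updates that land
 * on the same block accumulate via +=, and out-of-range coordinates are
 * silently dropped (byd_flg). */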
- free( buffer[i] ); - } - } - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - return status; -} /* _scatter_nd_exec() */ -/* - * Kernel params - */ -static vx_param_description_t _scatter_nd_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - // Add kernel parameters here -}; - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _scatter_nd_exec; - kernel->info.parameters = _scatter_nd_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _scatter_nd_kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); - int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); - int32_t idx_num = vsi_nn_kernel_param_get_int32( params, "idx_num" ); - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - uint32_t index = 3; - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &idx_num ); - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - CHECK_STATUS( status ); - vsi_nn_kernel_scalar_release( &backend_params[3] ); - vsi_nn_kernel_scalar_release( &backend_params[4] ); - vsi_nn_kernel_scalar_release( &backend_params[5] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( scatter_nd, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/scatter_nd_update_cpu.c b/src/tim/vx/internal/src/kernel/cpu/scatter_nd_update_cpu.c deleted file mode 100644 index 564e861..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/scatter_nd_update_cpu.c +++ /dev/null @@ -1,273 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2019 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "vsi_nn_error.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _CPU_ARG_NUM (3) -#define _CPU_INPUT_NUM (3) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.scatter_nd_update") - -DEF_KERNEL_EXECUTOR(_scatter_nd_update_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - uint32_t * para_buffer[1] = { NULL }; - uint32_t * mask = NULL; - float * buffer[3] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL }; - int32_t i = 0, j = 0; - int32_t block_size = 1, indices_num = 1; - int32_t coord_dim = 1; - int32_t mask_len = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; // ref - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; // idx int - tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; // update - tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; // output - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); - CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - para_buffer[0] = (uint32_t*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE ); - CHECK_PTR_FAIL_GOTO( para_buffer[0], "Create input1 buffer fail.", final ); - - buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create input2 buffer fail.", final ); - - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &(block_size)); - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &(coord_dim)); - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &(indices_num)); - - buffer[2] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); - memcpy( buffer[2], buffer[0], out_elements * sizeof(float) ); - - mask_len = (int32_t)out_elements / block_size; - mask = (uint32_t *)malloc( mask_len * sizeof(uint32_t) ); - memset(mask, 0, mask_len * sizeof(uint32_t)); - - if (coord_dim <= 5) - { - vsi_ssize_t stride[5] = {0, 0, 0, 0, 0}; - vsi_ssize_t new_shape[5] = {1, 1, 1, 1, 1}; - vsi_ssize_t merge_dim = (vsi_ssize_t)attr[3]->shape->size - coord_dim + 1; - - for(i = 0; i < merge_dim; ++i) - { - new_shape[0] *= attr[3]->shape->data[i]; - } - stride[0] = new_shape[0] / block_size; - - for(i = 1; i < coord_dim; ++i) - { - new_shape[i] = attr[3]->shape->data[merge_dim + i - 1]; - - stride[i] = stride[i - 1] * new_shape[i]; - } - - for(i = 0; i < indices_num; i++) - { - uint32_t in_index = i * block_size; - vsi_size_t out_index = 0; - uint32_t coord[5] = {0}; - int32_t byd_flg = 0; - vsi_ssize_t mask_idx = 0; - - for(j = 0; j < coord_dim; j++) - { - coord[j] = para_buffer[0][i * coord_dim + coord_dim - j - 1]; - if (coord[j] >= 
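/* Editorial note, added; not part of the original file. The mask below keeps
 * one flag per output block: the first update that touches a block zeroes it
 * and sets the flag, so several updates aimed at the same coordinate
 * accumulate rather than overwrite, while untouched blocks keep the values
 * copied from the reference input. */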
(uint32_t)new_shape[j]) - { - byd_flg = 1; - break; - } - } - if (byd_flg) - { - continue; - } - - mask_idx = coord[4] * stride[3] + coord[3] * stride[2] + - coord[2] * stride[1] + coord[1] * stride[0] + coord[0]; - out_index = mask_idx * block_size; - if (mask[mask_idx] == 0) - { - memset(buffer[2] + out_index, 0, block_size * sizeof(float)); - mask[mask_idx] = 1; - } - for(j = 0; j < block_size; j++) - { - buffer[2][out_index + j] += buffer[1][in_index + j]; - } - } - } - else - { - status = VSI_FAILURE; - CHECK_STATUS_FAIL_GOTO( status, final ); - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], - buffer[2], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - if ( para_buffer[0] ) - { - free( para_buffer[0] ); - } - - if (mask) - { - free(mask); - } - for( i = 0; i < 3; i ++ ) - { - if ( buffer[i] ) - { - free( buffer[i] ); - } - } - for( i = 0; i < 4; i ++ ) - { - if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - } - return status; -} /* _scatter_nd_update_exec() */ -/* - * Kernel params - */ -static vx_param_description_t _scatter_nd_update_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - // Add kernel parameters here -}; - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _scatter_nd_update_exec; - kernel->info.parameters = _scatter_nd_update_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _scatter_nd_update_kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); - int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); - int32_t idx_num = vsi_nn_kernel_param_get_int32( params, "idx_num" ); - - status = _query_kernel( inputs, outputs, kernel ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - uint32_t index = 4; - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &idx_num ); - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - CHECK_STATUS( status ); - vsi_nn_kernel_scalar_release( &backend_params[4] ); - vsi_nn_kernel_scalar_release( &backend_params[5] ); - vsi_nn_kernel_scalar_release( &backend_params[6] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( scatter_nd_update, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/select_cpu.c b/src/tim/vx/internal/src/kernel/cpu/select_cpu.c deleted file mode 100644 index b172325..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/select_cpu.c +++ /dev/null @@ -1,236 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
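- * - * The select kernel computes out[i] = cond[i] ? a[i] : b[i] element-wise, broadcasting each input against the output shape: _expand_offset() below maps a flat output index to a flat input index, where a dimension that matches the output contributes stride * coordinate and a broadcast dimension (input size 1) contributes nothing. - * E.g. for out_shape = {4, 3} and a condition of shape {1, 3} (strides {1, 1}), output index 7 unpacks to (x = 3, y = 1); dim 0 is broadcast and adds 0, dim 1 adds 1 * 1 = 1, so every element of output row y reads condition element y.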
- */ -#define _INPUT_NUM (3) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.select") - -/* - * Kernel params - */ -static vx_param_description_t _select_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _SELECT_PARAM_NUM _cnt_of_array( _select_kernel_param_def ) - -static vsi_ssize_t _expand_offset - ( - vsi_ssize_t index, - vsi_size_t * shape, vsi_size_t rank, - vsi_size_t * strides, vsi_size_t * out_shape - ) -{ - vsi_size_t i; - vsi_ssize_t offset = 0; - - for( i = 0; i < rank && index; i ++ ) - { - if( shape[i] == out_shape[i] ) - { - offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); - } - index /= out_shape[i]; - } - return offset; -} - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i; - - /* prepare data */ - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - for (i = 0; i < out_elements[0]; i++) - { - vsi_ssize_t in0_offset = 0; - vsi_ssize_t in1_offset = 0; - vsi_ssize_t in2_offset = 0; - - in0_offset = _expand_offset( i, in_attr[0]->shape->data, (vsi_size_t)in_attr[0]->shape->size, - in_stride_size[0], out_attr[0]->shape->data ); - in1_offset = _expand_offset( i, in_attr[1]->shape->data, (vsi_size_t)in_attr[1]->shape->size, - in_stride_size[1], out_attr[0]->shape->data ); - in2_offset = _expand_offset( i, in_attr[2]->shape->data, (vsi_size_t)in_attr[2]->shape->size, - in_stride_size[2], out_attr[0]->shape->data ); - - f32_out_buffer[0][i] = (f32_in_buffer[0][in0_offset]) ? 
- f32_in_buffer[1][in1_offset] : f32_in_buffer[2][in2_offset]; - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _select_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _select_kernel_param_def ); - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_SELECT_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( kernel, inputs, outputs); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _SELECT_PARAM_NUM, - inputs, input_num, outputs, output_num ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _SELECT_PARAM_NUM ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( select, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/sequence_mask_cpu.c b/src/tim/vx/internal/src/kernel/cpu/sequence_mask_cpu.c deleted file mode 100644 index 6844f4a..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/sequence_mask_cpu.c +++ /dev/null @@ -1,235 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "kernel/vsi_nn_kernel.h" -#include "kernel/vsi_nn_kernel_eltwise.h" - -__BEGIN_DECLS - -#define _CPU_ARG_NUM (1) -#define _CPU_INPUT_NUM (1) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("sequence_mask_sw") - -DEF_KERNEL_EXECUTOR(_sequence_mask_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer_in = NULL; - float * buffer = NULL; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i = 0; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); - - buffer_in = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer_in, "Create input0 buffer fail.", final ); - - buffer = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer, "Create output buffer fail.", final ); - memset( buffer, 0, out_elements * sizeof(float) ); - - { - vsi_size_t j = 0; - vsi_size_t height = attr[1]->shape->data[1]; - vsi_size_t width = attr[1]->shape->data[0]; - - for(j = 0; j < height; j++) - { - vsi_size_t idx_in = (vsi_size_t)buffer_in[j]; - vsi_size_t out_offset = j * width; - idx_in = idx_in > width ? 
width : idx_in; - for(i = 0; i < idx_in; i++) - { - buffer[out_offset + i] = 1; - } - } - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - buffer, out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - if (buffer_in) - { - free( buffer_in ); - } - if (buffer) - { - free( buffer ); - } - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - vsi_nn_kernel_tensor_attr_release( &attr[i] ); - } - return status; -} /* _sequence_mask_exec() */ - -static vx_param_description_t kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _sequence_mask_exec; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static int32_t _optimize_mask_shape - ( - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - int32_t max_len, - vsi_size_t* opt_shape_in, - vsi_size_t* opt_shape_out - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_size_t out_size = 1; - uint32_t i = 0; - opt_shape_in[0] = 1; - opt_shape_in[1] = 1; - for(i = 0; i < inputs[0]->attr.dim_num; i++) - { - opt_shape_in[0] *= inputs[0]->attr.size[i]; - } - - for(i = 0; i < outputs[0]->attr.dim_num; i++) - { - out_size *= outputs[0]->attr.size[i]; - } - - opt_shape_out[0] = max_len; - opt_shape_out[1] = out_size / max_len; - - if (out_size % max_len != 0) - { - return VSI_FAILURE; - } - - return status; -} - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; - vsi_size_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }}; - int32_t max_len = vsi_nn_kernel_param_get_int32( params, "max_len" ); - - status = _optimize_mask_shape(inputs, outputs, max_len, new_shape[0], new_shape[1]); - if ( VSI_SUCCESS != status ) - { - goto final; - } - rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], 2); - rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], 2); - - status = _query_kernel( inputs, outputs, kernel ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - uint32_t index = 0; - backend_params[index++] = rs_input; - backend_params[index++] = rs_output; - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &max_len ); - /* Pass parameters to node. 
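The kernel runs on 2-D views: _optimize_mask_shape() above flattens the input to a single row and folds the output to [max_len, N]. The max_len scalar is released after this call, and the reshaped tensor views in the final block. 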
*/ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - CHECK_STATUS(status); - vsi_nn_kernel_scalar_release( &backend_params[2] ); - } - else - { - status = VSI_FAILURE; - } - } -final: - if (rs_input) - { - vsi_nn_kernel_tensor_release( &rs_input ); - } - if (rs_output) - { - vsi_nn_kernel_tensor_release( &rs_output ); - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( sequence_mask, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/signal_frame_cpu.c b/src/tim/vx/internal/src/kernel/cpu/signal_frame_cpu.c deleted file mode 100644 index b4379d1..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/signal_frame_cpu.c +++ /dev/null @@ -1,289 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
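- * - * signal_frame splits the axis dimension of length N into frames: frame j holds samples [j * frame_step, j * frame_step + frame_length). Without pad_end, num_frames = (N - frame_length) / frame_step + 1 and every frame stays in bounds; with pad_end, num_frames = ceil(N / frame_step) and samples past N are filled with pad_val. - * E.g. N = 10, frame_length = 4, frame_step = 3 yields 3 full frames, or 4 frames with the last one partly padded when pad_end is set.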
- */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.signal_frame") - -/* - * Kernel params - */ -static vx_param_description_t _signal_frame_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - // Add kernel parameters here -}; -#define _SIGNAL_FRAME_PARAM_NUM _cnt_of_array( _signal_frame_kernel_param_def ) -#define FRAME_LENGHT (2) -#define FRAME_STEP (3) -#define AXIS (4) -#define PAD_END (5) -#define PAD_VAL (6) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - int32_t i = 0; - int32_t j = 0; - int32_t k = 0; - int32_t frame_length = 0; - int32_t frame_step = 0; - int32_t axis = 0; - int32_t pad_end = 0; - vsi_ssize_t length_samples = 0; - vsi_ssize_t num_frames = 0; - vsi_ssize_t inner_dim = 1; - vsi_ssize_t outer_dim = 1; - vsi_ssize_t inner_size = 1; - float pad_val = 0; - - /* prepare data */ - for (i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final ); - } - for (i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[FRAME_LENGHT], &frame_length); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[FRAME_STEP], &frame_step); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[AXIS], &axis); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[PAD_END], &pad_end); - status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[PAD_VAL], &pad_val); - CHECK_STATUS_FAIL_GOTO( status, final ); - - for (i = 0; i < axis; i++) - { - inner_dim *= in_attr[0]->shape->data[i]; - } - length_samples = in_attr[0]->shape->data[axis]; - for (i = axis + 1; i < (int32_t)in_attr[0]->shape->size; i++) - { - outer_dim *= in_attr[0]->shape->data[i]; - } - - for (i = 0; i 
< axis + 1; i++) - { - inner_size *= out_attr[0]->shape->data[i]; - } - - num_frames = (length_samples + frame_step - 1) / frame_step; - num_frames = pad_end ? num_frames : (length_samples - frame_length) / frame_step + 1; - - for (i = 0; i < outer_dim; i++) - { - float * src_ptr = f32_in_buffer[0] + i * length_samples * inner_dim; - float * dst_ptr = f32_out_buffer[0] + i * num_frames * frame_length * inner_dim; - - for (j = 0; j < num_frames; j++) - { - for (k = 0; k < frame_length; k++) - { - int32_t m = j * frame_step + k; - - if (pad_end) - { - if (m >= length_samples) - { - int32_t l = 0; - for (l = 0; l < inner_dim; l++) - { - (dst_ptr + (j * frame_length + k) * inner_dim)[l] = pad_val; - } - } - else - { - memcpy(dst_ptr + (j * frame_length + k) * inner_dim, src_ptr + m * inner_dim, - inner_dim * sizeof(float)); - } - } - else - { - memcpy(dst_ptr + (j * frame_length + k) * inner_dim, src_ptr + m * inner_dim, - inner_dim * sizeof(float)); - } - } - } - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for (i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _signal_frame_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _signal_frame_kernel_param_def ); - status = VSI_SUCCESS; - - return status; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_SIGNAL_FRAME_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - int32_t frame_length = vsi_nn_kernel_param_get_int32( params, "frame_length" ); - int32_t frame_step = vsi_nn_kernel_param_get_int32( params, "frame_step" ); - int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); - int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" ); - float pad_val = vsi_nn_kernel_param_get_float32( params, "pad_val" ); - - status = _query_kernel( kernel, inputs, outputs ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _SIGNAL_FRAME_PARAM_NUM, - inputs, input_num, outputs, output_num ); - /* Pass parameters to node. 
*/ - node_params[FRAME_LENGHT] = vsi_nn_kernel_scalar_create( graph, I32, &frame_length ); - node_params[FRAME_STEP] = vsi_nn_kernel_scalar_create( graph, I32, &frame_step ); - node_params[AXIS] = vsi_nn_kernel_scalar_create( graph, I32, &axis ); - node_params[PAD_END] = vsi_nn_kernel_scalar_create( graph, I32, &pad_end ); - node_params[PAD_VAL] = vsi_nn_kernel_scalar_create( graph, F32, &pad_val ); - status = vsi_nn_kernel_node_pass_param( node, node_params, _SIGNAL_FRAME_PARAM_NUM ); - vsi_nn_kernel_scalar_release( &node_params[FRAME_LENGHT] ); - vsi_nn_kernel_scalar_release( &node_params[FRAME_STEP] ); - vsi_nn_kernel_scalar_release( &node_params[AXIS] ); - vsi_nn_kernel_scalar_release( &node_params[PAD_END] ); - vsi_nn_kernel_scalar_release( &node_params[PAD_VAL] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( signal_frame, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/slice_cpu.c b/src/tim/vx/internal/src/kernel/cpu/slice_cpu.c deleted file mode 100644 index 11d475c..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/slice_cpu.c +++ /dev/null @@ -1,245 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - - /* - * Define kernel meta. 
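- * - * The slice kernel copies the hyper-rectangle [start, start + out_size) out of the input: the second input tensor carries the per-dimension start offsets and each extent is taken from the output shape. - * E.g. an input of shape {6, 4} with starts {2, 1} and an output of shape {3, 2} copies columns 2..4 of rows 1..2.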
- */ -#define _INPUT_NUM (2) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.slice") - - - /* - * Kernel params - */ - static vx_param_description_t _slice_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - // Add kernel parameters here -}; -#define _SLICE_PARAM_NUM _cnt_of_array( _slice_kernel_param_def ) - - -/* -* Kernel function -*/ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - int32_t rank = 0; - int32_t i = 0; - vsi_ssize_t in_w = 0; - vsi_ssize_t in_h = 0; - vsi_ssize_t in_c = 0; - vsi_ssize_t in_b = 0; - vsi_ssize_t start[4] = {0}; - vsi_ssize_t stop[4] = {0}; - vsi_ssize_t in_size[4] = {1, 1, 1, 1}; - vsi_ssize_t out_size[4] = {1, 1, 1, 1}; - float *input_ptr = NULL; - float *output_ptr = NULL; - int32_t dstIdx = 0; - - /* prepare data */ - for (i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - - for (i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - rank = (int32_t)out_attr[0]->shape->size; - - for (i = 0; i < rank; i++) - { - in_size[i] = in_attr[0]->shape->data[i]; - out_size[i] = out_attr[0]->shape->data[i]; - } - - start[0] = (vsi_ssize_t)f32_in_buffer[1][0]; - stop[0] = start[0] + out_attr[0]->shape->data[0]; - start[1] = rank < 2 ? 0 : (vsi_ssize_t)f32_in_buffer[1][1]; - stop[1] = rank < 2 ? 1 : start[1] + out_size[1]; - start[2] = rank < 3 ? 0 : (vsi_ssize_t)f32_in_buffer[1][2]; - stop[2] = rank < 3 ? 1 : start[2] + out_size[2]; - start[3] = rank < 4 ? 0 : (vsi_ssize_t)f32_in_buffer[1][3]; - stop[3] = rank < 4 ? 
1 : start[3] + out_size[3]; - input_ptr = f32_in_buffer[0]; - output_ptr = f32_out_buffer[0]; - - for (in_b = start[3]; in_b < stop[3]; ++in_b) - { - for (in_c = start[2]; in_c < stop[2]; ++in_c) - { - for (in_h = start[1]; in_h < stop[1]; ++in_h) - { - for (in_w = start[0]; in_w < stop[0]; ++in_w) - { - vsi_ssize_t srcIdx = ((in_b * in_size[2] + in_c) * in_size[1] + in_h) * in_size[0] + in_w; - output_ptr[dstIdx ++] = input_ptr[srcIdx]; - } - } - } - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for (i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* -* Query kernel -*/ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _slice_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _slice_kernel_param_def ); - status = VSI_SUCCESS; - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_SLICE_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _SLICE_PARAM_NUM, - inputs, input_num, outputs, output_num ); - /* Pass parameters to node. 
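No scalars here: the start offsets already travel in the second input tensor, so only the packed tensor parameters are passed. 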
- */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _SLICE_PARAM_NUM ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( slice, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c deleted file mode 100644 index 0a518b0..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c +++ /dev/null @@ -1,218 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2019 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "vsi_nn_error.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
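- * - * space2depth_internal moves block_size_x * block_size_y spatial tiles into channels: input pixel (x, y, d) lands at output (x / bx, y / by), channel (x % bx) * C + (y % by) * bx * C + d, where C is the input depth. - * E.g. with bx = by = 2 and C = 1, a 4x4x1 input becomes a 2x2x4 output, each 2x2 tile filling the four channels of one output pixel.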
- */ -#define _CPU_ARG_NUM (2) -#define _CPU_INPUT_NUM (1) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.space2depth_internal") - -DEF_KERNEL_EXECUTOR(_space2depth_internal_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[2] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i = 0; - int32_t block_size_x = 1; - int32_t block_size_y = 1; - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &block_size_x); - CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &block_size_y); - CHECK_STATUS_FAIL_GOTO(status, final ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); - memset( buffer[1], 0, out_elements * sizeof(float) ); - - { - vsi_size_t output_depth = attr[1]->shape->data[2]; - vsi_size_t output_height = attr[1]->shape->data[1]; - vsi_size_t output_width = attr[1]->shape->data[0]; - vsi_size_t input_batch = attr[0]->shape->size > 3 ? 
attr[0]->shape->data[3] : 1; - vsi_size_t input_depth = attr[0]->shape->data[2]; - vsi_size_t input_height = attr[0]->shape->data[1]; - vsi_size_t input_width = attr[0]->shape->data[0]; - vsi_size_t batch = 0, in_h = 0, in_w = 0; - - for (batch = 0; batch < input_batch; ++ batch) - { - vsi_size_t output_batch_index = batch * output_height * output_width * output_depth; - vsi_size_t input_batch_index = batch * input_height * input_width * input_depth; - vsi_size_t in_d = 0; - - for (in_d = 0; in_d < input_depth; in_d ++) - { - for (in_h = 0; in_h < input_height; ++ in_h) - { - for (in_w = 0; in_w < input_width; in_w ++) - { - vsi_size_t out_w = in_w / block_size_x; - vsi_size_t out_h = in_h / block_size_y; - vsi_size_t out_d = (in_w % block_size_x) * input_depth - + (in_h % block_size_y) * block_size_x * input_depth + in_d; - - vsi_size_t in_index = in_w + in_h * input_width - + in_d * input_height * input_width + input_batch_index; - vsi_size_t out_index = out_w + out_h * output_width - + out_d * output_width * output_height + output_batch_index; - - buffer[1][out_index] = buffer[0][in_index]; - } - } - } - } - } - - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - buffer[1], out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } - if ( buffer[i] ) - { - free( buffer[i] ); - } - } - return status; -} /* _space2depth_internal_exec() */ -/* - * Kernel params - */ -static vx_param_description_t _space2depth_internal_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - // Add kernel parameters here -}; -#define _DEPTH2SPACE_CRD_PARAM_NUM _cnt_of_array( _space2depth_internal_kernel_param_def ) - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _space2depth_internal_exec; - kernel->info.parameters = _space2depth_internal_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _space2depth_internal_kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VX_FAILURE; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( inputs, outputs, kernel ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - uint32_t index = 2; - int32_t block_size_x = vsi_nn_kernel_param_get_int32( params, "block_size_x" ); - int32_t block_size_y = vsi_nn_kernel_param_get_int32( params, "block_size_y" ); - - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - - backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size_x ); - backend_params[index] = vsi_nn_kernel_scalar_create( graph, I32, &block_size_y ); - /* Pass parameters to node. 
- */ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - CHECK_STATUS( status ); - vsi_nn_kernel_scalar_release( &backend_params[2] ); - vsi_nn_kernel_scalar_release( &backend_params[3] ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( space2depth_internal, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/spatial_transformer_cpu.c b/src/tim/vx/internal/src/kernel/cpu/spatial_transformer_cpu.c deleted file mode 100644 index 243294c..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/spatial_transformer_cpu.c +++ /dev/null @@ -1,388 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
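- * - * The kernel assembles a 2x3 affine matrix per batch: entries whose has_theta_* flag is set come from the THETA_* scalars, the remaining entries are read per batch from the second input tensor. Each output pixel is mapped through the matrix to input coordinates (xf, yf) and bilinearly interpolated as out = tl*al*at + tr*ar*at + bl*al*ab + br*ar*ab, where ar/ab are the fractional parts of xf/yf, al = 1 - ar, at = 1 - ab, and out-of-bounds taps read as 0.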
- */ -#define _INPUT_NUM (2) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.spatial_transformer") - - -/* - * Kernel params - */ -static vx_param_description_t _spatial_transformer_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _SPATIAL_TRANSFORMER_PARAM_NUM _cnt_of_array( _spatial_transformer_kernel_param_def ) -#define HAS_THETA_1_1 (3) -#define HAS_THETA_1_2 (4) -#define HAS_THETA_1_3 (5) -#define HAS_THETA_2_1 (6) -#define HAS_THETA_2_2 (7) -#define HAS_THETA_2_3 (8) -#define THETA_1_1 (9) -#define THETA_1_2 (10) -#define THETA_1_3 (11) -#define THETA_2_1 (12) -#define THETA_2_2 (13) -#define THETA_2_3 (14) -#define ALIGN_CORNERS (15) - -static void _transform_affine(int32_t dst_x, int32_t dst_y, const float m[], float *src_x, float *src_y) -{ - *src_x = dst_x * m[0] + dst_y * m[2] + m[4]; - *src_y = dst_x * m[1] + dst_y * m[3] + m[5]; -} - -static float _read_pixel(float *base, vsi_nn_kernel_tensor_attr_t *attr, - float x, float y, int32_t z, int32_t b) -{ - vsi_bool out_of_bounds = (x < 0 || y < 0 || x >= attr->shape->data[0] || y >= attr->shape->data[1]); - vsi_ssize_t bx, by; - vsi_ssize_t offset = (b * attr->shape->data[2] + z) * attr->shape->data[0] * attr->shape->data[1]; - float pixel = 0; - - if (out_of_bounds) - { - return 0; - } - // bounded x/y - bx = (vsi_ssize_t)x; - by = (vsi_ssize_t)y; - - pixel = base[attr->shape->data[0] * by + bx + offset]; - - return pixel; -} - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - int32_t i = 0; - int32_t b = 0; - int32_t c = 0; - int32_t j = 0; - int32_t x = 0; - int32_t y = 0; - int32_t has_theta[6] = {0}; - vsi_ssize_t batch = 1; - vsi_ssize_t depth = 1; - vsi_ssize_t height = 1; - vsi_ssize_t width = 1; - vsi_ssize_t input_height = 1; - vsi_ssize_t input_width = 1; - int32_t rank = 0; - int32_t index = 0; - int32_t align_corners = 0; - float theta[6] = {0}; - - /* prepare data */ - for (i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - 
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final ); - } - for (i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_1_1], &has_theta[0]); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_1_2], &has_theta[1]); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_1_3], &has_theta[2]); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_2_1], &has_theta[3]); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_2_2], &has_theta[4]); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_2_3], &has_theta[5]); - status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_1], &theta[0]); - status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_2], &theta[1]); - status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_3], &theta[2]); - status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_1], &theta[3]); - status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_2], &theta[4]); - status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_3], &theta[5]); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[ALIGN_CORNERS], &align_corners); - CHECK_STATUS_FAIL_GOTO( status, final ); - - rank = (int32_t)out_attr[0]->shape->size; - width = out_attr[0]->shape->data[0]; - height = out_attr[0]->shape->data[1]; - depth = rank > 2 ? out_attr[0]->shape->data[2] : 1; - batch = rank > 3 ? out_attr[0]->shape->data[3] : 1; - - input_width = in_attr[0]->shape->data[0]; - input_height = in_attr[0]->shape->data[1]; - - for (b = 0; b < batch; b++) - { - float _w = (float)input_width; - float _h = (float)input_height; - float w = (float)width; - float h = (float)height; - float matrix_m[6] = {0}; - j = 0; - for (i = 0; i < 6; i++) - { - if (has_theta[i] == 0) - { - theta[i] = f32_in_buffer[1][b * in_attr[1]->shape->data[0] + j]; - j ++; - } - } - - if (align_corners && w > 1) - { - w = w - 1; - } - - if (align_corners && h > 1) - { - h = h - 1; - } - - matrix_m[0] = theta[4] * _w / w; - matrix_m[2] = theta[3] * _w / h; - matrix_m[4] = (theta[5] - theta[4] - theta[3] + 1) * _w * 0.5f; - matrix_m[1] = theta[1] * _h / w; - matrix_m[3] = theta[0] * _h / h; - matrix_m[5] = (theta[2] - theta[1] - theta[0] + 1) * _h * 0.5f; - for (c = 0; c < depth; c++) - { - for (y = 0; y < height; y++) - { - for (x = 0; x < width; x++) - { - float xf = 0; - float yf = 0; - float tl = 0, tr = 0, bl = 0, br = 0; - float ar = 0, ab = 0, al = 0, at = 0; - - _transform_affine(x, y, matrix_m, &xf, &yf); - - xf = xf < 0 ? xf - 1 : xf; - yf = yf < 0 ? 
yf - 1 : yf; - ar = xf - floorf(xf); - ab = yf - floorf(yf); - al = 1.0f - ar; - at = 1.0f - ab; - - tl = _read_pixel(f32_in_buffer[0], in_attr[0], floorf(xf), floorf(yf), c, b); - tr = _read_pixel(f32_in_buffer[0], in_attr[0], floorf(xf) + 1, floorf(yf), c, b); - bl = _read_pixel(f32_in_buffer[0], in_attr[0], floorf(xf), floorf(yf) + 1, c, b); - br = _read_pixel(f32_in_buffer[0], in_attr[0], floorf(xf) + 1, floorf(yf) + 1, c, b); - - f32_out_buffer[0][index ++] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab; - } - } - } - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _spatial_transformer_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _spatial_transformer_kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_SPATIAL_TRANSFORMER_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - int32_t has_theta_1_1 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_1" ); - int32_t has_theta_1_2 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_2" ); - int32_t has_theta_1_3 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_3" ); - int32_t has_theta_2_1 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_1" ); - int32_t has_theta_2_2 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_2" ); - int32_t has_theta_2_3 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_3" ); - float theta_1_1 = vsi_nn_kernel_param_get_float32( params, "theta_1_1" ); - float theta_1_2 = vsi_nn_kernel_param_get_float32( params, "theta_1_2" ); - float theta_1_3 = vsi_nn_kernel_param_get_float32( params, "theta_1_3" ); - float theta_2_1 = vsi_nn_kernel_param_get_float32( params, "theta_2_1" ); - float theta_2_2 = vsi_nn_kernel_param_get_float32( params, "theta_2_2" ); - float theta_2_3 = vsi_nn_kernel_param_get_float32( params, "theta_2_3" ); - int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); - - status = _query_kernel( kernel, inputs, outputs ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _SPATIAL_TRANSFORMER_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[HAS_THETA_1_1] = vsi_nn_kernel_scalar_create( 
graph, I32, &has_theta_1_1 ); - node_params[HAS_THETA_1_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_2 ); - node_params[HAS_THETA_1_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_3 ); - node_params[HAS_THETA_2_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_1 ); - node_params[HAS_THETA_2_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_2 ); - node_params[HAS_THETA_2_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_3 ); - node_params[THETA_1_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_1 ); - node_params[THETA_1_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_2 ); - node_params[THETA_1_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_3 ); - node_params[THETA_2_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_1 ); - node_params[THETA_2_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_2 ); - node_params[THETA_2_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_3 ); - node_params[ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _SPATIAL_TRANSFORMER_PARAM_NUM ); - vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_1] ); - vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_2] ); - vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_3] ); - vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_1] ); - vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_2] ); - vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_3] ); - vsi_nn_kernel_scalar_release( &node_params[THETA_1_1] ); - vsi_nn_kernel_scalar_release( &node_params[THETA_1_2] ); - vsi_nn_kernel_scalar_release( &node_params[THETA_1_3] ); - vsi_nn_kernel_scalar_release( &node_params[THETA_2_1] ); - vsi_nn_kernel_scalar_release( &node_params[THETA_2_2] ); - vsi_nn_kernel_scalar_release( &node_params[THETA_2_3] ); - vsi_nn_kernel_scalar_release( &node_params[ALIGN_CORNERS] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( spatial_transformer, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/swish_cpu.c b/src/tim/vx/internal/src/kernel/cpu/swish_cpu.c deleted file mode 100644 index 65d4a4c..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/swish_cpu.c +++ /dev/null @@ -1,294 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - - -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_test.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_math.h" -#include "utils/vsi_nn_dtype_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. - */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.swish") - - -/* - * Kernel params - */ -static vx_param_description_t _swish_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _SWISH_PARAM_NUM _cnt_of_array( _swish_kernel_param_def ) - - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_swish_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - float beta = 1.0f; - uint32_t i; - - /* prepare data */ - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[_CPU_IO_NUM], &(beta)); - - /* swish(x) = x * sigmoid(beta * x) = x / (1 + exp(-beta * x)) */ - - for (i = 0; i < out_elements[0]; i++) - { - float val = f32_in_buffer[0][i]; - f32_out_buffer[0][i] = val * 1.0f / (1.0f + (float)exp(beta * val * (-1.0f))); - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - 
vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - - return status; -} /* _swish_compute() */ - -DEF_KERNEL_EXECUTOR(_hswish_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - uint32_t i; - - /* prepare data */ - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - f32_out_buffer[i] = (float *)malloc( out_elements[i] * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_elements[i] * sizeof(float) ); - } - - /* TODO: Add CPU kernel implement */ - /* example code : copy data form input tensor to output tensor*/ - - - for (i = 0; i < out_elements[0]; i++) - { - float val = f32_in_buffer[0][i]; - f32_out_buffer[0][i] = val * vsi_nn_clamp((val + 3.0f), 0.0f, 6.0f) / 6.0f; - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - - return status; -} /* _hswish_compute() */ - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs, - vsi_nn_swish_type swish_type - /* Add extra params */ - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - if (VSI_NN_SWISH == swish_type) - { - kernel->info.function = _swish_compute; - } - else - { - kernel->info.function = _hswish_compute; - } - kernel->info.parameters = _swish_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _swish_kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_SWISH_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - int32_t swish_type = 
vsi_nn_kernel_param_get_int32( params, "type" ); - float beta = vsi_nn_kernel_param_get_float32( params, "beta" ); - - status = _query_kernel( kernel, inputs, outputs, (vsi_nn_swish_type)swish_type); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _SWISH_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[_CPU_IO_NUM] = vsi_nn_kernel_scalar_create( graph, F32, &beta ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _SWISH_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[_CPU_IO_NUM] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( swish, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/sync_host_cpu.c b/src/tim/vx/internal/src/kernel/cpu/sync_host_cpu.c deleted file mode 100644 index ee6c564..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/sync_host_cpu.c +++ /dev/null @@ -1,184 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.sync_host") - - -/* - * Kernel params - */ -static vx_param_description_t _sync_host_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _SYNC_HOST_PARAM_NUM _cnt_of_array( _sync_host_kernel_param_def ) - - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - void *in_buffer[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i = 0; - - /* prepare data */ - for (i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - in_buffer[i] = vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], FALSE ); - CHECK_PTR_FAIL_GOTO( in_buffer[i], "Create input buffer fail.", final ); - } - - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - out_bytes[i] = vsi_nn_kernel_tensor_attr_get_bytes( out_attr[i] ); - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write( output[i], out_attr[i], - in_buffer[i], out_bytes[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (in_buffer[i]) - { - free(in_buffer[i]); - in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _sync_host_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _sync_host_kernel_param_def ); - status = VSI_SUCCESS; - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_SYNC_HOST_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _SYNC_HOST_PARAM_NUM, - inputs, input_num, outputs, output_num ); - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _SYNC_HOST_PARAM_NUM ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( sync_host, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/tensorstackconcat_cpu.c b/src/tim/vx/internal/src/kernel/cpu/tensorstackconcat_cpu.c deleted file mode 100644 index 60b33ac..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/tensorstackconcat_cpu.c +++ /dev/null @@ -1,218 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (2) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.tensorstackconcat") - - -/* - * Kernel params - */ -static vx_param_description_t _tensorstackconcat_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _TENSORSTACKCONCAT_PARAM_NUM _cnt_of_array( _tensorstackconcat_kernel_param_def ) - - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - size_t out_elements[_OUTPUT_NUM] = {0}; - uint32_t i = 0; - vsi_size_t depth = 0; - vsi_size_t height = 1; - vsi_size_t width = 0; - vsi_size_t index = 0; - uint32_t c = 0, y = 0, x = 0; - - /* prepare data */ - for (i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - - for (i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - f32_out_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( output[i], out_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - } - - depth = in_attr[0]->shape->data[2]; - height = in_attr[0]->shape->data[1]; - width = in_attr[0]->shape->data[0]; - index = (vsi_size_t)f32_in_buffer[1][0]; - - for (c = 0; c < depth; c++) - { - for (y = 0; y < height; y++) - { - for (x = 0; x < width; x++) - { - vsi_ssize_t i_idx = c * width * height + y * width + x; - vsi_ssize_t o_idx = (c * out_attr[0]->shape->data[1] + index ) * out_attr[0]->shape->data[0] + x; - float value = f32_in_buffer[0][i_idx]; - - f32_out_buffer[0][o_idx] = value; - } - } - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - 
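
Note that the output index in the write loop above does not depend on y — the destination row comes from the scalar read out of the second input — so the kernel is only well-defined for inputs of height 1: each call scatters one row per channel into row `index` of the output. A condensed sketch of that scatter with plain C types:

static void stack_concat_row
    (
    const float * in,   /* [width * depth], one row per channel */
    float * out,        /* [width * out_height * depth] */
    size_t width,
    size_t out_height,
    size_t depth,
    size_t index        /* destination row, from the second input */
    )
{
    size_t c, x;
    for (c = 0; c < depth; c++)
    {
        for (x = 0; x < width; x++)
        {
            out[(c * out_height + index) * width + x] = in[c * width + x];
        }
    }
}
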
kernel->info.parameters = _tensorstackconcat_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _tensorstackconcat_kernel_param_def ); - - status = VSI_SUCCESS; - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_TENSORSTACKCONCAT_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( kernel, inputs, outputs ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _TENSORSTACKCONCAT_PARAM_NUM, - inputs, input_num, outputs, output_num ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, _TENSORSTACKCONCAT_PARAM_NUM ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( tensorstackconcat, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c b/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c deleted file mode 100644 index 3126c31..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c +++ /dev/null @@ -1,208 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_error.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -#define _CPU_ARG_NUM (0) -#define _CPU_INPUT_NUM (1) -#define _CPU_OUTPUT_NUM (1) -#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) -#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("tile_sw") - -void copyMultipleTimes(const float* in_data, vsi_size_t in_size, int32_t multiplier, float* out_data) -{ - int i = 0; - - for ( i = 0; i < multiplier; ++i) - { - memcpy(out_data, in_data, in_size * sizeof(float)); - out_data += in_size; - } -} - -void tileOneDimension(const vsi_size_array_t* input_shape, const float* in_data, - const uint32_t* multipliers, float* out_data, int dimension, - vsi_size_t *stride_size, vsi_size_t *tiled_stride_size) -{ - vsi_size_t i = 0; - const vsi_size_t dimension_size = input_shape->data[dimension]; - vsi_ssize_t total_stride_size = 0, total_tiled_stride_size = 0; - const float* copy_from_data = in_data; - float* copy_to_data = out_data; - - if (dimension == 0) - { - copyMultipleTimes(in_data, dimension_size, multipliers[dimension], out_data); - *stride_size = dimension_size; - *tiled_stride_size = dimension_size * multipliers[dimension]; - return ; - } - - for (i = 0; i < dimension_size; ++i) - { - tileOneDimension( - input_shape, copy_from_data, multipliers, copy_to_data, dimension - 1, stride_size, tiled_stride_size); - copy_from_data += *stride_size; - copy_to_data += *tiled_stride_size; - total_stride_size += *stride_size; - total_tiled_stride_size += *tiled_stride_size; - } - - copyMultipleTimes(out_data, total_tiled_stride_size, multipliers[dimension] - 1, - out_data + total_tiled_stride_size); - - *stride_size = total_stride_size; - *tiled_stride_size = total_tiled_stride_size * multipliers[dimension]; - return ; -} - - -DEF_KERNEL_EXECUTOR(_tile_exec) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VX_SUCCESS; - vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; - float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - vsi_size_t i = 0; - uint32_t multiples[VSI_NN_MAX_DIM_NUM] = {0}; - vsi_size_t stride_size = 0, tiled_stride_size = 0; - - - tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; - tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; - - attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); - attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); - - out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); - - buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); - - buffer[1] = (float *)malloc( out_elements * sizeof(float) ); - CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); - memset( buffer[1], 0, out_elements * sizeof(float) ); - - for (i = 0; i < attr[0]->shape->size; i++) - { - multiples[i] = (uint32_t)(attr[1]->shape->data[i] / attr[0]->shape->data[i]); - } - - tileOneDimension(attr[0]->shape, buffer[0], multiples, buffer[1], - (int32_t)attr[0]->shape->size - 1, &stride_size, &tiled_stride_size); - - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - buffer[1], 
out_elements ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - for( i = 0; i < _CPU_IO_NUM; i ++ ) - { - if( buffer[i] ) - { - free( buffer[i] ); - } - vsi_nn_kernel_tensor_attr_release( &attr[i] ); - } - - return status; -} /* _tile_exec() */ - -static vx_param_description_t kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED} -}; - -static vsi_status _query_kernel - ( - vsi_nn_tensor_t* const* const inputs, - vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel - ) -{ - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _tile_exec; - kernel->info.parameters = kernel_param_def; - kernel->info.numParams = _cnt_of_array( kernel_param_def ); - - return VSI_SUCCESS; -} /* _query_kernel() */ - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_SUCCESS; - vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - - status = _query_kernel( inputs, outputs, kernel ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, - inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); - } - else - { - status = VSI_FAILURE; - } - } - return node; -} /* _setup() */ - -REGISTER_BACKEND_CPU( tile, _setup ) - -__END_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c b/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c deleted file mode 100644 index 8c04a2a..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c +++ /dev/null @@ -1,284 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
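
The deleted tile kernel above works from the outermost dimension inward: recursively lay down one tiled copy of the inner block, then replicate that whole block (multiplier - 1) more times. The same recursion with the vsi_nn types swapped for plain size_t (a sketch; dimension 0 is innermost and contiguous):

#include <string.h>

static void copy_n_times( const float * src, size_t n, int times, float * dst )
{
    int t;
    for (t = 0; t < times; t++)
    {
        memcpy( dst, src, n * sizeof(float) );
        dst += n;
    }
}

static void tile_dim
    (
    const size_t * shape,
    const int * mult,
    const float * in,
    float * out,
    int dim,
    size_t * in_sz,   /* out: input elements covered at this level */
    size_t * out_sz   /* out: output elements produced at this level */
    )
{
    size_t i, total_in = 0, total_out = 0;

    if (dim == 0)
    {
        copy_n_times( in, shape[0], mult[0], out );
        *in_sz  = shape[0];
        *out_sz = shape[0] * (size_t)mult[0];
        return;
    }
    for (i = 0; i < shape[dim]; i++)
    {
        tile_dim( shape, mult, in + total_in, out + total_out,
                  dim - 1, in_sz, out_sz );
        total_in  += *in_sz;
        total_out += *out_sz;
    }
    /* Replicate the fully tiled inner block along this dimension. */
    copy_n_times( out, total_out, mult[dim] - 1, out + total_out );
    *in_sz  = total_in;
    *out_sz = total_out * (size_t)mult[dim];
}

Called as tile_dim(shape, mult, in, out, ndim - 1, &s, &t); tiling a [2, 3] input by multipliers {2, 2} produces a [4, 6] output.
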
-* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. - */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (2) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.topk") - - -/* - * Kernel params - */ -static vx_param_description_t _topk_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, - // Add kererl parameters here -}; -#define _TOPK_PARAM_NUM _cnt_of_array( _topk_kernel_param_def ) - -static void _find_top_k_1d -( - float* input, - uint32_t input_len, - uint32_t k, - float* value, - uint32_t* indices -) -{ // Insertion sort - float insert_elem; - uint32_t position,index=0; - uint32_t i, j; - for (i = 0; i < input_len; i++) - { - insert_elem = input[i]; - // Record the position of the target element, - // and start traversing from this position forward - position = i; - index = position; - // Traverse forward from position to find the insertion position of the target element - while (position > 0 && input[position - 1] < insert_elem) - { - // The element at position moves one position backward, index will also move with it - input[position] = input[position - 1]; - indices[position] = indices[position - 1]; - position--; - } - // Insert and record the final position - if (position != i) - { - input[position] = insert_elem; - } - indices[position] = index; - } - for (j = 0; j < k; j++) - { - value[j] = input[j]; - } -} - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i = 0; - int32_t j = 0; - int32_t top_k = 0; - uint32_t block_num = 1; - uint32_t block_size = 0; - uint32_t * indices_ptr = NULL; - - /* prepare data */ - for (i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - - for (i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], 
"Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - status = vsi_nn_kernel_scalar_read_int32( param[3], &top_k ); - CHECK_STATUS_FAIL_GOTO(status, final ); - - for(i = (uint32_t)in_attr[0]->shape->size - 1; i > 0; i--) - { - block_num = block_num * (uint32_t)in_attr[0]->shape->data[i]; - } - - block_size = (uint32_t)in_attr[0]->shape->data[0]; - indices_ptr = (uint32_t*)malloc(block_size * sizeof(uint32_t)); - CHECK_PTR_FAIL_GOTO( indices_ptr, "Create indices buffer fail.", final ); - - for(i = 0; i < block_num; i++) - { - uint32_t in_index = i * block_size; - uint32_t out_index = i * top_k; - _find_top_k_1d(&(f32_in_buffer[0][in_index]), - block_size, top_k, &(f32_out_buffer[0][out_index]), indices_ptr); - - for (j = 0; j < top_k; j++) - { - f32_out_buffer[1][out_index + j] = (float)indices_ptr[j]; - } - } - // Handle the 1D input - if (!block_num) - { - _find_top_k_1d(&(f32_in_buffer[0][0]), - block_size, top_k, &(f32_out_buffer[0][0]), indices_ptr); - for (j = 0; j < top_k; j++) - { - f32_out_buffer[1][j] = (float)indices_ptr[j]; - } - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - vsi_nn_safe_free(indices_ptr); - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for (i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _topk_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _topk_kernel_param_def ); - status = VSI_SUCCESS; - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_TOPK_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k"); - - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _TOPK_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &top_k ); - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _TOPK_PARAM_NUM ); - - vsi_nn_kernel_scalar_release( &node_params[3] ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( topk, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/upsample_cpu.c b/src/tim/vx/internal/src/kernel/cpu/upsample_cpu.c deleted file mode 100644 index 77b16a2..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/upsample_cpu.c +++ /dev/null @@ -1,266 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (2) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.upsample") - - -/* - * Kernel params - */ -static vx_param_description_t _upsample_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _UPSAMPLE_PARAM_NUM _cnt_of_array( _upsample_kernel_param_def ) - -#define SCALAR_KSZIE_X (3) -#define SCALAR_KSZIE_Y (4) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - vsi_ssize_t i, j, b, p; - vsi_ssize_t batch, depth, height, width, height_o, width_o; - vsi_ssize_t input_base = 0; - vsi_ssize_t output_base = 0; - int32_t ksize_x = 0; - int32_t ksize_y = 0; - vsi_bool is_relative_coord = FALSE; - vsi_nn_kernel_dtype_e input1_dtype; - - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - if (1 == i) - { - in_attr[1]->quant = VSI_NN_KERNEL_QUANT_NONE; - } - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_KSZIE_X], &ksize_x); - status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_KSZIE_Y], &ksize_y); - - batch = in_attr[0]->shape->size > 3 ? in_attr[0]->shape->data[3] : 1; - depth = in_attr[0]->shape->size > 2 ? 
in_attr[0]->shape->data[2] : 1; - width = in_attr[0]->shape->data[0]; - height = in_attr[0]->shape->data[1]; - width_o = out_attr[0]->shape->data[0]; - height_o = out_attr[0]->shape->data[1]; - input1_dtype = in_attr[1]->dtype; - - if ((I8 == input1_dtype) || (U8 == input1_dtype) || (I16 == input1_dtype)) - { - is_relative_coord = TRUE; - } - - - for(b = 0; b < batch; b++) - { - for (p = 0; p < depth; p ++) - { - input_base = b * depth * height * width + p * height * width; - output_base = b * depth * height_o * width_o + p * height_o * width_o; - for (j = 0; j < height; j ++) - { - for (i = 0; i < width; i ++) - { - vsi_ssize_t in_index = input_base + j * width + i; - float in_value = f32_in_buffer[0][in_index]; - vsi_ssize_t up_index = (vsi_ssize_t)f32_in_buffer[1][in_index]; - vsi_ssize_t out_index = up_index; - if (is_relative_coord) - { - vsi_ssize_t relative_y = up_index / ksize_x; - vsi_ssize_t relative_x = up_index % ksize_x; - out_index = output_base + ((j * ksize_y) + relative_y) * width_o + i * ksize_x + relative_x; - } - f32_out_buffer[0][out_index] = in_value; - } - } - } - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _upsample_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _upsample_kernel_param_def ); - status = VSI_SUCCESS; - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_UPSAMPLE_PARAM_NUM] = {NULL}; - int32_t scale_x = 0; - int32_t scale_y = 0; - vsi_nn_kernel_node_t node = NULL; - - scale_x = vsi_nn_kernel_param_get_int32(params, "scale_x"); - scale_y = vsi_nn_kernel_param_get_int32(params, "scale_y"); - - status = _query_kernel( kernel, inputs, outputs ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _UPSAMPLE_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_KSZIE_X] = vsi_nn_kernel_scalar_create( - graph, I32, &scale_x ); - node_params[SCALAR_KSZIE_Y] = vsi_nn_kernel_scalar_create( - graph, I32, &scale_y ); - /* Pass parameters to node. 
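
The deleted upsample kernel above is max-unpooling: the second input carries, per element, either an absolute flat output index or a coordinate relative to the ksize_x by ksize_y pooling window (relative whenever the index tensor is I8, U8, or I16). The relative-coordinate decomposition from the inner loop, as a plain-C sketch:

/* Flat output index for one pooled element at (i, j), given a
 * window-relative up_index. */
static long unpool_out_index
    (
    long i, long j,        /* input column / row */
    long up_index,         /* value read from the index tensor */
    long ksize_x, long ksize_y,
    long width_o,          /* output row pitch */
    long output_base       /* start of this (batch, channel) plane */
    )
{
    long rel_y = up_index / ksize_x;   /* row within the window */
    long rel_x = up_index % ksize_x;   /* column within the window */
    return output_base + (j * ksize_y + rel_y) * width_o
                       + i * ksize_x + rel_x;
}
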
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _UPSAMPLE_PARAM_NUM ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_KSZIE_X] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_KSZIE_Y] ); - } - } - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( upsample, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/upsamplescale_cpu.c b/src/tim/vx/internal/src/kernel/cpu/upsamplescale_cpu.c deleted file mode 100644 index b3b4bb4..0000000 --- a/src/tim/vx/internal/src/kernel/cpu/upsamplescale_cpu.c +++ /dev/null @@ -1,260 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.upsamplescale") - - -/* - * Kernel params - */ -static vx_param_description_t _upsamplescale_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - // Add kererl parameters here -}; -#define _UPSAMPLESCALE_PARAM_NUM _cnt_of_array( _upsamplescale_kernel_param_def ) - -#define SCALAR_STRIDE_VALUE (2) -#define SCALAR_SCALE_VALUE (3) - -/* - * Kernel function - */ -DEF_KERNEL_EXECUTOR(_compute) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; - float *f32_in_buffer[_INPUT_NUM] = {NULL}; - float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; - vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - vsi_size_t out_elements[_OUTPUT_NUM] = {0}; - vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; - vsi_ssize_t i = 0; - int32_t stride = 0; - float scale = 0.0f; - vsi_ssize_t width = 0; - vsi_ssize_t height = 0; - vsi_ssize_t out_width = 0; - vsi_ssize_t out_height = 0; - vsi_ssize_t outerSize = 1; - vsi_ssize_t x = 0; - vsi_ssize_t y = 0; - - /* prepare data */ - for(i = 0; i < _INPUT_NUM; i ++) - { - input[i] = (vsi_nn_kernel_tensor_t)param[i]; - in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); - f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); - } - for(i = 0; i < _OUTPUT_NUM; i ++) - { - output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; - out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); - vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); - out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); - out_bytes[i] = out_elements[i] * sizeof(float); - f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); - CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); - memset( f32_out_buffer[i], 0, out_bytes[i] ); - } - - vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_STRIDE_VALUE], &stride); - vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCALE_VALUE], &scale); - - width = in_attr[0]->shape->data[0]; - height = in_attr[0]->shape->data[1]; - for (i = 2; i < (vsi_ssize_t)in_attr[0]->shape->size; i++) - { - outerSize *= in_attr[0]->shape->data[i]; - } - - out_width = out_attr[0]->shape->data[0]; - out_height = out_attr[0]->shape->data[1]; - - for (i = 0; i < outerSize; i++) - { - for (y = 0; y < height; y++) - { - for (x = 0; x < width; x++) - { - vsi_ssize_t in_idx = i * width * height + y * width + x; - vsi_ssize_t base_idx = i * out_width * out_height - + y * stride * out_width + x * stride; - vsi_ssize_t dx = 0; - vsi_ssize_t dy = 0; - float data = f32_in_buffer[0][in_idx] * scale; - - for (dy = 0; dy < stride; dy++) - { - for (dx = 0; dx < stride; dx++) - { - vsi_ssize_t idx = base_idx + dy * out_width + dx; - - f32_out_buffer[0][idx] = data; - } - } - } - } - } - - /* save data */ - for(i = 0; i < _OUTPUT_NUM; i++) - { - status = 
vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], - f32_out_buffer[i], out_elements[i] ); - CHECK_STATUS_FAIL_GOTO( status, final ); - } - -final: - for (i = 0; i < _INPUT_NUM; i++) - { - if (f32_in_buffer[i]) - { - free(f32_in_buffer[i]); - f32_in_buffer[i] = NULL; - } - if (in_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); - } - } - for(i = 0; i < _OUTPUT_NUM; i++) - { - if (f32_out_buffer[i]) - { - free(f32_out_buffer[i]); - f32_out_buffer[i] = NULL; - } - if (out_attr[i]) - { - vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); - } - } - - return status; -} /* _compute() */ - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - /* Add extra params */ - ) -{ - vsi_status status = VSI_SUCCESS; - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); - kernel->info.function = _compute; - kernel->info.parameters = _upsamplescale_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _upsamplescale_kernel_param_def ); - - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_UPSAMPLESCALE_PARAM_NUM]; - vsi_nn_kernel_node_t node = NULL; - int32_t stride = 0; - float scale = 1.0f; - - stride = vsi_nn_kernel_param_get_int32(params, "stride"); - scale = vsi_nn_kernel_param_get_float32(params, "scale"); - - status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); - if( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _UPSAMPLESCALE_PARAM_NUM, - inputs, input_num, outputs, output_num ); - - node_params[SCALAR_STRIDE_VALUE] = vsi_nn_kernel_scalar_create( - graph, I32, &stride ); - node_params[SCALAR_SCALE_VALUE] = vsi_nn_kernel_scalar_create( - graph, F32, &scale ); - /* Pass parameters to node. 
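
upsamplescale above fuses nearest-neighbour upsampling by an integer stride with a constant multiplier: each input element is scaled and broadcast into a stride-by-stride output block. The per-plane core, as a plain-C sketch:

static void upsample_scale_plane
    (
    const float * in,
    float * out,
    long width, long height,   /* input plane size */
    int stride,
    float scale
    )
{
    long x, y, out_w = width * (long)stride;
    int dx, dy;

    for (y = 0; y < height; y++)
    {
        for (x = 0; x < width; x++)
        {
            float v = in[y * width + x] * scale;
            for (dy = 0; dy < stride; dy++)
            {
                for (dx = 0; dx < stride; dx++)
                {
                    out[(y * stride + dy) * out_w + (x * stride + dx)] = v;
                }
            }
        }
    }
}
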
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _UPSAMPLESCALE_PARAM_NUM ); - - vsi_nn_kernel_scalar_release( &node_params[SCALAR_STRIDE_VALUE] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_VALUE] ); - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CPU( upsamplescale, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c b/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c new file mode 100644 index 0000000..01ea2ab --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c @@ -0,0 +1,726 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum +{ + INTERNAL_KERNEL_BILINEAR_GRID_SAMPLE, +} _internal_kernel_e; + +#define STR(a) #a + +#define _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(_input_type, _output_type) \ + "bilinear_grid_sample_" #_input_type "_to_" #_output_type + +// Add kernel hashtable here +#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + ((IN1_DTYPE << 20) | (IN0_DTYPE << 8) | (OUT_DTYPE)) +#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + { \ + BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \ + CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \ + _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \ + } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _bilinear_grid_sample_kernel_map[] = +{ + PACK_KERNEL_MAP(F16, F32, F16), + PACK_KERNEL_MAP(F16, U8, F16), + PACK_KERNEL_MAP(F16, F16, F16), + PACK_KERNEL_MAP(F16, F32, U8), + PACK_KERNEL_MAP(F16, F16, U8), + PACK_KERNEL_MAP(F16, U8, U8), + PACK_KERNEL_MAP(U8, U8, U8), + PACK_KERNEL_MAP(U8, F16, U8), + PACK_KERNEL_MAP(U8, F32, U8), + PACK_KERNEL_MAP(I16, I16, I16), + PACK_KERNEL_MAP(I8, I8, I8), + PACK_KERNEL_MAP(BF16, BF16, BF16), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _bilinear_grid_sample_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _BILINEAR_GRID_SAMPLE_PARAM_NUM _cnt_of_array( _bilinear_grid_sample_kernel_param_def ) + +#define SCALAR_ALIGN_CORNERS (3) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = {3, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}; + vsi_nn_kernel_tensor_attr_t* output_attr = NULL; + vsi_nn_kernel_tensor_attr_t* input_attr[2] = {NULL}; + vsi_size_array_t* out_shape = NULL; + vsi_size_array_t* in0_shape = NULL; + vsi_nn_kernel_dtype_e input0_dtype = F16; + vsi_nn_kernel_dtype_e input1_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = F16; + + uint32_t depth = 0; + float half_input0_wh[2]; + float add_float_value[2]; + uint32_t in0_width; + uint32_t in0_height; + uint32_t out_width; + uint32_t out_height; + int32_t align_corners; + + int32_t src0FixPointPos = 0; + int32_t src1FixPointPos = 0; + int32_t dstFixPointPos = 0; + float input0_scale = 1.0; + int32_t input0ZP = 0; + float input1_scale = 1.0; + int32_t input1ZP = 0; + float output_scale = 1.0; + int32_t outputZP = 0; + + input_attr[0] = + vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[0]); + CHECK_PTR_FAIL_GOTO( + input_attr[0], "Create tensor attr buffer fail.", final); + + input_attr[1] = + vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[1]); + CHECK_PTR_FAIL_GOTO( + input_attr[1], "Create tensor attr buffer fail.", final); + + output_attr = + vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]); + CHECK_PTR_FAIL_GOTO(output_attr, "Create tensor attr buffer fail.", final); + + status = vsi_nn_kernel_scalar_read_int32( + (vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners)); + CHECK_STATUS_FAIL_GOTO(status, final); + + out_shape = output_attr->shape; + 
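
The align_corners scalar read above selects between the two standard grid_sample coordinate mappings; the half_input0_wh and add_float_value constants derived from it a few lines below implement exactly ix = gx * half + add per axis. Equivalently, as a plain-C sketch:

static float grid_to_pixel
    (
    float g,          /* normalized grid coordinate in [-1, 1] */
    float in_size,    /* input width or height */
    int align_corners
    )
{
    float half = align_corners ? (in_size - 1.0f) * 0.5f : in_size * 0.5f;
    float add  = align_corners ? half : half - 0.5f;
    return g * half + add;
}

With align_corners set, g = -1 and g = +1 land exactly on the centers of pixels 0 and in_size - 1; without it, they map to -0.5 and in_size - 0.5, i.e. the outer pixel edges.
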
in0_shape = input_attr[0]->shape; + input0_dtype = input_attr[0]->dtype; + input1_dtype = input_attr[1]->dtype; + output_dtype = output_attr->dtype; + + if (U8 == input0_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant) { + input0_scale = input_attr[0]->asymm.scale; + input0ZP = input_attr[0]->asymm.zero_point; + } else if (VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant) { + src0FixPointPos = input_attr[0]->dfp.fl; + if (src0FixPointPos >= 0) { + input0_scale = 1.0f / (float)((int64_t)1 << src0FixPointPos); + } else if (src0FixPointPos < 0) { + input0_scale = (float)((int64_t)1 << -src0FixPointPos); + } + input0ZP = 0; + } else { + input0_scale = 1.0f; + input0ZP = 0; + } + + if (U8 == input1_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr[1]->quant) { + input1_scale = input_attr[1]->asymm.scale; + input1ZP = input_attr[1]->asymm.zero_point; + } else if (VSI_NN_KERNEL_QUANT_DFP == input_attr[1]->quant) { + src1FixPointPos = input_attr[1]->dfp.fl; + if (src1FixPointPos >= 0) { + input1_scale = 1.0f / (float)((int64_t)1 << src1FixPointPos); + } else if (src1FixPointPos < 0) { + input1_scale = (float)((int64_t)1 << -src1FixPointPos); + } + input1ZP = 0; + } else { + input1_scale = 1.0f; + input1ZP = 0; + } + + if (U8 == output_dtype && VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant) { + output_scale = output_attr->asymm.scale; + outputZP = output_attr->asymm.zero_point; + } else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) { + dstFixPointPos = output_attr->dfp.fl; + if (dstFixPointPos >= 0) { + output_scale = (float)((int64_t)1 << dstFixPointPos); + } else if (dstFixPointPos < 0) { + output_scale = 1.0f / (float)((int64_t)1 << -dstFixPointPos); + } + outputZP = 0; + } else { + output_scale = 1.0; + outputZP = 0; + } + + + in0_width = (uint32_t)(in0_shape->data[0]); + in0_height = (uint32_t)(in0_shape->data[1]); + depth = (uint32_t)(in0_shape->data[2]); + out_width = (uint32_t)(out_shape->data[0]); + out_height = (uint32_t)(out_shape->data[1]); + + if (align_corners) { + half_input0_wh[0] = ((float)in0_width - 1.0f) * 0.5f; + half_input0_wh[1] = ((float)in0_height - 1.0f) * 0.5f; + add_float_value[0] = half_input0_wh[0]; + add_float_value[1] = half_input0_wh[1]; + } else { + half_input0_wh[0] = (float)in0_width * 0.5f; + half_input0_wh[1] = (float)in0_height * 0.5f; + add_float_value[0] = half_input0_wh[0] - 0.5f; + add_float_value[1] = half_input0_wh[1] - 0.5f; + } + + status = vsi_nn_kernel_gpu_add_param(node, "half_input0_wh", half_input0_wh); + status |= vsi_nn_kernel_gpu_add_param(node, "add_float_value", add_float_value); + status |= vsi_nn_kernel_gpu_add_param(node, "depth", &depth); + + { + gpu_dp_inst_t uniFp16toFp32_part0_4x4 = { + { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, + GPU_DP_TYPE_16}; + gpu_dp_inst_t uniFp16toFp32_part1_4x4 = { + { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, + GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8SubZPtoFp32_part0_4x4 = { + { + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, 
// BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8SubZPtoFp32_part1_4x4 = { + { + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniEvenBintoFp32_4x4 = { + { + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniOddSubEvenBin_4x4 = { + { + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00230001, 0x00670045, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtactHalf8_2x8 = { + { + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniExtact8Bit_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + if (F16 == input0_dtype && + (F16 == input1_dtype || F32 == input1_dtype || + U8 == input1_dtype) && + F16 == output_dtype) { + status |= vsi_nn_kernel_gpu_add_param( + node, "uniEvenBintoFp32_4x4", &uniEvenBintoFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniOddSubEvenBin_4x4", &uniOddSubEvenBin_4x4); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniExtactHalf8_2x8", &uniExtactHalf8_2x8); + if (F16 == input1_dtype) { + status |= vsi_nn_kernel_gpu_add_param( + node, "uniFp16toFp32_part0_4x4", &uniFp16toFp32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniFp16toFp32_part1_4x4", &uniFp16toFp32_part1_4x4); + } else if (U8 == input1_dtype) { + status |= + vsi_nn_kernel_gpu_add_param(node, "input1_ZP", &input1ZP); + status |= vsi_nn_kernel_gpu_add_param( + node, "input1Scale", &input1_scale); + status |= + vsi_nn_kernel_gpu_add_param(node, + "uniU8SubZPtoFp32_part0_4x4", + &uniU8SubZPtoFp32_part0_4x4); + status |= + vsi_nn_kernel_gpu_add_param(node, + "uniU8SubZPtoFp32_part1_4x4", + &uniU8SubZPtoFp32_part1_4x4); + } + } else if (F16 == input0_dtype && + (F16 == input1_dtype || F32 == input1_dtype || + U8 == input1_dtype) && + U8 == output_dtype) { + float uint8Scale = 1.0f / output_scale; + float uint8ZP_out = (float)outputZP; + status |= vsi_nn_kernel_gpu_add_param(node, "uint8Scale", &uint8Scale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &uint8ZP_out); + status |= vsi_nn_kernel_gpu_add_param( + 
node, "uniEvenBintoFp32_4x4", &uniEvenBintoFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniOddSubEvenBin_4x4", &uniOddSubEvenBin_4x4); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); + if (U8 == input1_dtype) { + status |= + vsi_nn_kernel_gpu_add_param(node, "input1_ZP", &input1ZP); + status |= vsi_nn_kernel_gpu_add_param( + node, "input1Scale", &input1_scale); + status |= + vsi_nn_kernel_gpu_add_param(node, + "uniU8SubZPtoFp32_part0_4x4", + &uniU8SubZPtoFp32_part0_4x4); + status |= + vsi_nn_kernel_gpu_add_param(node, + "uniU8SubZPtoFp32_part1_4x4", + &uniU8SubZPtoFp32_part1_4x4); + } else if (F16 == input1_dtype) { + status |= vsi_nn_kernel_gpu_add_param( + node, "uniFp16toFp32_part0_4x4", &uniFp16toFp32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniFp16toFp32_part1_4x4", &uniFp16toFp32_part1_4x4); + } + } + else if (U8 == input0_dtype && + (F16 == input1_dtype || F32 == input1_dtype || + U8 == input1_dtype) && + U8 == output_dtype) { + float uint8Scale = input0_scale / output_scale; + float uint8ZP_out = (float)outputZP; + gpu_dp_inst_t uniU8SubZPtoFp32_left_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00020000, 0x00060004, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8RightSubLeft_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00230001, 0x00670045, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP", &input0ZP); + status |= vsi_nn_kernel_gpu_add_param(node, "uint8Scale", &uint8Scale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &uint8ZP_out); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_left_4x4", &uniU8SubZPtoFp32_left_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8RightSubLeft_4x4", &uniU8RightSubLeft_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); + if (U8 == input1_dtype) { + status |= vsi_nn_kernel_gpu_add_param(node, "input1_ZP", &input1ZP); + status |= vsi_nn_kernel_gpu_add_param(node, "input1Scale", &input1_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part0_4x4", &uniU8SubZPtoFp32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniU8SubZPtoFp32_part1_4x4", &uniU8SubZPtoFp32_part1_4x4); + } + else if (F16 == input1_dtype) { + status |= vsi_nn_kernel_gpu_add_param( + node, "uniFp16toFp32_part0_4x4", &uniFp16toFp32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniFp16toFp32_part1_4x4", &uniFp16toFp32_part1_4x4); + } + } + else if (BF16 == input0_dtype && BF16 == input1_dtype && + BF16 == output_dtype) { + gpu_dp_inst_t uniBF16toFp32_part0_2x8 = { + { + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniBF16toFp32_part1_2x8 = { + { + 0x11111111, // TCfg + 0x01010101, // 
ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_odd_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x02050004, 0x06070406, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_even_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x03050104, 0x07070506, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + status |= vsi_nn_kernel_gpu_add_param( + node, "uniBF16toFp32_part0_2x8", &uniBF16toFp32_part0_2x8); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniBF16toFp32_part1_2x8", &uniBF16toFp32_part1_2x8); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniConvBF16toF32_odd_2x8", &uniConvBF16toF32_odd_2x8); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniConvBF16toF32_even_2x8", &uniConvBF16toF32_even_2x8); + } + else if (((I16 == input0_dtype && I16 == input1_dtype && + I16 == output_dtype)) || + ((I8 == input0_dtype && I8 == input1_dtype && + I8 == output_dtype))) { + float dfpScale = input0_scale * output_scale; + gpu_dp_inst_t uniDFPtoFp32_part0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDFPtoFp32_part1_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDFPtoFp32_left_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniRightSubLeft_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00230001, 0x00670045, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + status |= vsi_nn_kernel_gpu_add_param(node, "input1_scale", &input1_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "dfpScale", &dfpScale); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDFPtoFp32_part0_4x4", &uniDFPtoFp32_part0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDFPtoFp32_part1_4x4", &uniDFPtoFp32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDFPtoFp32_left_4x4", 
&uniDFPtoFp32_left_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); + } + else { + VSILOGE("input or output's format is not support"); + status = VSI_FAILURE; + } + } + CHECK_STATUS_FAIL_GOTO(status, final); + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = 2; + gpu_param.global_size[0] = gpu_align_p2( + (out_width + gpu_param.global_scale[0] - 1) / + gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ((out_height + gpu_param.global_scale[1] - 1) / + gpu_param.global_scale[1]); + + status = vsi_nn_kernel_gpu_config(node, &gpu_param); + + final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) \ + if (_PTR) { \ + vsi_nn_kernel_tensor_attr_release(&_PTR); \ + _PTR = NULL; \ + } + SAFE_FREE_TENSOR_ATTR(output_attr); + SAFE_FREE_TENSOR_ATTR(input_attr[0]); + SAFE_FREE_TENSOR_ATTR(input_attr[1]); + + return status; + +} /* _bilinear_grid_sample_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype, in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _bilinear_grid_sample_kernel_map; + size_t kernel_map_size = _cnt_of_array( _bilinear_grid_sample_kernel_map ); + vx_param_description_t * param_def = _bilinear_grid_sample_kernel_param_def; + vx_kernel_initialize_f initializer = _bilinear_grid_sample_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _bilinear_grid_sample_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_BILINEAR_GRID_SAMPLE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + uint32_t final_in1_rank = 0; + vsi_nn_tensor_t* rs_tensors = NULL; + vsi_nn_tensor_t* final_tensors[3] = {NULL}; + vsi_nn_kernel_dtype_e in0_dtype; + uint32_t pad_val = 0; + int32_t align_corners = + vsi_nn_kernel_param_get_int32(params, "align_corners"); + + // Check if gpu can support the size + if 
(!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size, + inputs[0]->attr.dim_num)) { + return NULL; + } + + if (!vsi_nn_kernel_gpu_check_shape(inputs[1]->attr.size, + inputs[1]->attr.dim_num)) { + return NULL; + } + + final_tensors[0] = inputs[0]; + + if (inputs[1]->attr.dim_num >= 3) { + + final_shape[0] = inputs[1]->attr.size[1] * inputs[1]->attr.size[0]; + final_shape[1] = inputs[1]->attr.size[2]; + final_shape[2] = 1; + final_shape[3] = inputs[1]->attr.dim_num > 3 ? inputs[1]->attr.size[3] : 1; + final_in1_rank = + inputs[1]->attr.dim_num == 3 ? 2 : inputs[1]->attr.dim_num; + if (!vsi_nn_kernel_gpu_check_shape(final_shape, final_in1_rank)) { + return NULL; + } + + rs_tensors = vsi_nn_reshape_tensor(graph, inputs[1], final_shape, final_in1_rank); + final_tensors[1] = rs_tensors; + } else { + final_tensors[1] = inputs[1]; + } + final_tensors[2] = outputs[0]; + + in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type); + if (U8 == in0_dtype) { + pad_val = inputs[0]->attr.dtype.zero_point; + } + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _BILINEAR_GRID_SAMPLE_PARAM_NUM, + final_tensors, input_num, &final_tensors[2], output_num ); + node_params[SCALAR_ALIGN_CORNERS] = + vsi_nn_kernel_scalar_create(graph, I32, &align_corners); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _BILINEAR_GRID_SAMPLE_PARAM_NUM ); + VSI_ASSERT(status == VSI_SUCCESS); + vsi_nn_kernel_scalar_release(&node_params[SCALAR_ALIGN_CORNERS]); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U32 = pad_val; + status = vxSetNodeAttribute( + (vx_node)node, VX_NODE_BORDER, &border, sizeof(border)); + CHECK_STATUS(status); + } + } + } + + vsi_safe_release_tensor(rs_tensors); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( bilinear_grid_sample, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/clip_evis.c b/src/tim/vx/internal/src/kernel/evis/clip_evis.c index 1140cd5..87784bf 100644 --- a/src/tim/vx/internal/src/kernel/evis/clip_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/clip_evis.c @@ -35,6 +35,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" __BEGIN_DECLS @@ -579,15 +580,32 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; float min_value = vsi_nn_kernel_param_get_float32( params, "min_value" ); float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" ); + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t new_rank = 0; + vsi_bool ret = TRUE; - if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, - inputs[0]->attr.dim_num ) ) + ret = vsi_nn_kernel_optimize_element_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank); + + if ( !ret ) { return NULL; } - image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); - status = _query_kernel( kernel, inputs, outputs, image_2d ); + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shape, new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shape, new_rank ); + + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size, + 
reshape_tensors[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1); + status = _query_kernel( kernel, reshape_tensors, &reshape_tensors[1], image_2d ); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -595,7 +613,7 @@ static vsi_nn_kernel_node_t _setup { /* Set inputs and outputs */ vsi_nn_kernel_node_pack_io( node_params, _CLIP_PARAM_NUM, - inputs, input_num, outputs, output_num ); + reshape_tensors, input_num, &reshape_tensors[1], output_num ); node_params[SCALAR_MIN_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &min_value ); node_params[SCALAR_MAX_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &max_value ); /* Pass parameters to node. */ @@ -605,6 +623,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_MAX_VALUE] ); } } + + vsi_safe_release_tensor( reshape_tensors[0] ); + vsi_safe_release_tensor( reshape_tensors[1] ); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c index dbbe2ad..2fb8330 100644 --- a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c @@ -34,6 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" __BEGIN_DECLS @@ -520,30 +521,91 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_EVIS_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - int32_t operation = 0; + int32_t operation = vsi_nn_kernel_param_get_int32( params, "operation" ); + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t new_rank = 0; + vsi_bool ret = FALSE; - if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + ret = vsi_nn_kernel_optimize_eltwise_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + if ( ret ) { - return NULL; + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], new_rank ); + +#define _swap_tensor(a, b, tmp) \ + do { \ + tmp = a; \ + a = b; \ + b = tmp; \ + } while(0) + + if (shapes[1][3] > shapes[0][3] && new_rank == 4) + { + vsi_nn_tensor_t* reshape_tmp; + _swap_tensor(reshape_tensors[0], reshape_tensors[1], reshape_tmp); + + if (VSI_NN_RELATIONAL_OPS_GREAT == operation) + { + operation = VSI_NN_RELATIONAL_OPS_LESS; + } + else if (VSI_NN_RELATIONAL_OPS_LESS == operation) + { + operation = VSI_NN_RELATIONAL_OPS_GREAT; + } + else if (VSI_NN_RELATIONAL_OPS_GREAT_EQUAL == operation) + { + operation = VSI_NN_RELATIONAL_OPS_LESS_EQUAL; + } + else if (VSI_NN_RELATIONAL_OPS_LESS_EQUAL == operation) + { + operation = VSI_NN_RELATIONAL_OPS_GREAT_EQUAL; + } + } + +#undef _swap_tensor + } + else + { + goto final; } - operation = vsi_nn_kernel_param_get_int32( params, "operation" ); + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size, + reshape_tensors[2]->attr.dim_num ) ) + { + goto final; + } - image_2d = (outputs[0]->attr.dim_num == 2); - 
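
+ /* When broadcasting makes the operands swap (the second input carries the
+  * larger batch dimension in shapes[1][3]), the relational direction must
+  * flip with them: a > b is b < a, so GREAT/LESS and GREAT_EQUAL/LESS_EQUAL
+  * are exchanged above, while EQUAL and NOT_EQUAL are symmetric and stay. */
-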
status = _query_kernel( inputs, outputs, operation, image_2d, kernel ); - if( VSI_SUCCESS == status) + image_2d = (reshape_tensors[2]->attr.dim_num == 2 || reshape_tensors[2]->attr.size[2] == 1); + + status = _query_kernel( reshape_tensors, &reshape_tensors[2], operation, image_2d, kernel ); + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { /* Pass parameters to node. */ vsi_nn_kernel_node_pack_io( node_params, _EVIS_PARAM_NUM, - inputs, 2, outputs, 1 ); + reshape_tensors, 2, &reshape_tensors[2], 1 ); status = vsi_nn_kernel_node_pass_param( node, node_params, _EVIS_PARAM_NUM ); } } + +final: + vsi_safe_release_tensor( reshape_tensors[0] ); + vsi_safe_release_tensor( reshape_tensors[1] ); + vsi_safe_release_tensor( reshape_tensors[2] ); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c index dbbfc6e..23b1433 100644 --- a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c @@ -56,6 +56,10 @@ typedef enum UNARY_RCP, UNARY_SIGN, UNARY_SOFTSIGN, + UNARY_ATAN, + UNARY_ATANH, + UNARY_ACOSH, + UNARY_INVERSE_SIGMOID, } unary_type_e; /* @@ -100,6 +104,10 @@ typedef enum #define RCP_OPERATION rcp #define SIGN_OPERATION sign #define SOFTSIGN_OPERATION softsign +#define ATAN_OPERATION atan +#define ATANH_OPERATION atanh +#define ACOSH_OPERATION acosh +#define INVERSE_SIGMOID_OPERATION inverse_sigmoid #define ADD_UNARY_SH_KERNELS(name, source) \ TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, BF16, BF16, source##_3D) \ @@ -142,12 +150,16 @@ static const struct { ADD_UNARY_SH_KERNELS(RCP, KERNEL_SOURCE1) ADD_UNARY_SH_KERNELS(SIGN, KERNEL_SOURCE1) ADD_UNARY_SH_KERNELS(SOFTSIGN, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(ATAN, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(ATANH, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(ACOSH, KERNEL_SOURCE1) ADD_UNARY_SH_KERNELS(HSIGMOID, KERNEL_SOURCE0) ADD_UNARY_SH_KERNELS(MISH, KERNEL_SOURCE0) ADD_UNARY_SH_KERNELS(ROUND, KERNEL_SOURCE0) ADD_UNARY_SH_KERNELS(GELU, KERNEL_SOURCE0) ADD_UNARY_SH_KERNELS(HGELU, KERNEL_SOURCE0) + ADD_UNARY_SH_KERNELS(INVERSE_SIGMOID, KERNEL_SOURCE0) }; #undef SIN_OPERATION @@ -299,6 +311,10 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) case _PACK_SELECT_KEY( UNARY_RCP, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_SIGN, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_SOFTSIGN, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_ATAN, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_ATANH, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_ACOSH, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_INVERSE_SIGMOID, BF16, BF16 ): { gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ 0x11111111, // TCfg @@ -608,5 +624,9 @@ REGISTER_ELTWISE_UNARY_BACKEND_EVIS( celu, UNARY_CELU ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( rcp, UNARY_RCP ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( sign, UNARY_SIGN ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( softsign, UNARY_SOFTSIGN ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( atan, UNARY_ATAN ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( atanh, UNARY_ATANH ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( acosh, UNARY_ACOSH ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( inverse_sigmoid, UNARY_INVERSE_SIGMOID ) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c index 05362bb..355e908 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c +++ 
b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c @@ -47,6 +47,8 @@ __BEGIN_DECLS #define KERNEL_SOURCE_4 "gather_nd_mix" #define KERNEL_SOURCE_5 "gather_nd_2d_mix" #define KERNEL_SOURCE_6 "gather_nd_3d_mix" +#define KERNEL_SOURCE_7 "gather_nd_batch" +#define KERNEL_SOURCE_8 "gather_nd_batch_2d" typedef enum { @@ -56,8 +58,8 @@ __BEGIN_DECLS _3D } vsi_nn_kernel_coord_type_e; -#define HASH_GATHER_ND_KEY(_input0_type, _output_type, _coord_dim, _quant_type) \ - ((_input0_type << 24) | (_output_type << 16) | (_coord_dim << 8) | (_quant_type)) +#define HASH_GATHER_ND_KEY(_input0_type, _output_type, _coord_dim, _batch_dim) \ + ((_input0_type << 24) | (_output_type << 16) | (_coord_dim << 8) | (_batch_dim)) #define HASH_GATHER_ND_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \ CVIVANTE_NAMESPACE("evis.gather_nd_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE) @@ -67,6 +69,14 @@ __BEGIN_DECLS HASH_GATHER_ND_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \ SOURCE }, +#define HASH_GATHER_ND_BATCH_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \ + CVIVANTE_NAMESPACE("evis.gather_nd_batch_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE) + +#define TENSOR_GATHER_ND_BATCH_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \ + { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 1), \ + HASH_GATHER_ND_BATCH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -106,6 +116,15 @@ static const struct { TENSOR_GATHER_ND_KERNELS(F16, I32, I16, _3D, KERNEL_SOURCE_6) TENSOR_GATHER_ND_KERNELS(U8, I32, F16, _3D, KERNEL_SOURCE_6) TENSOR_GATHER_ND_KERNELS(F16, I32, U8, _3D, KERNEL_SOURCE_6) + + TENSOR_GATHER_ND_BATCH_KERNELS(I8, I32, I8, _1D, KERNEL_SOURCE_7) + TENSOR_GATHER_ND_BATCH_KERNELS(U8, I32, U8, _1D, KERNEL_SOURCE_7) + TENSOR_GATHER_ND_BATCH_KERNELS(I16, I32, I16, _1D, KERNEL_SOURCE_7) + TENSOR_GATHER_ND_BATCH_KERNELS(F16, I32, F16, _1D, KERNEL_SOURCE_7) + TENSOR_GATHER_ND_BATCH_KERNELS(I8, I32, I8, _2D, KERNEL_SOURCE_8) + TENSOR_GATHER_ND_BATCH_KERNELS(U8, I32, U8, _2D, KERNEL_SOURCE_8) + TENSOR_GATHER_ND_BATCH_KERNELS(I16, I32, I16, _2D, KERNEL_SOURCE_8) + TENSOR_GATHER_ND_BATCH_KERNELS(F16, I32, F16, _2D, KERNEL_SOURCE_8) }; /* @@ -128,7 +147,8 @@ static vsi_status get_gather_nd_tensor_reshape_size vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], vsi_size_t block_size, uint32_t coordDim, - int32_t* newDim + int32_t* newDim, + int32_t batch_dims ) { vsi_status status = VSI_FAILURE; @@ -139,45 +159,63 @@ static vsi_status get_gather_nd_tensor_reshape_size #define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; - for(i = 0; i < dims_num; ++i) + for (i = 0; i < dims_num; ++i) { elementCnt *= input_size[i]; } - for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) { sizes[i] = 1; } - if(coordDim) // input reshape + if (coordDim) // input reshape { - uint32_t offset = dims_num - coordDim + 1; - for(i = coordDim-1; i > 0; i--) - { - sizes[i] = input_size[i + offset - 1]; - } - for(i = 0; i < offset; i++) - { - sizes[0] *= input_size[i]; - } + uint32_t offset = dims_num - coordDim + 1 - batch_dims; - newDim[0] = coordDim; - if(coordDim == 1) + if (batch_dims) { - newDim[0] = 2; - sizes[0] = block_size; - sizes[1] = elementCnt / block_size; + for (i = 0; i < offset; i++) + { + sizes[0] *= input_size[i]; + } + + for (i = 0; i < coordDim; i++) + { + sizes[i + 1] = input_size[i + offset]; + } + + newDim[0] = coordDim == 1 ? 
2 : 3; } - else if(coordDim == 4) + else { - newDim[0] = 3; + for (i = coordDim-1; i > 0; i--) + { + sizes[i] = input_size[i + offset - 1]; + } + for (i = 0; i < offset; i++) + { + sizes[0] *= input_size[i]; + } + + newDim[0] = coordDim; + if (coordDim == 1) + { + newDim[0] = 2; + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + } + else if (coordDim == 4) + { + newDim[0] = 3; + } } status = VSI_SUCCESS; } else // indices&output reshape { - if((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) { sizes[0] = block_size; sizes[1] = elementCnt / block_size; @@ -229,7 +267,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &block_size); CHECK_STATUS_FAIL_GOTO(status, OnError ); - if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) { if (attr[0]->dfp.fl > 0) { @@ -246,7 +284,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer) src0ZP = attr[0]->asymm.zero_point; } - if( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) + if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) { if (attr[2]->dfp.fl > 0) { @@ -375,7 +413,8 @@ static vsi_status _query_kernel vsi_nn_tensor_t* const* const inputs, vsi_nn_tensor_t* const* const outputs, vsi_nn_kernel_t* kernel, - int32_t coord_dim + int32_t coord_dim, + int32_t batch_dims ) { vsi_status status = VSI_FAILURE; @@ -396,29 +435,29 @@ static vsi_status _query_kernel output_dtype = F16; } - if(coord_dim == 1) + if (coord_dim == 1) { coord_type = _1D; } - else if(coord_dim == 2) + else if (coord_dim == 2) { coord_type = _2D; } - else if(coord_dim == 3 || coord_dim == 4) + else if (coord_dim == 3 || coord_dim == 4) { coord_type = _3D; } - key = HASH_GATHER_ND_KEY( input0_dtype, output_dtype, coord_type, 0 ); + key = HASH_GATHER_ND_KEY( input0_dtype, output_dtype, coord_type, batch_dims ); - for( i = 0; i < _cnt_of_array(gather_nd_map); i ++ ) + for ( i = 0; i < _cnt_of_array(gather_nd_map); i ++ ) { - if( gather_nd_map[i].key == key ) + if ( gather_nd_map[i].key == key ) { break; } } - if( i < _cnt_of_array(gather_nd_map) ) + if ( i < _cnt_of_array(gather_nd_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", gather_nd_map[i].function_name ); kernel->info.parameters = _gather_nd_kernel_param_def; @@ -451,29 +490,30 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t tmp_params[_GATHER_ND_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" ); int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; - status = get_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim); - status |= get_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim); - status |= get_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim); - if(status != VSI_SUCCESS) + status = get_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim, batch_dims); + status |= get_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim, batch_dims); + status |= get_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim, batch_dims); + if (status != VSI_SUCCESS) { 
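/* A failed reshape means the input, indices, or output could not be
        lowered to the 2D/3D layout the shaders expect for this batch_dims
        setting, so no node can be built. */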
return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } - status = _query_kernel( inputs, outputs, kernel, coord_dim ); - if( VSI_SUCCESS == status) + status = _query_kernel( inputs, outputs, kernel, coord_dim, batch_dims ); + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 0; /* Pass parameters to node. */ diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c index be2db5e..40e22e9 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c @@ -154,11 +154,11 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer) if ( VSI_NN_KERNEL_QUANT_DFP == output_attr[0]->quant ) { - int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl; - if (srcFixPointPos >= 0) - output_scale *= (vx_float32)((int64_t)1 << srcFixPointPos); - else if (srcFixPointPos < 0) - output_scale *= 1.0f / (vx_float32) ((int64_t)1 << - srcFixPointPos); + int8_t dstFixPointPos = (int8_t)output_attr[0]->dfp.fl; + if (dstFixPointPos >= 0) + output_scale *= (vx_float32)((int64_t)1 << dstFixPointPos); + else if (dstFixPointPos < 0) + output_scale *= 1.0f / (vx_float32) ((int64_t)1 << - dstFixPointPos); } else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr[0]->quant ) { diff --git a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c index af31e07..48af7f8 100644 --- a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c @@ -838,6 +838,16 @@ static vsi_nn_kernel_node_t _setup new_shape[3] = batch; rank = 4; } +#define LOCAL_GROUP_PIXEL_SIZE_U8 (256) + else if (new_shape[0] < LOCAL_GROUP_PIXEL_SIZE_U8 && + (new_shape[0] * new_shape[1]) % LOCAL_GROUP_PIXEL_SIZE_U8 == 0) + { + if (vsi_nn_TypeGetBits(outputs[i]->attr.dtype.vx_type) == 8) + { + new_shape[0] = LOCAL_GROUP_PIXEL_SIZE_U8; + new_shape[1] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1] / LOCAL_GROUP_PIXEL_SIZE_U8; + } + } reshape_tensor[0] = vsi_nn_reshape_tensor( graph, inputs[0], new_shape, rank ); @@ -922,6 +932,21 @@ static vsi_nn_kernel_node_t _setup goto final; } + /* a = input_scale * output_scale * alpha * mean + b = (beta - scale * mean) * output_scale + output_zp - input * alpha */ + status = _query_kernel( ikernels[MEANS_INDEX], inputs, outputs, reshape_flg, INTERNAL_KERNEL_MEANS ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + + /* dst = x * a + b */ + status = _query_kernel( kernel, inputs, outputs, reshape_flg, INTERNAL_KERNEL_NORMS ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + sums_node = vsi_nn_kernel_create_node( graph, ikernels[SUMS_INDEX] ); if (sums_node) { @@ -952,14 +977,6 @@ static vsi_nn_kernel_node_t _setup } } - /* a = input_scale * output_scale * alpha * mean - b = (beta - scale * mean) * output_scale + output_zp - input * alpha */ - status = _query_kernel( ikernels[MEANS_INDEX], inputs, outputs, reshape_flg, INTERNAL_KERNEL_MEANS ); - if ( VSI_SUCCESS != status ) - { - goto final; - } - means_node = vsi_nn_kernel_create_node( graph, ikernels[MEANS_INDEX] ); if (means_node) { @@ -988,12 +1005,6 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( 
&means_node_params[MEANS_GROUP_NUM_SCL] ); } - /* dst = x * a + b */ - status = _query_kernel( kernel, inputs, outputs, reshape_flg, INTERNAL_KERNEL_NORMS ); - if ( VSI_SUCCESS != status ) - { - goto final; - } norms_node = vsi_nn_kernel_create_node( graph, kernel ); if (norms_node) { diff --git a/src/tim/vx/internal/src/kernel/evis/l1norm_evis.c b/src/tim/vx/internal/src/kernel/evis/l1norm_evis.c new file mode 100644 index 0000000..00c31c3 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/l1norm_evis.c @@ -0,0 +1,375 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + + +#define _L1NORM_KERNEL_SOURCE_NAME(AXIS) "l1norm_axis"#AXIS + +// Add kernel hashtable here +#define L1NORM_HASH_KEY( IN_DTYPE, OUT_DTYPE, _image_2d, AXIS) \ (( IN_DTYPE << 24 ) | ( OUT_DTYPE << 16) | (_image_2d << 8) | (AXIS)) +#define L1NORM_KERNELS( IN_DTYPE, OUT_DTYPE, AXIS ) \ { L1NORM_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0 , AXIS), \ CVIVANTE_NAMESPACE("evis.l1norm_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ _L1NORM_KERNEL_SOURCE_NAME(AXIS) } + +#define L1NORM_KERNELS_2D( IN_DTYPE, OUT_DTYPE, AXIS ) \ { L1NORM_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, AXIS), \ CVIVANTE_NAMESPACE("evis.l1norm_"#IN_DTYPE"to"#OUT_DTYPE"_2D_axis"#AXIS), \ _L1NORM_KERNEL_SOURCE_NAME(AXIS) } + + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _l1norm_kernel_map[] = +{ + // Register kernel here + L1NORM_KERNELS( U8, U8, 0 ), + L1NORM_KERNELS( U8, F16, 0 ), + L1NORM_KERNELS( I8, I8, 0 ), + L1NORM_KERNELS( I8, F16, 0 ), + L1NORM_KERNELS( I16, I16, 0 ), + L1NORM_KERNELS( I16, F16, 0 ), + L1NORM_KERNELS( F16, F16, 0 ), + L1NORM_KERNELS( F16, U8, 0 ), + L1NORM_KERNELS( F16, I8, 0 ), + L1NORM_KERNELS( F16, I16, 0 ), + + L1NORM_KERNELS( U8, U8, 1 ), + L1NORM_KERNELS( U8, F16, 1 ), + L1NORM_KERNELS( I8, I8, 1 ), + L1NORM_KERNELS( I8, F16, 1 ), + 
L1NORM_KERNELS( I16, I16, 1 ), + L1NORM_KERNELS( I16, F16, 1 ), + L1NORM_KERNELS( F16, F16, 1 ), + L1NORM_KERNELS( F16, U8, 1 ), + L1NORM_KERNELS( F16, I8, 1 ), + L1NORM_KERNELS( F16, I16, 1 ), + + L1NORM_KERNELS( U8, U8, 2 ), + L1NORM_KERNELS( U8, F16, 2 ), + L1NORM_KERNELS( I8, I8, 2 ), + L1NORM_KERNELS( I8, F16, 2 ), + L1NORM_KERNELS( I16, I16, 2 ), + L1NORM_KERNELS( I16, F16, 2 ), + L1NORM_KERNELS( F16, F16, 2 ), + L1NORM_KERNELS( F16, U8, 2 ), + L1NORM_KERNELS( F16, I8, 2 ), + L1NORM_KERNELS( F16, I16, 2 ), + + L1NORM_KERNELS_2D( U8, U8, 0 ), + L1NORM_KERNELS_2D( U8, F16, 0 ), + L1NORM_KERNELS_2D( I8, I8, 0 ), + L1NORM_KERNELS_2D( I8, F16, 0 ), + L1NORM_KERNELS_2D( I16, I16, 0 ), + L1NORM_KERNELS_2D( I16, F16, 0 ), + L1NORM_KERNELS_2D( F16, F16, 0 ), + L1NORM_KERNELS_2D( F16, U8, 0 ), + L1NORM_KERNELS_2D( F16, I8, 0 ), + L1NORM_KERNELS_2D( F16, I16, 0 ), + + L1NORM_KERNELS_2D( U8, U8, 1 ), + L1NORM_KERNELS_2D( U8, F16, 1 ), + L1NORM_KERNELS_2D( I8, I8, 1 ), + L1NORM_KERNELS_2D( I8, F16, 1 ), + L1NORM_KERNELS_2D( I16, I16, 1 ), + L1NORM_KERNELS_2D( I16, F16, 1 ), + L1NORM_KERNELS_2D( F16, F16, 1 ), + L1NORM_KERNELS_2D( F16, U8, 1 ), + L1NORM_KERNELS_2D( F16, I8, 1 ), + L1NORM_KERNELS_2D( F16, I16, 1 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _l1norm_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} + + // Add kererl parameters here +}; +#define _L1NORM_PARAM_NUM _cnt_of_array( _l1norm_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_l1norm_initializer_axis) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vx_tensor output = (vx_tensor)param[1]; + vx_int32 axis = 0; + vx_int32 dim = 0; + vx_int32 width = 0; + vx_int32 height = 0; + vx_int32 depth = 0; + + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_size_array_t *output_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis); + + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_shape = output_attr->shape; + dim = output_shape->size < 3 ? 2 : 3; + width = (vx_int32)output_shape->data[0]; + height = (vx_int32)output_shape->data[1]; + depth = dim < 3 ? 
1 : (vx_int32)output_shape->data[2]; + + gpu_param.dim = 2; + if (axis == 0) + { + gpu_param.global_scale[0] = 1; + } + else + { + gpu_param.global_scale[0] = 8; + } + gpu_param.global_scale[1] = 1; + + if (axis == 0) + { + gpu_param.global_size[0] = height; + gpu_param.global_size[1] = depth; + } + else if (axis == 1) + { + gpu_param.global_size[0] = (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0];; + gpu_param.global_size[1] = depth; + } + else if (axis == 2) + { + gpu_param.global_size[0] = (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0];; + gpu_param.global_size[1] = height; + } + + { + gpu_dp_inst_t ExtractBin_part0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t ExtractBin_part1_4x4= {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtract8Bin_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + status = vsi_nn_kernel_gpu_add_param( node,"uniExtract8Bin_2x8", &uniExtract8Bin_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node,"ExtractBin_part0_4x4", &ExtractBin_part0_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node,"ExtractBin_part1_4x4", &ExtractBin_part1_4x4 ); + } + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + + return status; +} /* _l1norm_initializer_axis() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d, + int32_t axis + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _l1norm_kernel_map; + size_t kernel_map_size = _cnt_of_array( _l1norm_kernel_map ); + vx_param_description_t * param_def = _l1norm_kernel_param_def; + vx_kernel_initialize_f initializer = _l1norm_initializer_axis; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = L1NORM_HASH_KEY( in_dtype, out_dtype, image_2d, axis); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _l1norm_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( 
kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_L1NORM_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis"); + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputZp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + int32_t axisSize = (int32_t)outputs[0]->attr.size[axis]; + + outputScale = 1.0f / outputScale; + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (outputs[0]->attr.dim_num == 2); + + status = _query_kernel( kernel, inputs, outputs, image_2d, axis ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + uint32_t index = 2; + + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.S32 = (int32_t)inputZp; + status |= vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + + vsi_nn_kernel_node_pack_io( node_params, _L1NORM_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputZp ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputTail ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axisSize ); + /* Pass parameters to node. 
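Scalar params occupy indices 2..6 (inputZp, the reciprocal of the output scale, outputTail, axis, axisSize) and are released below once the node has taken them. 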
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _L1NORM_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( l1norm, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c index e525f5e..966a6cd 100644 --- a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c @@ -70,12 +70,12 @@ __BEGIN_DECLS HASH_LAYERNORM_SH_KERNEL_2D_NAME(IN0_TYPE, SCALE_TYPE, OUT_TYPE), \ SOURCE }, -#define TENSOR_LAYERNORM_SCALE_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ +#define TENSOR_LAYERNORM_SCALE_KERNELS(IN0_TYPE, SCALE_TYPE, OUT_TYPE, SOURCE) \ { HASH_LAYERNORM_KEY(IN0_TYPE, F32, OUT_TYPE, 0), \ HASH_LAYERNORM_SH_KERNEL_SCALE_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, -#define TENSOR_LAYERNORM_SCALE_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ +#define TENSOR_LAYERNORM_SCALE_KERNELS_2D(IN0_TYPE, SCALE_TYPE, OUT_TYPE, SOURCE) \ { HASH_LAYERNORM_KEY(IN0_TYPE, F32, OUT_TYPE, 1), \ HASH_LAYERNORM_SH_KERNEL_SCALE_2D_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, @@ -152,8 +152,8 @@ static const _kernel_map_type _layernorm_kernel_map[] = LAYERNORM_KERNELS_3D( I8, F32, F16, SOURCE_AXIS0_2 ) LAYERNORM_KERNELS_2D( I8, F32, F16, SOURCE_AXIS0_2 ) - LAYERNORM_KERNELS_3D( BF16, F32, BF16, SOURCE_AXIS0_3 ) - LAYERNORM_KERNELS_2D( BF16, F32, BF16, SOURCE_AXIS0_3 ) + TENSOR_LAYERNORM_SCALE_KERNELS( BF16, F32, BF16, SOURCE_AXIS0_3 ) + TENSOR_LAYERNORM_SCALE_KERNELS_2D( BF16, F32, BF16, SOURCE_AXIS0_3 ) }; static const _kernel_map_type _layernorm_axis01_kernel_map[] = diff --git a/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c b/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c index 69c0434..890f7bc 100644 --- a/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c @@ -35,7 +35,7 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" __BEGIN_DECLS @@ -221,28 +221,52 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_LOGICAL_NOT_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t new_rank = 0; + vsi_bool ret = FALSE; - if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + ret = vsi_nn_kernel_optimize_element_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + shape, &new_rank ); + + if ( ret ) { - return NULL; + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shape, new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shape, new_rank ); + } + else + { + goto final; + } + + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[1]->attr.size, + reshape_tensors[1]->attr.dim_num ) ) + { + goto final; } image_2d = (outputs[0]->attr.dim_num == 2 || outputs[0]->attr.size[2] == 1); - status = _query_kernel( kernel, inputs, outputs, image_2d); - if( VSI_SUCCESS == status) + status = _query_kernel( kernel, &reshape_tensors[0], 
&reshape_tensors[1], image_2d); + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { /* Set inputs and outputs */ vsi_nn_kernel_node_pack_io( node_params, _LOGICAL_NOT_PARAM_NUM, - inputs, input_num, outputs, output_num ); + &reshape_tensors[0], input_num, &reshape_tensors[1], output_num ); /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, _LOGICAL_NOT_PARAM_NUM ); } } +final: + vsi_safe_release_tensor( reshape_tensors[0] ); + vsi_safe_release_tensor( reshape_tensors[1] ); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c b/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c index 6a323c0..7e5476b 100644 --- a/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c @@ -35,7 +35,7 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_eltwise.h" __BEGIN_DECLS @@ -286,30 +286,75 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_LOGICAL_OPS_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t new_rank = 0; + vsi_bool ret = FALSE; uint32_t ops_type = vsi_nn_kernel_param_get_int32( params, "ops_type" ); - if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + ret = vsi_nn_kernel_optimize_eltwise_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + if ( ret ) { - return NULL; + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], new_rank ); + +#define _swap_tensor(a, b, tmp) \ + do { \ + tmp = a; \ + a = b; \ + b = tmp; \ + } while(0) + + if (shapes[1][3] > shapes[0][3] && new_rank == 4) + { + vsi_nn_tensor_t* reshape_tmp; + _swap_tensor(reshape_tensors[0], reshape_tensors[1], reshape_tmp); + } + +#undef _swap_tensor + } + else + { + goto final; } - image_2d = (outputs[0]->attr.dim_num == 2); + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size, + reshape_tensors[2]->attr.dim_num ) ) + { + goto final; + } - status = _query_kernel( kernel, inputs, outputs, image_2d, (vsi_nn_logical_ops_type_t)ops_type); + image_2d = (reshape_tensors[2]->attr.dim_num == 2 || reshape_tensors[2]->attr.size[2] == 1); - if( VSI_SUCCESS == status) + status = _query_kernel( kernel, reshape_tensors, &reshape_tensors[2], + image_2d, (vsi_nn_logical_ops_type_t)ops_type); + + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { /* Pass parameters to node. 
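The IO array is packed with the reshaped tensors, so the kernel sees the collapsed element-wise layout (and the swapped operand order, when broadcasting required it) rather than the original ranks. 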
*/ vsi_nn_kernel_node_pack_io( node_params, _LOGICAL_OPS_PARAM_NUM, - inputs, input_num, outputs, output_num ); + reshape_tensors, input_num, &reshape_tensors[2], output_num ); status = vsi_nn_kernel_node_pass_param( node, node_params, _LOGICAL_OPS_PARAM_NUM ); } } +final: + vsi_safe_release_tensor( reshape_tensors[0] ); + vsi_safe_release_tensor( reshape_tensors[1] ); + vsi_safe_release_tensor( reshape_tensors[2] ); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c index f94f94b..a99acc6 100644 --- a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c @@ -992,7 +992,7 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer) int32_t _is_cifg = 0; int32_t _is_hybrid = 0; vsi_nn_kernel_tensor_attr_t* input_attr[9] = {NULL}; - vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL};; + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL}; status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[param_size - 5], &_is_ln ); CHECK_STATUS_FAIL_GOTO(status, final ); diff --git a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c index 8157779..6e4ee41 100644 --- a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c @@ -57,6 +57,7 @@ __BEGIN_DECLS #define KERNEL_SOURCE_13 "matrixmul_i16" #define KERNEL_SOURCE_14 "matrixmul_f16i16_i16" #define KERNEL_SOURCE_15 "matrixmul_bf16" +#define KERNEL_SOURCE_16 "matrixmul_u8i16_i16" #define HASH_MATRIX_MUL_KEY(_input0_type, _input1_type, _output_type, _trans_a, _trans_b) \ ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_trans_a << 4) | (_trans_b)) @@ -116,6 +117,7 @@ static const struct { TENSOR_MATRIX_MUL_KERNELS(F16, F16, I8, KERNEL_SOURCE_11) TENSOR_MATRIX_MUL_KERNELS(F16, F16, I16, KERNEL_SOURCE_11) TENSOR_MATRIX_MUL_KERNELS(F32, F32, F32, KERNEL_SOURCE_2) + TENSOR_MATRIX_MUL_KERNELS(U8, I16, I16, KERNEL_SOURCE_16) TENSOR_MATRIX_MUL_TRANSB_KERNELS(F16, F16, F16, KERNEL_SOURCE_3) TENSOR_MATRIX_MUL_TRANSB_KERNELS(F16, U8, F16, KERNEL_SOURCE_4) TENSOR_MATRIX_MUL_TRANSB_KERNELS(F16, U8, U8, KERNEL_SOURCE_4) @@ -123,6 +125,7 @@ static const struct { TENSOR_MATRIX_MUL_TRANSB_KERNELS(U8, U8, U8, KERNEL_SOURCE_5) TENSOR_MATRIX_MUL_TRANSB_KERNELS(I16, I16, I16, KERNEL_SOURCE_13) TENSOR_MATRIX_MUL_TRANSB_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15) + TENSOR_MATRIX_MUL_TRANSB_KERNELS(U8, I16, I16, KERNEL_SOURCE_16) TENSOR_MATRIX_MUL_TRANSA_KERNELS(U8, U8, U8, KERNEL_SOURCE_7) TENSOR_MATRIX_MUL_TRANSA_KERNELS(I8, I8, I8, KERNEL_SOURCE_7) TENSOR_MATRIX_MUL_TRANSA_KERNELS(I16, I16, I16, KERNEL_SOURCE_7) @@ -131,6 +134,7 @@ static const struct { TENSOR_MATRIX_MUL_TRANSA_KERNELS(I16, F16, I16, KERNEL_SOURCE_7) TENSOR_MATRIX_MUL_TRANSA_KERNELS(F16, F16, F16, KERNEL_SOURCE_7) TENSOR_MATRIX_MUL_TRANSA_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15) + TENSOR_MATRIX_MUL_TRANSA_KERNELS(U8, I16, I16, KERNEL_SOURCE_7) }; /* @@ -883,6 +887,7 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) case _PACK_SELECT_KEY( U8, F16, U8, 1, 0, 0 ): case _PACK_SELECT_KEY( I8, F16, I8, 1, 0, 0 ): case _PACK_SELECT_KEY( I16, F16, I16, 1, 0, 0 ): + case _PACK_SELECT_KEY( U8, I16, I16, 1, 0, 0 ): case _PACK_SELECT_KEY( U8, U8, U8, 1, 0, 1 ): case _PACK_SELECT_KEY( I8, I8, I8, 1, 0, 1 ): case _PACK_SELECT_KEY( I16, I16, I16, 1, 0, 1 ): @@ -890,6 +895,7 @@ 
DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) case _PACK_SELECT_KEY( U8, F16, U8, 1, 0, 1 ): case _PACK_SELECT_KEY( I8, F16, I8, 1, 0, 1 ): case _PACK_SELECT_KEY( I16, F16, I16, 1, 0, 1 ): + case _PACK_SELECT_KEY( U8, I16, I16, 1, 0, 1 ): { status = vsi_nn_kernel_gpu_add_param( node, "uniConvertUint8SubZpToFp32_4x4", &uniConvertUint8SubZpToFp32_4x4 ); @@ -1041,6 +1047,24 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( U8, I16, I16, 0, 0, 0 ): + case _PACK_SELECT_KEY( U8, I16, I16, 0, 0, 1 ): + case _PACK_SELECT_KEY( U8, I16, I16, 0, 1, 0 ): + case _PACK_SELECT_KEY( U8, I16, I16, 0, 1, 1 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertUint8SubZpToFp32_4x4", &uniConvertUint8SubZpToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertUint8SubZpToFp32B_4x4", &uniConvertUint8SubZpToFp32B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input0_ZP", &src0ZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "input1_ZP", &src1ZP ); + status |= vsi_nn_kernel_gpu_add_param( node, "outputScale", &reScaleOut ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; default: break; } diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c index 2c529ce..fe39a5c 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c @@ -111,6 +111,7 @@ static vx_param_description_t vxPreProcessNv12Kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _EVIS_PRE_PROCESS_NV12_PARAM_NUM _cnt_of_array(vxPreProcessNv12Kernel_param_def) @@ -631,6 +632,7 @@ static vsi_nn_kernel_node_t _setup float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + int32_t nv_type = vsi_nn_kernel_param_get_int32( params, "nv_type" ); /* Pass parameters to node. 
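The new U8/I16 -> I16 matmul support above touches three places: a new kernel source (matrixmul_u8i16_i16), entries in the plain/transA/transB tables, and extra cases in the initializer switch. The lookup stays cheap because HASH_MATRIX_MUL_KEY packs the three dtype codes and both transpose flags into a single 32-bit key; a self-contained sketch of that scheme (the table contents here are hypothetical):

#include <stddef.h>
#include <stdint.h>

/* Same packing as HASH_MATRIX_MUL_KEY above: one integer compare
 * per table entry selects the shader for a dtype/transpose combo. */
#define MATRIX_MUL_KEY(in0, in1, out, ta, tb) \
    (((uint32_t)(in0) << 24) | ((uint32_t)(in1) << 16) | \
     ((uint32_t)(out) << 8) | ((uint32_t)(ta) << 4) | (uint32_t)(tb))

typedef struct
{
    uint32_t key;
    const char* source; /* e.g. "matrixmul_u8i16_i16" */
} kernel_entry;

static const char* find_kernel_source(const kernel_entry* map, size_t n,
                                      uint32_t key)
{
    size_t i;
    for (i = 0; i < n; i++)
    {
        if (map[i].key == key) return map[i].source;
    }
    return NULL; /* no specialized kernel registered for this combination */
}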
*/ vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM, @@ -646,6 +648,7 @@ static vsi_nn_kernel_node_t _setup tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &nv_type ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_NV12_PARAM_NUM ); CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &tmp_params[3] ); @@ -658,6 +661,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[10] ); vsi_nn_kernel_scalar_release( &tmp_params[11] ); vsi_nn_kernel_scalar_release( &tmp_params[12] ); + vsi_nn_kernel_scalar_release( &tmp_params[13] ); } } vsi_safe_release_tensor(reshape_tensors[0]); diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c index ca76dfe..ca397de 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c @@ -195,8 +195,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer) 0x0a0a0a0a, // BSelt 0x00000000, 0x00000000, // BBin 0x00000100, // AccumType, ConstantType, and PostShift - 0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000, - 0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000 // Constant + 0x40083ca7, 0x00000000, 0x40083ca7, 0x00000000, + 0x40083ca7, 0x00000000, 0x40083ca7, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; gpu_dp_inst_t uniConvertYUV422toG_4x4 = {{ 0x29292929, // TCfg @@ -205,8 +205,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer) 0x2a2a2a2a, // BSelt 0x00000000, 0x00000000, // BBin 0x00000100, // AccumType, ConstantType, and PostShift - 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc, - 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant + 0x36453ca7, 0x00003a81, 0x36453ca7, 0x00003a81, + 0x36453ca7, 0x00003a81, 0x36453ca7, 0x00003a81 // Constant }, GPU_DP_TYPE_16 }; gpu_dp_inst_t uniConvertYUV422toR_4x4 = {{ 0x05050505, // TCfg @@ -215,18 +215,18 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer) 0x0a0a0a0a, // BSelt 0x00000000, 0x00000000, // BBin 0x00000100, // AccumType, ConstantType, and PostShift - 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, - 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant + 0x3e623ca7, 0x00000000, 0x3e623ca7, 0x00000000, + 0x3e623ca7, 0x00000000, 0x3e623ca7, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniExtractUVtoCharSub128_2x8 = {{ - 0x91919191, // TCfg - 0x40404040, // ASelt - 0x03020100, 0x07060504, // ABin - 0xa2a2a2a2, // BSelt + gpu_dp_inst_t uniExtractYUVtoShortSub_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03120110, 0x07160514, // ABin + 0xaaaaaaaa, // BSelt 0x00000000, 0x00000000, // BBin 0x00000700, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00010001, 0x00000001, 0x00010001, - 0x00000001, 0x00010001, 0x00000001, 0x00010001 // Constant + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant }, GPU_DP_TYPE_16 }; gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ 0x11111111, // TCfg @@ -244,7 +244,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toR_4x4", &uniConvertYUV422toR_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", 
&reorder); status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractYUVtoShortSub_2x8", &uniExtractYUVtoShortSub_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar); status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); @@ -386,8 +386,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer) 0x0a0a0a0a, // BSelt 0x00000000, 0x00000000, // BBin 0x00000100, // AccumType, ConstantType, and PostShift - 0x3f323c00, 0x00000000, 0x3f323c00, 0x00000000, - 0x3f323c00, 0x00000000, 0x3f323c00, 0x00000000 // Constant + 0x40083ca7, 0x00000000, 0x40083ca7, 0x00000000, + 0x40083ca7, 0x00000000, 0x40083ca7, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; gpu_dp_inst_t uniConvertYUV422toG_4x4 = {{ 0x29292929, // TCfg @@ -396,18 +396,18 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer) 0x2a2a2a2a, // BSelt 0x00000000, 0x00000000, // BBin 0x00000100, // AccumType, ConstantType, and PostShift - 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc, - 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant + 0x36453ca7, 0x00003a81, 0x36453ca7, 0x00003a81, + 0x36453ca7, 0x00003a81, 0x36453ca7, 0x00003a81 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertYUV422toR_4x4 = {{ + gpu_dp_inst_t uniConvertYUV422toR_4x4 = { { 0x05050505, // TCfg 0x04040404, // ASelt 0x00510040, 0x00730062, // ABin 0x0a0a0a0a, // BSelt 0x00000000, 0x00000000, // BBin 0x00000100, // AccumType, ConstantType, and PostShift - 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, - 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant + 0x3e623ca7, 0x00000000, 0x3e623ca7, 0x00000000, + 0x3e623ca7, 0x00000000, 0x3e623ca7, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; gpu_dp_inst_t uniExtractUVtoCharSub128_2x8 = {{ 0x99999999, // TCfg @@ -419,6 +419,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer) 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractYtoShortSub16_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ 0x11111111, // TCfg 0x11110000, // ASelt @@ -440,6 +450,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractYtoShortSub16_4x4", &uniExtractYtoShortSub16_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); CHECK_STATUS_FAIL_GOTO(status, OnError ); diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c index 2ccc607..9876ebc 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c +++ 
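The constant churn in the yuv422 initializers above is a coefficient change, not noise: each 16-bit half of a DP constant is an IEEE fp16 value, and the new words decode to the BT.601 studio-swing YUV-to-RGB coefficients (about 1.164, 1.596, 2.017, 0.392, 0.813) in place of the previous full-range set (1.0, ~1.40, ~1.78, ~0.35, ~0.72). A small decoder to sanity-check such constants (normal fp16 values only, which covers everything used here):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Decode a normal IEEE fp16 bit pattern; subnormals/inf/NaN are out
 * of scope for this check. */
static float half_to_float(uint16_t h)
{
    int exp = (h >> 10) & 0x1f;
    int mant = h & 0x3ff;
    float v = ldexpf(1.0f + (float)mant / 1024.0f, exp - 15);
    return (h & 0x8000) ? -v : v;
}

int main(void)
{
    /* New constants: ~1.163 (Y gain), ~1.596 (Cr->R), ~2.016 (Cb->B),
     * ~0.392 / ~0.813 (Cb/Cr -> G): BT.601 studio-swing coefficients. */
    printf("%.4f %.4f %.4f %.4f %.4f\n",
           half_to_float(0x3ca7), half_to_float(0x3e62),
           half_to_float(0x4008), half_to_float(0x3645),
           half_to_float(0x3a81));
    return 0;
}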
b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c @@ -113,7 +113,7 @@ static vsi_status get_scatter_nd_tensor_reshape_size uint32_t i = 0; vsi_size_t elementCnt = 1; - if(coordDim != 0 && (width == NULL || area == NULL)) + if (coordDim != 0 && (width == NULL || area == NULL)) { return status; } @@ -121,12 +121,12 @@ static vsi_status get_scatter_nd_tensor_reshape_size #define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; - for(i = 0; i < dims_num; ++i) + for (i = 0; i < dims_num; ++i) { elementCnt *= input_size[i]; } - for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) { sizes[i] = 1; } @@ -135,22 +135,22 @@ static vsi_status get_scatter_nd_tensor_reshape_size sizes[1] = elementCnt / block_size; newDim[0] = 2; - if((elementCnt / block_size) >= VSI_NN_MAX_IMAGE_WIDTH) + if ((elementCnt / block_size) >= VSI_NN_MAX_IMAGE_WIDTH) { isBig[0] |= 1; } - if(coordDim == 1) // index shape + if (coordDim == 1) // index shape { *width = 0; *area = 0; } - else if(coordDim == 2) + else if (coordDim == 2) { *width = input_size[dims_num - 2]; *area = 0; } - else if(coordDim == 3) + else if (coordDim == 3) { *width = input_size[dims_num - 3]; *area = input_size[dims_num - 3] * input_size[dims_num - 2]; @@ -211,19 +211,19 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_initializer) output_zp = attr[2]->asymm.zero_point; } - if(coord_dim == 3) + if (coord_dim == 3) { offsetX = area; offsetY = width; offsetZ = 1; } - else if(coord_dim == 2) + else if (coord_dim == 2) { offsetX = width; offsetY = 1; offsetZ = 0; } - else if(coord_dim == 1) + else if (coord_dim == 1) { offsetX = 1; offsetY = 0; @@ -368,19 +368,19 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_big_initializer) output_zp = attr[2]->asymm.zero_point; } - if(coord_dim == 3) + if (coord_dim == 3) { offsetX = area; offsetY = width; offsetZ = 1; } - else if(coord_dim == 2) + else if (coord_dim == 2) { offsetX = width; offsetY = 1; offsetZ = 0; } - else if(coord_dim == 1) + else if (coord_dim == 1) { offsetX = 1; offsetY = 0; @@ -464,19 +464,19 @@ static vsi_status _query_kernel key = HASH_SCATTER_ND_KEY( input1_dtype, output_dtype, 0, isBig ); - for( i = 0; i < _cnt_of_array(scatter_nd_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_map); i ++ ) { - if( scatter_nd_map[i].key == key ) + if ( scatter_nd_map[i].key == key ) { break; } } - if( i < _cnt_of_array(scatter_nd_map) ) + if ( i < _cnt_of_array(scatter_nd_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_map[i].function_name ); kernel->info.parameters = _scatter_nd_kernel_param_def; kernel->info.numParams = _cnt_of_array( _scatter_nd_kernel_param_def ); - if(isBig) + if (isBig) { kernel->info.initialize = _scatter_nd_big_initializer; } @@ -517,22 +517,27 @@ static vsi_nn_kernel_node_t _setup vsi_size_t width = 0, area = 0; int32_t big_flg = 0; + if (coord_dim > 3) + { + return NULL; + } + status = get_scatter_nd_tensor_reshape_size(&inputs[0], shapes[0], coord_dim, 0, NULL, NULL, &rs_idx_dim, &big_flg); status |= get_scatter_nd_tensor_reshape_size(&inputs[1], shapes[1], block_size, 0, NULL, NULL, &rs_in_dim, &big_flg); status |= get_scatter_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim, &width, &area, &rs_out_dim, &big_flg); - if(status != VSI_SUCCESS) + if (status != VSI_SUCCESS) { return NULL; } status = _query_kernel( inputs, outputs, kernel, coord_dim, big_flg); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { 
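For the scatter_nd initializers above, the index tuple is flattened against strides derived from the reshaped output: width strides the second-innermost indexed axis and area the third, which is also why the setup now rejects coord_dim > 3 before doing any work. A stand-alone model of that mapping (hypothetical helper):

#include <stdint.h>

/* Hypothetical stand-alone model of the stride setup above: the index
 * tuple (i0, i1, i2) addresses blocks of the flattened output, with
 * strides chosen from `width` and `area` according to coord_dim. */
static int64_t scatter_block_offset(const int32_t* coord, int32_t coord_dim,
                                    int64_t width, int64_t area)
{
    int64_t offsetX = 0, offsetY = 0, offsetZ = 0;

    switch (coord_dim)
    {
        case 3: offsetX = area;  offsetY = width; offsetZ = 1; break;
        case 2: offsetX = width; offsetY = 1;                  break;
        case 1: offsetX = 1;                                   break;
        default: return -1; /* this EVIS path rejects coord_dim > 3 */
    }
    return coord[0] * offsetX
         + (coord_dim > 1 ? coord[1] * offsetY : 0)
         + (coord_dim > 2 ? coord[2] * offsetZ : 0);
}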
uint32_t index = 0; /* Pass parameters to node. */ diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c index 957a666..e9d6d5d 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c @@ -44,6 +44,7 @@ __BEGIN_DECLS #define KERNEL_SOURCE_1 "scatter_nd_update" #define KERNEL_SOURCE_2 "scatter_nd_update_big" #define KERNEL_SOURCE_3 "scatter_nd_update_atom" +#define KERNEL_SOURCE_4 "scatter_nd_update_special" #define HASH_SCATTER_ND_UPDATE_KEY(_input0_type, _input2_type, _output_type, _pre_op, _large_type) \ ((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | (_pre_op << 4) | (_large_type)) @@ -60,6 +61,15 @@ __BEGIN_DECLS #define HASH_SCATTER_ND_UPDATE_SH_KERNEL_RESET_NAME() \ CVIVANTE_NAMESPACE("evis.scatter_nd_update_reset") +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_ref2out_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_NAME(SRC2_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_update2ref_"#SRC2_TYPE"to"#DST_TYPE"_16x") + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_COPY_NAME(DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_cpy2out_"#DST_TYPE"to"#DST_TYPE) + #define TENSOR_SCATTER_ND_UPDATE_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 0, 0), \ HASH_SCATTER_ND_UPDATE_SH_KERNEL_NAME(IN0_TYPE, IN2_TYPE, OUT_TYPE), \ @@ -80,6 +90,21 @@ __BEGIN_DECLS HASH_SCATTER_ND_UPDATE_SH_KERNEL_RESET_NAME(), \ SOURCE }, +#define TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 3, 1), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_REF_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 4, 1), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_UPDATE_NAME(IN2_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 5, 1), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_COPY_NAME(IN0_TYPE), \ + SOURCE }, + typedef struct { uint32_t key; @@ -124,6 +149,24 @@ static const _kernel_map_type scatter_nd_update_post_map[] = TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(I16, I32, I16, I16, KERNEL_SOURCE_3) }; +static const _kernel_map_type scatter_nd_update_ref_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_REF_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) +}; + +static const _kernel_map_type scatter_nd_update_update_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_UPDATE_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) +}; + +static const _kernel_map_type scatter_nd_update_copy_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(U8, I32, U8, U8, KERNEL_SOURCE_4) + TENSOR_SCATTER_ND_UPDATE_COPY_KERNELS(I8, I32, I8, I8, KERNEL_SOURCE_4) +}; + /* * Kernel params */ @@ -178,10 +221,43 @@ static vx_param_description_t _scatter_nd_update_post_kernel_param_def[] = // Add kererl parameters here }; +static vx_param_description_t _scatter_nd_update_ref_kernel_param_def[] = +{ + {VX_INPUT, 
VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kernel parameters here +}; + +static vx_param_description_t _scatter_nd_update_update_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kernel parameters here +}; + +static vx_param_description_t _scatter_nd_update_copy_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kernel parameters here +}; + #define _SCATTER_ND_UPDATE_PARAM_NUM _cnt_of_array( _scatter_nd_update_kernel_param_def ) #define _SCATTER_ND_UPDATE_PRE_PARAM_NUM _cnt_of_array( _scatter_nd_update_pre_kernel_param_def ) #define _SCATTER_ND_UPDATE_POST_PARAM_NUM _cnt_of_array( _scatter_nd_update_post_kernel_param_def ) #define _SCATTER_ND_UPDATE_RESET_PARAM_NUM _cnt_of_array( _scatter_nd_update_reset_kernel_param_def ) +#define _SCATTER_ND_UPDATE_REF_PARAM_NUM _cnt_of_array( _scatter_nd_update_ref_kernel_param_def ) +#define _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM _cnt_of_array( _scatter_nd_update_update_kernel_param_def ) +#define _SCATTER_ND_UPDATE_COPY_PARAM_NUM _cnt_of_array( _scatter_nd_update_copy_kernel_param_def ) static vsi_status get_scatter_nd_update_tensor_reshape_size ( @@ -210,12 +286,12 @@ static vsi_status get_scatter_nd_update_tensor_reshape_size #define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; - for(i = 0; i < dims_num; ++i) + for (i = 0; i < dims_num; ++i) { elementCnt *= input_size[i]; } - for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) { sizes[i] = 1; } @@ -261,6 +337,124 @@ static vsi_status get_scatter_nd_update_tensor_reshape_size return VSI_SUCCESS; } /* _get_EltOP_tensor_reshape_size */ +static vsi_status check_scatter_nd_update_index_repeat + ( + vsi_nn_tensor_t ** inputs, + int32_t coord_dim, + int32_t block_size, + int32_t indices_num, + int32_t* isRepeat + ) +{ + vsi_status status = VSI_FAILURE; + int32_t i = 0, j = 0; + vsi_size_t elementNum = 1; + vsi_nn_kernel_tensor_t ref_tensor = (vsi_nn_kernel_tensor_t)inputs[0]->t; + vsi_nn_kernel_tensor_t index_tensor = (vsi_nn_kernel_tensor_t)inputs[1]->t; + vsi_nn_kernel_tensor_attr_t* attr[2] = { NULL }; + uint32_t* index_buffer[1] = { NULL }; + int32_t* mask_buffer = NULL; + int32_t mask_len = 0; + + if (inputs[1]->attr.is_const == FALSE) + { + isRepeat[0] = 1; + return VSI_SUCCESS; + } + + attr[0] = vsi_nn_kernel_tensor_attr_create( ref_tensor ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + + attr[1] = vsi_nn_kernel_tensor_attr_create( index_tensor ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + elementNum = vsi_nn_kernel_tensor_attr_get_size( attr[0] ); + mask_len = (int32_t)elementNum / block_size; + mask_buffer = (int32_t*)malloc(mask_len * sizeof(int32_t)); + CHECK_PTR_FAIL_GOTO( mask_buffer, "Create 
mask buffer fail.", final ); + memset(mask_buffer, 0, mask_len * sizeof(int32_t)); + + index_buffer[0] = (uint32_t*)vsi_nn_kernel_tensor_create_buffer( index_tensor, attr[1], FALSE ); + CHECK_PTR_FAIL_GOTO( index_buffer[0], "Create index buffer fail.", final ); + + if (coord_dim <= 5) + { + vsi_ssize_t stride[5] = {0, 0, 0, 0, 0}; + vsi_ssize_t new_shape[5] = {1, 1, 1, 1, 1}; + vsi_ssize_t merge_dim = (vsi_ssize_t)attr[0]->shape->size - coord_dim + 1; + + for (i = 0; i < (int32_t)merge_dim; ++i) + { + new_shape[0] *= attr[0]->shape->data[i]; + } + stride[0] = new_shape[0] / block_size; + + for (i = 1; i < coord_dim; ++i) + { + new_shape[i] = attr[0]->shape->data[merge_dim + i - 1]; + + stride[i] = stride[i - 1] * new_shape[i]; + } + + for (i = 0; i < indices_num; i++) + { + uint32_t coord[5] = {0}; + int32_t byd_flg = 0; + vsi_ssize_t mask_idx = 0; + + for (j = 0; j < coord_dim; j++) + { + coord[j] = index_buffer[0][i * coord_dim + coord_dim - j - 1]; + if (coord[j] >= (uint32_t)new_shape[j]) + { + byd_flg = 1; + break; + } + } + if (byd_flg) + { + continue; + } + + mask_idx = coord[4] * stride[3] + coord[3] * stride[2] + + coord[2] * stride[1] + coord[1] * stride[0] + coord[0]; + if (mask_buffer[mask_idx] == 0) + { + mask_buffer[mask_idx] = 1; + } + else if (mask_buffer[mask_idx] > 0) + { + isRepeat[0] = 1; + status = VSI_FAILURE; + CHECK_STATUS_FAIL_GOTO( status, final ); + } + } + } + else + { + status = VSI_FAILURE; + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + if ( index_buffer[0] ) + { + free( index_buffer[0] ); + } + + if ( mask_buffer ) + { + free( mask_buffer ); + } + + for ( i = 0; i < 2; i ++ ) + { + if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + + return VSI_SUCCESS; +} /* check_scatter_nd_update_index_repeat */ + /* * Kernel initializer */ @@ -1185,6 +1379,393 @@ OnError: return status; } /* _scatter_nd_update_reset_initializer() */ +DEF_KERNEL_INITIALIZER(_scatter_nd_update_ref_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + int32_t block_size = 1; + int32_t width = 0; + int32_t height = 0; + + int32_t input0_zp = 0; + float input0_scale = 1.0f; + int32_t output_zp = 0; + float output_scale = 1.0f; + + uint32_t pack_key = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + block_size = (int32_t)(attr[0]->shape->data[0]); + height = (int32_t)(attr[0]->shape->data[1]); + width = (int32_t)(block_size * height); + if (attr[0]->dtype == F16 || attr[0]->dtype == I16 || attr[0]->dtype == U16) + { + width = (width + 7) / 8; + } + else if (attr[0]->dtype == U8 || attr[0]->dtype == I8) + { + width = (width + 15) / 16; + } + + input0_zp = attr[0]->asymm.zero_point; + input0_scale = attr[0]->asymm.scale; + output_zp = attr[1]->asymm.zero_point; + output_scale = 1.0f / attr[1]->asymm.scale; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = width; + gpu_param.global_size[1] = 1; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, 
&gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ + (IN0_TYPE | ( OUT_TYPE << 16)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype ); + + switch( pack_key ) + { + case _PACK_SELECT_KEY( I8, I8 ): + case _PACK_SELECT_KEY( U8, U8 ): + { + uint16_t M0 = 0; + int32_t postShift0 = 0; + uint32_t multAndoutZP0[2] = {0}; + + gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniU8MulAndPostShift_Hi_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x1b1a1918, 0x1f1e1d1c, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_quantize_multiplier_16bit( (double)input0_scale * output_scale, &M0, &postShift0); + + multAndoutZP0[0] = (uint32_t)(M0); + multAndoutZP0[1] = (uint32_t)((output_zp << postShift0) - input0_zp * M0); + + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift0 ); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift0 ); + + status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + +#undef _PACK_SELECT_KEY + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + return status; +} /* _scatter_nd_update_ref_initializer() */ + +DEF_KERNEL_INITIALIZER(_scatter_nd_update_update_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + int32_t block_size = 1; + int32_t update_width = 1; + int32_t index_num = 1; + int32_t width = 0, area = 0, vol = 0; + int32_t coord_dim = 0; + int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0; + int32_t input1_zp = 0; + float input1_scale = 1.0f; + int32_t output_zp = 0; + float output_scale = 1.0f; + uint32_t pack_key = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &width); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = 
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &area); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &vol); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &coord_dim); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + block_size = (int32_t)(attr[2]->shape->data[0]); + update_width = (int32_t)(attr[1]->shape->data[0]); + index_num = (int32_t)(attr[0]->shape->data[1]); + + input1_zp = attr[1]->asymm.zero_point; + input1_scale = attr[1]->asymm.scale; + output_zp = attr[2]->asymm.zero_point; + output_scale = 1.0f / attr[2]->asymm.scale; + + if (coord_dim == 5) + { + offset_idx = 1; + } + if (coord_dim == 4 || coord_dim == 5) + { + offsetX = vol; + offsetY = area; + offsetZ = width; + offsetW = 1; + } + else if (coord_dim == 3) + { + offsetX = area; + offsetY = width; + offsetZ = 1; + } + else if (coord_dim == 2) + { + offsetX = width; + offsetY = 1; + offsetZ = 0; + } + else if (coord_dim == 1) + { + offsetX = 1; + offsetY = 0; + offsetZ = 0; + } + + if (attr[1]->dtype == F16 || attr[1]->dtype == I16 || attr[1]->dtype == U16) + { + update_width = (update_width + 7) / 8; + } + else if (attr[1]->dtype == U8 || attr[1]->dtype == I8) + { + update_width = (update_width + 15) / 16; + } + + if (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == U16) + { + block_size = (block_size + 7) / 8; + } + else if (attr[2]->dtype == U8 || attr[2]->dtype == I8) + { + block_size = (block_size + 15) / 16; + } + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = block_size; + gpu_param.global_size[1] = index_num; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + status = vsi_nn_kernel_gpu_add_param( node, "update_width", &update_width ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetY", &offsetY ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetZ", &offsetZ ); + status |= vsi_nn_kernel_gpu_add_param( node, "offsetW", &offsetW ); + status |= vsi_nn_kernel_gpu_add_param( node, "offset_idx", &offset_idx ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ + (IN0_TYPE | ( OUT_TYPE << 16)) + + pack_key = _PACK_SELECT_KEY( attr[1]->dtype, attr[2]->dtype ); + + switch( pack_key ) + { + case _PACK_SELECT_KEY( I8, I8 ): + case _PACK_SELECT_KEY( U8, U8 ): + { + uint16_t M1 = 0; + int32_t postShift1 = 0; + uint32_t multAndoutZP1[2] = {0}; + + gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniU8MulAndPostShift_Hi_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x1b1a1918, 0x1f1e1d1c, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 
}; + + gpu_quantize_multiplier_16bit( (double)input1_scale * output_scale, &M1, &postShift1); + + multAndoutZP1[0] = (uint32_t)(M1); + multAndoutZP1[1] = (uint32_t)((output_zp << postShift1) - input1_zp * M1); + + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Lo_2x8, postShift1 ); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 ); + + status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } +#undef _PACK_SELECT_KEY + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + return status; +} /* _scatter_nd_update_update_initializer() */ + +DEF_KERNEL_INITIALIZER(_scatter_nd_update_copy_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + int32_t block_size = 1; + int32_t width = 0; + int32_t height = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + block_size = (int32_t)(attr[0]->shape->data[0]); + height = (int32_t)(attr[0]->shape->data[1]); + width = (int32_t)(block_size * height); + + if (attr[0]->dtype == F16 || attr[0]->dtype == I16 || attr[0]->dtype == U16) + { + width = (width + 7) / 8; + } + else if (attr[0]->dtype == U8 || attr[0]->dtype == I8) + { + width = (width + 15) / 16; + } + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = width; + gpu_param.global_size[1] = 1; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _scatter_nd_update_copy_initializer() */ + /* * Query kernel */ @@ -1210,7 +1791,7 @@ static vsi_status _query_kernel key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 0, isBig ); - for( i = 0; i < _cnt_of_array(scatter_nd_update_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_update_map); i ++ ) { if ( scatter_nd_update_map[i].key == key ) { @@ -1263,7 +1844,7 @@ static vsi_status _query_kernel_large key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, I32, I32, 1, 1 ); - for( i = 0; i < _cnt_of_array(scatter_nd_update_pre_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_update_pre_map); i ++ ) { if ( scatter_nd_update_pre_map[i].key == key ) { @@ -1292,7 +1873,7 @@ static vsi_status _query_kernel_large key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 0, 1 ); - for( i = 0; i < _cnt_of_array(scatter_nd_update_post_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_update_post_map); i ++ ) { if ( scatter_nd_update_post_map[i].key == key ) { @@ -1319,7 +1900,7 @@ static vsi_status 
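Both new initializers fold the whole requantization into a handful of uniforms: gpu_quantize_multiplier_16bit expresses the float scale ratio as a 16-bit multiplier plus post-shift, and multAndoutZP packs that multiplier with a bias that absorbs both zero points. A sketch of the arithmetic being set up (the library helper's exact contract may differ):

#include <math.h>
#include <stdint.h>

/* Express a positive float scale as a 16-bit multiplier M and a right
 * shift, then fold both zero points into one additive bias so the
 * kernel computes
 *     out = (in * M + bias) >> shift
 *         = ((in - in_zp) * M) >> shift + out_zp.                   */
static void quantize_multiplier_16bit(double scale, uint16_t* M, int32_t* shift)
{
    int e = 0;
    double m = frexp(scale, &e);               /* scale = m * 2^e, m in [0.5, 1) */
    int64_t q = (int64_t)llround(m * 65536.0); /* 16 fractional bits */

    if (q == 65536) { q >>= 1; e += 1; }       /* rounding hit 1.0 exactly */
    *M = (uint16_t)q;
    *shift = 16 - e;                           /* assumes a scale small enough that shift >= 0 */
}

/* Mirrors multAndoutZP[1] above: (out_zp << shift) - in_zp * M. */
static int64_t fold_zero_points(uint16_t M, int32_t shift,
                                int32_t in_zp, int32_t out_zp)
{
    return ((int64_t)out_zp << shift) - (int64_t)in_zp * M;
}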
_query_kernel_large key = HASH_SCATTER_ND_UPDATE_KEY( I32, I32, I32, 2, 1 ); - for( i = 0; i < _cnt_of_array(scatter_nd_update_reset_map); i ++ ) + for ( i = 0; i < _cnt_of_array(scatter_nd_update_reset_map); i ++ ) { if ( scatter_nd_update_reset_map[i].key == key ) { @@ -1346,6 +1927,111 @@ static vsi_status _query_kernel_large return status; } /* _query_kernel_large() */ +static vsi_status _query_kernel_special + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel_ref, + vsi_nn_kernel_t* kernel_update, + vsi_nn_kernel_t* kernel + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e input2_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 3, 1 ); + + for ( i = 0; i < _cnt_of_array(scatter_nd_update_ref_map); i ++ ) + { + if ( scatter_nd_update_ref_map[i].key == key ) + { + break; + } + } + + if ( i < _cnt_of_array(scatter_nd_update_ref_map) ) + { + snprintf( kernel_ref->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_ref_map[i].function_name ); + kernel_ref->info.parameters = _scatter_nd_update_ref_kernel_param_def; + kernel_ref->info.numParams = _SCATTER_ND_UPDATE_REF_PARAM_NUM; + kernel_ref->info.initialize = _scatter_nd_update_ref_initializer; + + vsi_nn_kernel_add_source( kernel_ref, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + scatter_nd_update_ref_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_ref, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_ref_map[i].source_name ); + } + else + { + status = VSI_FAILURE; + } + + + key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 4, 1 ); + + for ( i = 0; i < _cnt_of_array(scatter_nd_update_update_map); i ++ ) + { + if ( scatter_nd_update_update_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(scatter_nd_update_update_map) ) + { + snprintf( kernel_update->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_update_map[i].function_name ); + kernel_update->info.parameters = _scatter_nd_update_update_kernel_param_def; + kernel_update->info.numParams = _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM; + kernel_update->info.initialize = _scatter_nd_update_update_initializer; + + vsi_nn_kernel_add_source( kernel_update, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + scatter_nd_update_update_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_update, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_update_map[i].source_name ); + } + else + { + status |= VSI_FAILURE; + } + + key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 5, 1 ); + + for ( i = 0; i < _cnt_of_array(scatter_nd_update_copy_map); i ++ ) + { + if ( scatter_nd_update_copy_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(scatter_nd_update_copy_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_copy_map[i].function_name ); + kernel->info.parameters = _scatter_nd_update_copy_kernel_param_def; + kernel->info.numParams = _SCATTER_ND_UPDATE_COPY_PARAM_NUM; + kernel->info.initialize = _scatter_nd_update_copy_initializer; + + vsi_nn_kernel_add_source( kernel, 
VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + scatter_nd_update_copy_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_copy_map[i].source_name ); + } + else + { + status |= VSI_FAILURE; + } + return status; +} /* _query_kernel_special() */ + static vsi_nn_kernel_node_t _setup ( vsi_nn_graph_t * graph, @@ -1363,11 +2049,25 @@ static vsi_nn_kernel_node_t _setup vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); + int32_t idx_num = vsi_nn_kernel_param_get_int32( params, "idx_num" ); + vsi_size_t *input_size = inputs[2]->attr.size; + uint32_t dims_num = inputs[2]->attr.dim_num; int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; vsi_size_t width = 0, area = 0, vol = 0; int32_t big_flg = 0; vsi_nn_kernel_dtype_e update_dtype = vsi_nn_kernel_map_dtype(inputs[2]->attr.dtype.vx_type); + vsi_nn_kernel_dtype_e ref_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type); + vsi_nn_kernel_dtype_e output_dtype = vsi_nn_kernel_map_dtype(outputs[0]->attr.dtype.vx_type); + int32_t type_flg = ((update_dtype == U8 || update_dtype == I8 || update_dtype == I16) && + (update_dtype == ref_dtype && update_dtype == output_dtype)) ? 1 : 0; + int32_t special_flg = (block_size % 16 == 0 && type_flg) ? 1 : 0; int32_t i = 0; + int32_t isRepeat = 0; + + if (coord_dim > 4 && input_size[dims_num - 1] > 1) + { + return NULL; + } status = get_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0], coord_dim, 0, NULL, NULL, NULL, &rs_idx_dim, &big_flg); @@ -1380,7 +2080,122 @@ static vsi_nn_kernel_node_t _setup return NULL; } - if ((update_dtype == U8 || update_dtype == I8 || update_dtype == I16)) + check_scatter_nd_update_index_repeat(inputs, coord_dim, block_size, idx_num, &isRepeat); + + if (special_flg && isRepeat == 0) + { + vsi_nn_tensor_attr_t attr; + vsi_nn_kernel_node_t tmp_node = NULL; + vsi_nn_kernel_node_t ref_node = NULL; + vsi_nn_kernel_node_param_t ref_params[_SCATTER_ND_UPDATE_REF_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t node_params[_SCATTER_ND_UPDATE_UPDATE_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t cpy_params[_SCATTER_ND_UPDATE_COPY_PARAM_NUM] = { NULL }; + vsi_nn_kernel_t * ikernels[2] = { NULL }; + vsi_nn_tensor_t * tensors[3] = { NULL }; + + ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + ikernels[0]->unique_id = kernel->unique_id; + ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + ikernels[1]->unique_id = kernel->unique_id; + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype = outputs[0]->attr.dtype; + attr.is_const = FALSE; + attr.vtl = TRUE; + + for (i = 0; i < rs_out_dim; i++) + { + attr.size[i] = shapes[2][i]; + } + attr.dim_num = rs_out_dim; + + tensors[0] = vsi_nn_CreateTensor( graph, &attr ); + attr.size[0] = 1; + attr.size[1] = 1; + tensors[1] = vsi_nn_CreateTensor( graph, &attr ); + tensors[2] = vsi_nn_CreateTensor( graph, &attr ); + + status = _query_kernel_special( inputs, outputs, ikernels[0], ikernels[1], kernel); + if ( VSI_SUCCESS == status) + { + // convert ref to output + ref_node = vsi_nn_kernel_create_node( graph, ikernels[0] ); + if (ref_node) + { + uint32_t index = 0; + /* Pass parameters to node. 
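The special branch in _setup above wires three chained EVIS kernels, ref2out (requantize the reference tensor into an intermediate), update2ref (scatter the updates into it), and cpy2out (publish the result), using two 1x1 link tensors to enforce execution order. The gate for taking this path, summarized as a sketch (enum values hypothetical):

/* All three tensors must share one narrow integer dtype, the innermost
 * block must fill whole 16-element EVIS vectors, and the constant
 * index tensor must be collision-free. */
typedef enum { DT_U8, DT_I8, DT_I16, DT_OTHER } dtype_e;

static int use_special_path(dtype_e ref, dtype_e upd, dtype_e out,
                            int block_size, int has_repeat)
{
    int narrow_int = (upd == DT_U8 || upd == DT_I8 || upd == DT_I16);
    int same_type = (upd == ref && upd == out);

    return narrow_int && same_type
        && (block_size % 16 == 0) && !has_repeat;
}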
*/ + ref_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim ); + ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + ref_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; + status = vsi_nn_kernel_node_pass_param( ref_node, ref_params, _SCATTER_ND_UPDATE_REF_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &ref_params[0] ); + } + + // update + tmp_node = vsi_nn_kernel_create_node( graph, ikernels[1] ); + if (tmp_node) + { + uint32_t index = 0; + /* Pass parameters to node. */ + node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim ); + node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim ); + node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; + node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t; + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &area ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &vol ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); + status = vsi_nn_kernel_node_pass_param( tmp_node, node_params, _SCATTER_ND_UPDATE_UPDATE_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &node_params[0] ); + vsi_nn_kernel_tensor_release( &node_params[1] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + } + + // copy to output + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 0; + /* Pass parameters to node. 
*/ + cpy_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + cpy_params[index++] = (vsi_nn_kernel_node_param_t)tensors[2]->t; + cpy_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); + status = vsi_nn_kernel_node_pass_param( node, cpy_params, _SCATTER_ND_UPDATE_COPY_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &cpy_params[2] ); + } + } + + if ( ikernels[0] ) + { + vsi_nn_kernel_release( &ikernels[0] ); + } + if ( ikernels[1] ) + { + vsi_nn_kernel_release( &ikernels[1] ); + } + if ( tensors[0] ) + { + vsi_nn_ReleaseTensor( &tensors[0] ); + } + if ( tensors[1] ) + { + vsi_nn_ReleaseTensor( &tensors[1] ); + } + if ( tensors[2] ) + { + vsi_nn_ReleaseTensor( &tensors[2] ); + } + if (ref_node) {vsi_nn_kernel_node_release( &ref_node );} + if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} + } + else if ((update_dtype == U8 || update_dtype == I8 || update_dtype == I16)) { vsi_nn_tensor_attr_t attr; vsi_nn_kernel_node_t tmp_node = NULL; @@ -1402,7 +2217,7 @@ static vsi_nn_kernel_node_t _setup attr.is_const = FALSE; attr.vtl = TRUE; - for(i = 0; i < rs_out_dim; i++) + for (i = 0; i < rs_out_dim; i++) { attr.size[i] = shapes[2][i]; } diff --git a/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c b/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c index ac8ff6c..46595a1 100644 --- a/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c @@ -294,7 +294,7 @@ DEF_KERNEL_INITIALIZER(_warp_affine_initializer) } else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - output_scale = 1.0f / attr[1]->asymm.scale;; + output_scale = 1.0f / attr[1]->asymm.scale; output_zp = (float)attr[1]->asymm.zero_point; } diff --git a/src/tim/vx/internal/src/kernel/sp/layer_norm_y_direction_sp.c b/src/tim/vx/internal/src/kernel/sp/layer_norm_y_direction_sp.c deleted file mode 100644 index 79d2e3a..0000000 --- a/src/tim/vx/internal/src/kernel/sp/layer_norm_y_direction_sp.c +++ /dev/null @@ -1,797 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2021 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_node.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "vsi_nn_error.h" -#include "kernel/vsi_nn_kernel.h" -#include "kernel/vsi_nn_sp_unit_operation.h" -#include "kernel/vsi_nn_sp_lut.h" - -#if (VX_STREAM_PROCESSOR_SUPPORT) - -vsi_nn_spinst_t * vsi_nn_sp_moments_axis1_inst - ( - vx_context context, - int32_t fifo_depth, - int32_t max_vector_depth - ) -{ - vsi_status status = VSI_FAILURE; - const int32_t spInitInstsNum = fifo_depth == 1 ? 4 : 3; - const int32_t spLoopInstsNum = fifo_depth == 2 ? 4 : 3; - const int32_t spCompleteInstsNum = fifo_depth == 1 ? 3 : 0; - const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum + spCompleteInstsNum; - - vsi_nn_spinst_t *spinst = NULL; - vsi_nn_spinst_inst_param sp_insts_param[11]; - vsi_nn_spinst_attr_t attr; - - memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); - vsi_nn_init_spinst_attr(&attr); - - if (fifo_depth == 1) - { - /* init inst0: r3 = 0 */ - status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR3); - /* init inst1: r1 = 0 */ - status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 0, VSI_NN_SP_SR1); - /* init inst2: r4 = 0 */ - status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 0, VSI_NN_SP_SR4); - /* init inst3: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[3]); - CHECK_STATUS_FAIL_GOTO(status, final ); - - /* loop inst0: r5 = r1 * r1 || r1 = in */ - status = vsi_nn_sp_mul(&sp_insts_param[4], VSI_NN_SP_SR1, VSI_NN_SP_SR1, VSI_NN_SP_SR5); - status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SRIN, VSI_NN_SP_SR1); - /* loop inst1: r3 = r3 + r1 || out = r1 */ - status |= vsi_nn_sp_add(&sp_insts_param[5], VSI_NN_SP_SR3, VSI_NN_SP_SR1, VSI_NN_SP_SR3); - status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SROUT); - /* loop inst2: r5 = r5 + r4 */ - status |= vsi_nn_sp_add(&sp_insts_param[6], VSI_NN_SP_SR5, VSI_NN_SP_SR4, VSI_NN_SP_SR5); - CHECK_STATUS_FAIL_GOTO(status, final ); - - /* complete inst0: v11 = r3 */ - status = vsi_nn_sp_move(&sp_insts_param[7], VSI_NN_SP_SR3, VSI_NN_SP_VR11); - /* complete inst1: r3 = r3 + r1 || out = r1 */ - status |= vsi_nn_sp_nop(&sp_insts_param[8]); - /* complete inst2: v12 = r4 */ - status = vsi_nn_sp_move(&sp_insts_param[9], VSI_NN_SP_SR4, VSI_NN_SP_VR12); - CHECK_STATUS_FAIL_GOTO(status, final ); - - attr.flush_cycle_num = 8; - } - else if (fifo_depth == 2) - { - /* init inst0: r3 = 0 */ - status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR3); - /* init inst1: r2 = 1 */ - status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 1, VSI_NN_SP_SR2); - /* init inst2: r4 = 0 */ - status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 0, VSI_NN_SP_SR4); - CHECK_STATUS_FAIL_GOTO(status, final ); - - /* loop inst0: out = r2 * r1 || v11 = r1 + r3 | r1 = in */ - status = vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_SR2, VSI_NN_SP_SR1, VSI_NN_SP_SROUT); - status |= vsi_nn_sp_add(&sp_insts_param[3], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_VR11); - status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SRIN, VSI_NN_SP_SR1); - /* loop inst1: v12 = r4 + r5 | r3 = v11 */ - status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR4, VSI_NN_SP_SR5, VSI_NN_SP_VR12); - status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR3); - /* loop inst2: r4 = v12 */ - status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_VR12, 
VSI_NN_SP_SR4); - /* loop inst3: r5 = r1 * r1 */ - status |= vsi_nn_sp_mul(&sp_insts_param[6], VSI_NN_SP_SR1, VSI_NN_SP_SR1, VSI_NN_SP_SR5); - CHECK_STATUS_FAIL_GOTO(status, final ); - - attr.flush_cycle_num = 5; - - attr.ignored_leading_v11_rd = fifo_depth; - attr.ignored_leading_v12_rd = fifo_depth; - attr.ignored_leading_v11_wr = 1; - attr.ignored_leading_v12_wr = 1; - - attr.num_of_v11_rd_in_flush_cycle = 1; - attr.num_of_v12_rd_in_flush_cycle = 1; - attr.num_of_v11_wr_in_flush_cycle = 1; - attr.num_of_v12_wr_in_flush_cycle = 2; - } - else - { - /* init inst0: r3 = 0 */ - status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR3); - /* init inst1: r2 = 0 */ - status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 0, VSI_NN_SP_SR2); - /* init inst2: r4 = 0 */ - status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 0, VSI_NN_SP_SR4); - CHECK_STATUS_FAIL_GOTO(status, final ); - - /* loop inst0: r5 = r1 * r1 | out = r2 + r1 || r1 = in */ - status = vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_SR1, VSI_NN_SP_SR1, VSI_NN_SP_SR5); - status |= vsi_nn_sp_add(&sp_insts_param[3], VSI_NN_SP_SR2, VSI_NN_SP_SR1, VSI_NN_SP_SROUT); - status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SRIN, VSI_NN_SP_SR1); - /* loop inst1: v11 = r1 + r3 | r3 = v11 */ - status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_VR11); - status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR3); - /* loop inst2: v12 = r4 + r5 | r4 = v12 */ - status |= vsi_nn_sp_add(&sp_insts_param[5], VSI_NN_SP_SR4, VSI_NN_SP_SR5, VSI_NN_SP_VR12); - status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_VR12, VSI_NN_SP_SR4); - CHECK_STATUS_FAIL_GOTO(status, final ); - - attr.ignored_leading_v11_rd = fifo_depth; - attr.ignored_leading_v12_rd = fifo_depth; - attr.ignored_leading_v11_wr = 1; - attr.ignored_leading_v12_wr = 1; - - attr.num_of_v11_rd_in_flush_cycle = 1; - attr.num_of_v12_rd_in_flush_cycle = 1; - attr.num_of_v11_wr_in_flush_cycle = 2; - attr.num_of_v12_wr_in_flush_cycle = 2; - - attr.flush_cycle_num = 5; - } - - attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE; - attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; - - attr.prog_init_instr_num = spInitInstsNum; - attr.prog_loop_instr_num = spLoopInstsNum; - attr.prog_complete_instr_num = spCompleteInstsNum; - attr.ignored_leading_outputs = 1; - attr.v11_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; - attr.v12_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; - - attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_X; - attr.split_tilex_equal_imgx = TRUE; - attr.split_max_vector_depth = max_vector_depth; - - spinst = vsi_nn_create_spinst_by_context(context); - CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); - status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); - status |= vsi_nn_set_spinst_attr(spinst, attr); - CHECK_STATUS_FAIL_GOTO(status, final ); - -final: - return spinst; -} - -DEF_SP_KERNEL_QUERY(moements_axis1_query) - ( - vsi_nn_kernel_node_t node - ) -{ - vsi_status status = VSI_FAILURE; - vx_size index = 0; - vx_size tile_size[2] = {0}; - vsi_nn_spinst_t *spinst = NULL; - int32_t fifo_depth = 0; - int32_t max_vector_depth = 0; - vx_context ctx = vxGetContext((vx_reference)node); - vx_hardware_caps_params_ext2_t hw_param; - - memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t)); - status = vxQueryHardwareCaps(ctx, (vx_hardware_caps_params_t*)(&hw_param), sizeof(vx_hardware_caps_params_ext2_t)); - CHECK_STATUS_FAIL_GOTO( status, final ); - - status = 
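All three variants of the deleted moments_axis1 program compute the same pair of reductions and differ only in how they are pipelined for the given FIFO depth: the running sum flows through r3 into v11 and the running sum of squares through r4/r5 into v12. What one lane computes, as plain C:

/* Plain C model of the per-lane reduction the SP programs implement:
 * r3/v11 accumulate the sum, r4/r5/v12 the sum of squares. */
static void moments_axis1(const float* x, int n, float* sum, float* sum_sq)
{
    float s = 0.0f, ss = 0.0f;
    int i;

    for (i = 0; i < n; i++)
    {
        s += x[i];
        ss += x[i] * x[i];
    }
    *sum = s;
    *sum_sq = ss;
}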
vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size)); - CHECK_STATUS_FAIL_GOTO( status, final ); - status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index)); - CHECK_STATUS_FAIL_GOTO( status, final ); - - fifo_depth = (int32_t)ceil((float)tile_size[0] / (float)hw_param.streamProcessorExecCount); - max_vector_depth = hw_param.streamProcessorVectorSize; - - spinst = vsi_nn_sp_moments_axis1_inst(ctx, fifo_depth, max_vector_depth); - - status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - if (spinst) - { - vsi_nn_release_spinst(&spinst); - } - - return status; -} - -vsi_nn_kernel_node_t vsi_nn_sp_moments_axis1_node - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t * input, - vsi_nn_tensor_t * output0, - vsi_nn_tensor_t * output1 - ) -{ - const uint32_t input_count = 1; - const uint32_t output_count = 2; - vx_tensor inputs_tensor[1] = {NULL}; - vx_tensor outputs_tensor[2] = {NULL}; - vx_node node = NULL; - int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; - int32_t fifo_depth = 4; - - vsi_nn_spinst_t *spinst = NULL; - - spinst = vsi_nn_sp_moments_axis1_inst(graph->ctx->c, fifo_depth, max_vector_depth); - - inputs_tensor[0] = input->t; - outputs_tensor[0] = output0->t; - outputs_tensor[1] = output1->t; - node = vxStreamProcessorNode( - graph->g, - inputs_tensor, - input_count, - outputs_tensor, - output_count, - spinst->sp, - NULL); - - if (node) - { - vxAssignNodeQueryCallback(node, moements_axis1_query); - } - - if (spinst) - { - vsi_nn_release_spinst(&spinst); - } - - return (vsi_nn_kernel_node_t)node; -} - -vsi_nn_kernel_node_t vsi_nn_sp_ln_means_axis1_node - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t * input, - vsi_nn_tensor_t * output, - float inv_m, - float const_a, - float s, - float eps, - float output_scale - ) -{ - const int32_t spInitInstsNum = 2; - const int32_t spLoopInstsNum = 5; - const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; - - const uint32_t input_count = 1; - const uint32_t output_count = 1; - vx_tensor inputs_tensor[1] = {NULL}; - vx_tensor outputs_tensor[1] = {NULL}; - vx_node node = NULL; - int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; - - vsi_nn_spinst_t *spinst = NULL; - vsi_nn_spinst_inst_param sp_insts_param[7]; - vsi_nn_spinst_attr_t attr; - vsi_nn_sp_lut_params sp_lut_params; - vx_lut_params_s vx_lut_params; - - vsi_status status = VSI_FAILURE; - - memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); - vsi_nn_init_spinst_attr(&attr); - memset(&sp_lut_params, 0, sizeof(vsi_nn_sp_lut_params)); - memset(&vx_lut_params, 0, sizeof(vx_lut_params_s)); - - /* init inst0: r2 = const_a */ - status = vsi_nn_sp_move_constant(&sp_insts_param[0], const_a, VSI_NN_SP_SR2); - /* init inst1: r3 = inv_m */ - status = vsi_nn_sp_move_constant(&sp_insts_param[1], inv_m, VSI_NN_SP_SR3); - /* loop inst0: r4 = v11 * v11 || r6 = r4 + r5 || r5 = v11*/ - status = vsi_nn_sp_mul(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_VR11, VSI_NN_SP_SR4); - status |= vsi_nn_sp_add(&sp_insts_param[2], VSI_NN_SP_SR4, VSI_NN_SP_SR5, VSI_NN_SP_SR6); - status |= vsi_nn_sp_move(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_SR5); - /* loop inst1: r1 = pwlMul() || r7 = pwlAdd() */ - status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR1); - status |= vsi_nn_sp_sub(&sp_insts_param[3], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR7); - /* loop inst2: r5 = r2 * v12 || v12 = r8 + r7 */ - status 
|= vsi_nn_sp_mul(&sp_insts_param[4], VSI_NN_SP_SR2, VSI_NN_SP_VR12, VSI_NN_SP_SR5); - status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR8, VSI_NN_SP_SR7, VSI_NN_SP_VR12); - /* loop inst3: r1 = setup(r6) || v11 = r3 * r5 || r7 = r1 */ - status |= vsi_nn_sp_pwl_setup0(&sp_insts_param[5], VSI_NN_SP_SR6, VSI_NN_SP_SR1); - status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_SR3, VSI_NN_SP_SR5, VSI_NN_SP_VR11); - status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR7); - /* loop inst4: r8 = r1 * r7 */ - status |= vsi_nn_sp_mul(&sp_insts_param[6], VSI_NN_SP_SR1, VSI_NN_SP_SR7, VSI_NN_SP_SR8); - CHECK_STATUS_FAIL_GOTO(status, final ); - - attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE; - - attr.input_setup = VSI_NN_SP_INPUT_SETUP_V11; - attr.prog_init_instr_num = spInitInstsNum; - attr.prog_loop_instr_num = spLoopInstsNum; - attr.ignored_leading_outputs = 0; - attr.ignored_leading_v11_wr = 0; - attr.ignored_leading_v12_wr = 3; - attr.ignored_leading_v11_rd = 0; - attr.flush_cycle_num = 17; - - attr.num_of_v11_rd_in_flush_cycle = 0; - attr.num_of_v12_rd_in_flush_cycle = 1; - attr.num_of_v11_wr_in_flush_cycle = 1; - attr.num_of_v12_wr_in_flush_cycle = 4; - - attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_X; - attr.split_tilex_equal_imgx = TRUE; - attr.split_max_vector_depth = max_vector_depth; - - spinst = vsi_nn_create_spinst(graph); - CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); - status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); - status |= vsi_nn_set_spinst_attr(spinst, attr); - CHECK_STATUS_FAIL_GOTO(status, final ); - - inputs_tensor[0] = input->t; - outputs_tensor[0] = output->t; - - vx_lut_params.lut_function = VX_NN_ACTIVATION_CUSTOM; - vx_lut_params.in_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE); - vx_lut_params.out_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE); - - sp_lut_params.act_type = VSI_NN_SP_ACT_LINEAR_RSQRT; - sp_lut_params.params[0] = s; - sp_lut_params.params[1] = eps; - sp_lut_params.params[2] = output_scale; - vsi_nn_sp_lut(vx_lut_params.in_lut, vx_lut_params.out_lut, &sp_lut_params); - - node = vxStreamProcessorNode( - graph->g, - inputs_tensor, - input_count, - outputs_tensor, - output_count, - spinst->sp, - &vx_lut_params); - -final: - if (spinst) - { - vsi_nn_release_spinst(&spinst); - } - - if (vx_lut_params.in_lut) - { - vxReleaseLUT(&vx_lut_params.in_lut); - vx_lut_params.in_lut = NULL; - } - if (vx_lut_params.out_lut) - { - vxReleaseLUT(&vx_lut_params.out_lut); - vx_lut_params.out_lut = NULL; - } - - return (vsi_nn_kernel_node_t)node; -} - -vsi_nn_spinst_t * vsi_nn_sp_layer_norm_axis1_inst - ( - vx_context context, - int32_t fifo_depth, - int32_t max_vector_depth - ) -{ - vsi_status status = VSI_FAILURE; - const int32_t spInitInstsNum = 0; - const int32_t spLoopInstsNum = fifo_depth > 3 ? 
2 : 5; - const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; - - vsi_nn_spinst_t *spinst = NULL; - vsi_nn_spinst_inst_param sp_insts_param[5]; - vsi_nn_spinst_attr_t attr; - - memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); - vsi_nn_init_spinst_attr(&attr); - - if (fifo_depth > 3) - { - /* loop inst0: out = in - v11 || v11 = v11 */ - status = vsi_nn_sp_sub(&sp_insts_param[0], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR1); - status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR11, VSI_NN_SP_VR11); - /* loop inst1: out = r1 * v12 | v12 = v12 */ - status |= vsi_nn_sp_mul(&sp_insts_param[1], VSI_NN_SP_SR1, VSI_NN_SP_VR12, VSI_NN_SP_SROUT); - status |= vsi_nn_sp_move(&sp_insts_param[1], VSI_NN_SP_VR12, VSI_NN_SP_VR12); - CHECK_STATUS_FAIL_GOTO(status, final ); - - attr.flush_cycle_num = 3; - attr.ignored_leading_v12_rd = 1; - attr.ignored_leading_v12_wr = 1; - - attr.num_of_v11_rd_in_flush_cycle = 0; - attr.num_of_v12_rd_in_flush_cycle = 2; - attr.num_of_v11_wr_in_flush_cycle = 0; - attr.num_of_v12_wr_in_flush_cycle = 2; - } - else - { - /* loop inst0: out = in - v11 || v11 = v11 */ - status = vsi_nn_sp_sub(&sp_insts_param[0], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR1); - status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR11, VSI_NN_SP_VR11); - /* loop inst1: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[1]); - /* loop inst2: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[2]); - /* loop inst3: out = r1 * v12 | v12 = v12 */ - status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_SR1, VSI_NN_SP_VR12, VSI_NN_SP_SROUT); - status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_VR12, VSI_NN_SP_VR12); - /* loop inst4: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[4]); - CHECK_STATUS_FAIL_GOTO(status, final ); - - attr.flush_cycle_num = 4; - attr.ignored_leading_v12_rd = 0; - attr.ignored_leading_v12_wr = 0; - - attr.num_of_v11_rd_in_flush_cycle = 0; - attr.num_of_v12_rd_in_flush_cycle = 1; - attr.num_of_v11_wr_in_flush_cycle = 0; - attr.num_of_v12_wr_in_flush_cycle = 1; - } - - attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE; - attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; - - attr.prog_init_instr_num = spInitInstsNum; - attr.prog_loop_instr_num = spLoopInstsNum; - attr.ignored_leading_outputs = 0; - attr.ignored_leading_v11_rd = 0; - attr.ignored_leading_v11_wr = 0; - - attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_X; - attr.split_tilex_equal_imgx = TRUE; - attr.split_max_vector_depth = max_vector_depth; - - spinst = vsi_nn_create_spinst_by_context(context); - CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); - status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); - status |= vsi_nn_set_spinst_attr(spinst, attr); - CHECK_STATUS_FAIL_GOTO(status, final ); - -final: - return spinst; -} - -DEF_SP_KERNEL_QUERY(layer_norm_axis1_query) - ( - vsi_nn_kernel_node_t node - ) -{ - vsi_status status = VSI_FAILURE; - vx_size index = 0; - vx_size tile_size[2] = {0}; - vsi_nn_spinst_t *spinst = NULL; - int32_t fifo_depth = 0; - int32_t max_vector_depth = 0; - vx_context ctx = vxGetContext((vx_reference)node); - vx_hardware_caps_params_ext2_t hw_param; - - memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t)); - status = vxQueryHardwareCaps(ctx, (vx_hardware_caps_params_t*)(&hw_param), sizeof(vx_hardware_caps_params_ext2_t)); - CHECK_STATUS_FAIL_GOTO( status, final ); - - status = vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size)); - CHECK_STATUS_FAIL_GOTO( status, final ); - 
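/* The query callback fires after SW tiling has fixed the real tile width, so the program built at node-creation time with an assumed fifo_depth is regenerated here with fifo_depth = ceil(tile_x / exec_count) and swapped back in through the queried SPINST parameter index. */ -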
status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index)); - CHECK_STATUS_FAIL_GOTO( status, final ); - - fifo_depth = (int32_t)ceil((float)tile_size[0] / (float)hw_param.streamProcessorExecCount); - max_vector_depth = hw_param.streamProcessorVectorSize; - - spinst = vsi_nn_sp_layer_norm_axis1_inst(ctx, fifo_depth, max_vector_depth); - - status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - if (spinst) - { - vsi_nn_release_spinst(&spinst); - } - - return status; -} - -vsi_nn_kernel_node_t vsi_nn_sp_layer_norm_axis1_node - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t * input0, - vsi_nn_tensor_t * input1, - vsi_nn_tensor_t * output - ) -{ - const uint32_t input_count = 2; - const uint32_t output_count = 1; - vx_tensor inputs_tensor[2] = {NULL}; - vx_tensor outputs_tensor[1] = {NULL}; - vx_node node = NULL; - int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; - int32_t fifo_depth = 4; - vsi_nn_spinst_t *spinst = NULL; - - spinst = vsi_nn_sp_layer_norm_axis1_inst(graph->ctx->c, fifo_depth, max_vector_depth); - - inputs_tensor[0] = input0->t; - inputs_tensor[1] = input1->t; - outputs_tensor[0] = output->t; - node = vxStreamProcessorNode( - graph->g, - inputs_tensor, - input_count, - outputs_tensor, - output_count, - spinst->sp, - NULL); - - if (node) - { - vxAssignNodeQueryCallback(node, layer_norm_axis1_query); - } - - if (spinst) - { - vsi_nn_release_spinst(&spinst); - } - - return (vsi_nn_kernel_node_t)node; -} - -vsi_nn_kernel_node_t vsi_nn_sp_load_weight_bias_node - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t * weight, - vsi_nn_tensor_t * bias, - vsi_nn_tensor_t * dummy_output - ) -{ - const int32_t spLoopInstsNum = 2; - const int32_t spInstsNum = spLoopInstsNum; - - const uint32_t input_count = 2; - const uint32_t output_count = 1; - vx_tensor inputs_tensor[2] = {NULL}; - vx_tensor outputs_tensor[2] = {NULL}; - vx_node node = NULL; - int32_t max_vector_depth = graph->ctx->config.sp_vector_depth / - graph->ctx->config.sp_exec_count; - - vsi_nn_spinst_t *spinst = NULL; - vsi_nn_spinst_inst_param sp_insts_param[2]; - vsi_nn_spinst_attr_t attr; - - vsi_status status = VSI_FAILURE; - - memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); - vsi_nn_init_spinst_attr(&attr); - - /* loop inst0: v11 = in */ - status = vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_SRIN, VSI_NN_SP_VR11); - /* loop inst1: v12 = in */ - status |= vsi_nn_sp_move(&sp_insts_param[1], VSI_NN_SP_SRIN, VSI_NN_SP_VR12); - CHECK_STATUS_FAIL_GOTO(status, final ); - - attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE; - attr.input_setup = VSI_NN_SP_INPUT_SETUP_INTERLEAVE_TWO_INPUT; - - attr.prog_loop_instr_num = spLoopInstsNum; - attr.ignored_leading_outputs = 0; - attr.flush_cycle_num = 0; - attr.ignored_leading_v11_rd = 0; - attr.ignored_leading_v11_wr = 0; - attr.ignored_leading_v12_rd = 0; - attr.ignored_leading_v12_wr = 0; - attr.v11_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; - attr.v12_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; - attr.ch0_post_redistribute = VSI_NN_SP_CH_POST_REDISTRIBUTE_VECTOR_GATHER; - attr.ch1_post_redistribute = VSI_NN_SP_CH_POST_REDISTRIBUTE_VECTOR_GATHER; - - attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_YZ; - attr.split_max_vector_depth = max_vector_depth; - - spinst = vsi_nn_create_spinst(graph); - CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); - status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); - status 
|= vsi_nn_set_spinst_attr(spinst, attr); - CHECK_STATUS_FAIL_GOTO(status, final ); - - inputs_tensor[0] = weight->t; - inputs_tensor[1] = bias->t; - outputs_tensor[0] = dummy_output->t; - - node = vxStreamProcessorNode( - graph->g, - inputs_tensor, - input_count, - outputs_tensor, - output_count, - spinst->sp, - NULL); - -final: - if (spinst) - { - vsi_nn_release_spinst(&spinst); - } - - return (vsi_nn_kernel_node_t)node; -} - -vsi_nn_kernel_node_t vsi_nn_sp_in_times_v11_plus_v12_node - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t * input, - vsi_nn_tensor_t * dummy_tensor, - vsi_nn_tensor_t * output - ) -{ - const int32_t spLoopInstsNum = 1; - const int32_t spInstsNum = spLoopInstsNum; - - const uint32_t input_count = 2; - const uint32_t output_count = 1; - vx_tensor inputs_tensor[3] = {NULL}; - vx_tensor outputs_tensor[1] = {NULL}; - vx_node node = NULL; - int32_t max_vector_depth = graph->ctx->config.sp_vector_depth / - graph->ctx->config.sp_exec_count; - - vsi_nn_spinst_t *spinst = NULL; - vsi_nn_spinst_inst_param sp_insts_param[1]; - vsi_nn_spinst_attr_t attr; - - vsi_status status = VSI_FAILURE; - - memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); - vsi_nn_init_spinst_attr(&attr); - - /* loop inst0: r1 = in * v11 || out = r1 + v12 */ - status = vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR1); - status |= vsi_nn_sp_add(&sp_insts_param[0], VSI_NN_SP_SR1, VSI_NN_SP_VR12, VSI_NN_SP_SROUT); - CHECK_STATUS_FAIL_GOTO(status, final ); - - attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE; - - attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; - attr.prog_loop_instr_num = spLoopInstsNum; - attr.ignored_leading_outputs = 3; - attr.ignored_leading_v11_rd = 0; - attr.ignored_leading_v12_rd = 3; - attr.flush_cycle_num = 3; - attr.v11_push_pop_config = VSI_NN_SP_PUSH_POP_EVERY_ROW; - attr.v12_push_pop_config = VSI_NN_SP_PUSH_POP_EVERY_ROW; - - attr.num_of_v11_rd_in_flush_cycle = 0; - attr.num_of_v12_rd_in_flush_cycle = 3; - - attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_YZ; - attr.split_max_vector_depth = max_vector_depth; - - spinst = vsi_nn_create_spinst(graph); - CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); - status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); - status |= vsi_nn_set_spinst_attr(spinst, attr); - CHECK_STATUS_FAIL_GOTO(status, final ); - - inputs_tensor[0] = input->t; - inputs_tensor[1] = dummy_tensor->t; - outputs_tensor[0] = output->t; - node = vxStreamProcessorNode( - graph->g, - inputs_tensor, - input_count, - outputs_tensor, - output_count, - spinst->sp, - NULL); - -final: - if (spinst) - { - vsi_nn_release_spinst(&spinst); - } - - return (vsi_nn_kernel_node_t)node; -} - -/* -** This program requires sum operation in the Y dimension. -** Instead of using the SUM Engine, the sum needs to be performed -** by Stream Processor instructions. 
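-** -** Roughly, the five SP nodes below compute, for each normalized slice of length M: -** mean = sum(x) / M -** var = sum(x * x) / M - mean * mean -** out = (x - mean) * rsqrt(var + eps) * weight + bias -** with the rsqrt folded into the SP lookup table (VSI_NN_SP_ACT_LINEAR_RSQRT).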
-*/ -vsi_nn_kernel_node_t layer_norm_y_direction - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - const vsi_nn_kernel_param_t * params - ) -{ - vsi_nn_kernel_node_t node = NULL; - vsi_nn_tensor_attr_t attr; - vsi_nn_tensor_t * dummy_tensor[3] = {NULL}; - vsi_nn_tensor_t * output_tensor[2] = {NULL}; - int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); - float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); - float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); - float inv_m = 1.0f / (float)(outputs[0]->attr.size[0]); - float s = inv_m * inv_m; - float const_a = (float)(outputs[0]->attr.size[0]); - - memcpy( &attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t) ); - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; - attr.is_const = FALSE; - attr.vtl = TRUE; - attr.is_dummy = TRUE; - attr.size[axis] = 1; - dummy_tensor[0] = vsi_nn_CreateTensor( graph, &attr ); - CHECK_PTR_FAIL_GOTO( dummy_tensor[0], "Create dummy_tensor fail.", final ); - dummy_tensor[1] = vsi_nn_CreateTensor( graph, &attr ); - CHECK_PTR_FAIL_GOTO( dummy_tensor[1], "Create dummy_tensor fail.", final ); - memcpy( &attr.size, &inputs[2]->attr.size, sizeof(inputs[2]->attr.size) ); - attr.dim_num = inputs[2]->attr.dim_num; - dummy_tensor[2] = vsi_nn_CreateTensor( graph, &attr ); - CHECK_PTR_FAIL_GOTO( dummy_tensor[2], "Create dummy_tensor fail.", final ); - - memcpy( &attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t) ); - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; - attr.is_const = FALSE; - attr.vtl = TRUE; - output_tensor[0] = vsi_nn_CreateTensor( graph, &attr ); - CHECK_PTR_FAIL_GOTO( output_tensor[0], "Create tensor fail.", final ); - output_tensor[1] = vsi_nn_CreateTensor( graph, &attr ); - CHECK_PTR_FAIL_GOTO( output_tensor[1], "Create tensor fail.", final ); - - node = vsi_nn_sp_moments_axis1_node(graph, inputs[0], output_tensor[0], dummy_tensor[0]); - CHECK_PTR_FAIL_GOTO( node, "Create sp_moments_axis1 fail.", final ); - node = vsi_nn_sp_ln_means_axis1_node(graph, dummy_tensor[0], dummy_tensor[1], - inv_m, const_a, s, eps, output_scale); - CHECK_PTR_FAIL_GOTO( node, "Create ln_y_direction_means fail.", final ); - node = vsi_nn_sp_layer_norm_axis1_node(graph, output_tensor[0], dummy_tensor[1], output_tensor[1]); - CHECK_PTR_FAIL_GOTO( node, "Create layer_norm_axis1 fail.", final ); - - node = vsi_nn_sp_load_weight_bias_node(graph, inputs[2], inputs[1], dummy_tensor[2]); - CHECK_PTR_FAIL_GOTO( node, "Create load_weight_bias fail.", final ); - node = vsi_nn_sp_in_times_v11_plus_v12_node(graph, output_tensor[1], dummy_tensor[2], outputs[0]); - CHECK_PTR_FAIL_GOTO( node, "Create in_times_v11_plus_v12 fail.", final ); - -final: - vsi_safe_release_tensor(dummy_tensor[0]); - vsi_safe_release_tensor(dummy_tensor[1]); - vsi_safe_release_tensor(dummy_tensor[2]); - vsi_safe_release_tensor(output_tensor[0]); - vsi_safe_release_tensor(output_tensor[1]); - - return node; -} /* layer_norm_y_direction() */ - - -#endif diff --git a/src/tim/vx/internal/src/kernel/sp/softmax_z_direction_sp.c b/src/tim/vx/internal/src/kernel/sp/softmax_z_direction_sp.c deleted file mode 100644 index cb550c2..0000000 --- a/src/tim/vx/internal/src/kernel/sp/softmax_z_direction_sp.c +++ /dev/null @@ -1,938 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2021 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person 
obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_node.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "vsi_nn_error.h" -#include "kernel/vsi_nn_kernel.h" -#include "kernel/vsi_nn_sp_unit_operation.h" -#include "kernel/vsi_nn_sp_lut.h" - -#if (VX_STREAM_PROCESSOR_SUPPORT) - -vsi_nn_spinst_t * vsi_nn_sp_max_axis2_inst - ( - vx_context context, - int32_t fifo_depth, - int32_t max_vector_depth - ) -{ - vsi_status status = VSI_FAILURE; - const int32_t spInitInstsNum = 4; - const int32_t spLoopInstsNum = fifo_depth > 4 ? 3 : 11; - const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; - uint32_t f32_min = 0xff800000; - float clampMin = *(float*)&f32_min; - vsi_nn_spinst_t *spinst = NULL; - vsi_nn_spinst_inst_param sp_insts_param[15]; - vsi_nn_spinst_attr_t attr; - - memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); - vsi_nn_init_spinst_attr(&attr); - - /* init inst0: r2 = -INF */ - status = vsi_nn_sp_move_constant(&sp_insts_param[0], clampMin, VSI_NN_SP_SR2); - /* init inst1: r10 = 0 */ - status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 0, VSI_NN_SP_SR10); - /* init inst2: r4 = 1 */ - status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 1, VSI_NN_SP_SR4); - /* init inst3: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[3]); - CHECK_STATUS_FAIL_GOTO(status, final); - - if (fifo_depth > 4) - { - /* loop inst0: r1 = clamp(r3 * in, r6, r7) | r2 = v11 + r10 | r9 = r2 */ - status = vsi_nn_sp_mul_clamp(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SRIN, VSI_NN_SP_SR1); - status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR10, VSI_NN_SP_SR2); - status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SR2, VSI_NN_SP_SR9); - /* loop inst1: r8 = r1 * r4 | r5 = r1 - r2 | v11 = r5 ? 
r8 : r9 */ - status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_SR8); - status |= vsi_nn_sp_sub(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR2, VSI_NN_SP_SR5); - status |= vsi_nn_sp_move_sel0(&sp_insts_param[5], VSI_NN_SP_SR5, VSI_NN_SP_SR8, VSI_NN_SP_VR11); - /* loop inst2: out = r1 */ - status |= vsi_nn_sp_move(&sp_insts_param[6], VSI_NN_SP_SR1, VSI_NN_SP_SROUT); - CHECK_STATUS_FAIL_GOTO(status, final ); - - attr.flush_cycle_num = 7; - - attr.ignored_leading_outputs = 1; - attr.ignored_leading_v11_rd = fifo_depth; - attr.ignored_leading_v11_wr = 2; - - attr.num_of_v11_rd_in_flush_cycle = 0; - attr.num_of_v11_wr_in_flush_cycle = 3; - } - else - { - /* loop inst0: r1 = clamp(r3 * in, r6, r7) | r2 = v11 + r10 */ - status = vsi_nn_sp_mul_clamp(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SRIN, VSI_NN_SP_SR1); - status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR10, VSI_NN_SP_SR2); - /* loop inst1: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[5]); - /* loop inst2: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[6]); - /* loop inst3: r8 = r1 * r4 | r5 = r1 - r2 | r9 = r2 */ - status |= vsi_nn_sp_mul(&sp_insts_param[7], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_SR8); - status |= vsi_nn_sp_sub(&sp_insts_param[7], VSI_NN_SP_SR1, VSI_NN_SP_SR2, VSI_NN_SP_SR5); - status |= vsi_nn_sp_move(&sp_insts_param[7], VSI_NN_SP_SR2, VSI_NN_SP_SR9); - /* loop inst4: out = r1 */ - status |= vsi_nn_sp_move(&sp_insts_param[8], VSI_NN_SP_SR1, VSI_NN_SP_SROUT); - /* loop inst5: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[9]); - /* loop inst6: v11 = r5 ? r8 : r9 */ - status |= vsi_nn_sp_move_sel0(&sp_insts_param[10], VSI_NN_SP_SR5, VSI_NN_SP_SR8, VSI_NN_SP_VR11); - /* loop inst7: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[11]); - /* loop inst8: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[12]); - /* loop inst9: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[13]); - /* loop inst10: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[14]); - CHECK_STATUS_FAIL_GOTO(status, final ); - - attr.ignored_leading_outputs = 0; - attr.ignored_leading_v11_rd = fifo_depth; - attr.ignored_leading_v11_wr = 0; - - attr.num_of_v11_rd_in_flush_cycle = 0; - attr.num_of_v11_wr_in_flush_cycle = 1; - - attr.flush_cycle_num = 10; - } - - attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE; - attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; - - attr.prog_init_instr_num = spInitInstsNum; - attr.prog_loop_instr_num = spLoopInstsNum; - attr.v11_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; - - attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY; - attr.split_tilex_equal_imgx = TRUE; - attr.split_max_vector_depth = max_vector_depth; - - spinst = vsi_nn_create_spinst_by_context(context); - CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); - status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); - status |= vsi_nn_set_spinst_attr(spinst, attr); - CHECK_STATUS_FAIL_GOTO(status, final ); - -final: - return spinst; -} - -DEF_SP_KERNEL_QUERY(max_axis2_query) - ( - vsi_nn_kernel_node_t node - ) -{ - vsi_status status = VSI_FAILURE; - vx_size index = 0; - vx_size tile_size[2] = {0}; - vsi_nn_spinst_t *spinst = NULL; - int32_t fifo_depth = 0; - int32_t max_vector_depth = 0; - vx_context ctx = vxGetContext((vx_reference)node); - vx_hardware_caps_params_ext2_t hw_param; - - memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t)); - status = vxQueryHardwareCaps(ctx, (vx_hardware_caps_params_t*)(&hw_param), 
sizeof(vx_hardware_caps_params_ext2_t)); - CHECK_STATUS_FAIL_GOTO( status, final ); - - status = vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size)); - CHECK_STATUS_FAIL_GOTO( status, final ); - status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index)); - CHECK_STATUS_FAIL_GOTO( status, final ); - - fifo_depth = (int32_t)ceil((float)(tile_size[0] * tile_size[1]) / (float)hw_param.streamProcessorExecCount); - max_vector_depth = hw_param.streamProcessorVectorSize; - - spinst = vsi_nn_sp_max_axis2_inst(ctx, fifo_depth, max_vector_depth); - - status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - if (spinst) - { - vsi_nn_release_spinst(&spinst); - } - - return status; -} - -vsi_nn_kernel_node_t vsi_nn_sp_max_axis2_node - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t * input, - vsi_nn_tensor_t * output0, - vsi_nn_tensor_t * output1 - ) -{ - const int32_t spInitInstsNum = 4; - const int32_t spLoopInstsNum = 3; - const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; - - const uint32_t input_count = 1; - const uint32_t output_count = 2; - vx_tensor inputs_tensor[1] = {NULL}; - vx_tensor outputs_tensor[2] = {NULL}; - vx_node node = NULL; - - vsi_nn_spinst_t *spinst = NULL; - vsi_nn_spinst_inst_param sp_insts_param[7]; - vsi_nn_spinst_attr_t attr; - - vsi_status status = VSI_FAILURE; - int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; - uint32_t f32_min = 0xff800000; - float flt_min = *(float*)&f32_min; - float input_scale = vsi_nn_get_tensor_scale(input); - float clamp_min = 0; - float clamp_max = 0; - - memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); - vsi_nn_init_spinst_attr(&attr); - - vsi_nn_get_tensor_clamp_min_max(input, &clamp_min, &clamp_max); - clamp_min = clamp_min * input_scale; - clamp_max = clamp_max * input_scale; - - /* init inst0: r2 = -INF */ - status = vsi_nn_sp_move_constant(&sp_insts_param[0], flt_min, VSI_NN_SP_SR2); - /* init inst1: r10 = 0 */ - status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 0, VSI_NN_SP_SR10); - /* init inst2: r4 = 1 */ - status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 1, VSI_NN_SP_SR4); - /* init inst3: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[3]); - CHECK_STATUS_FAIL_GOTO(status, final); - - /* loop inst0: r1 = clamp(r3 * in, r6, r7) | r2 = v11 + r10 | r9 = r2 */ - status = vsi_nn_sp_mul_clamp(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SRIN, VSI_NN_SP_SR1); - status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR10, VSI_NN_SP_SR2); - status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SR2, VSI_NN_SP_SR9); - /* loop inst1: r8 = r1 * r4 | r5 = r1 - r2 | v11 = r5 ? 
r8 : r9 */ - status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_SR8); - status |= vsi_nn_sp_sub(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR2, VSI_NN_SP_SR5); - status |= vsi_nn_sp_move_sel0(&sp_insts_param[5], VSI_NN_SP_SR5, VSI_NN_SP_SR8, VSI_NN_SP_VR11); - /* loop inst2: out = r1 */ - status |= vsi_nn_sp_move(&sp_insts_param[6], VSI_NN_SP_SR1, VSI_NN_SP_SROUT); - CHECK_STATUS_FAIL_GOTO(status, final ); - - attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE; - attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; - - attr.prog_init_instr_num = spInitInstsNum; - attr.prog_loop_instr_num = spLoopInstsNum; - attr.v11_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; - - attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY; - attr.split_tilex_equal_imgx = TRUE; - attr.split_max_vector_depth = max_vector_depth; - - attr.flush_cycle_num = 7; - - attr.ignored_leading_outputs = 1; - attr.ignored_leading_v11_rd = 5; - attr.ignored_leading_v11_wr = 2; - - attr.num_of_v11_rd_in_flush_cycle = 0; - attr.num_of_v11_wr_in_flush_cycle = 3; - - VSI_NN_SP_ATTR_SET_CONST_TO_SR3(attr, input_scale); - VSI_NN_SP_ATTR_SET_CONST_TO_SR6(attr, clamp_max); - VSI_NN_SP_ATTR_SET_CONST_TO_SR7(attr, clamp_min); - - attr.prog_init_instr_num = spInitInstsNum; - attr.prog_loop_instr_num = spLoopInstsNum; - - spinst = vsi_nn_create_spinst(graph); - CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); - status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); - status |= vsi_nn_set_spinst_attr(spinst, attr); - CHECK_STATUS_FAIL_GOTO(status, final ); - - inputs_tensor[0] = input->t; - outputs_tensor[0] = output0->t; - outputs_tensor[1] = output1->t; - node = vxStreamProcessorNode( - graph->g, - inputs_tensor, - input_count, - outputs_tensor, - output_count, - spinst->sp, - NULL); - -final: - - if (node) - { - vxAssignNodeQueryCallback(node, max_axis2_query); - } - - if (spinst) - { - vsi_nn_release_spinst(&spinst); - } - - return (vsi_nn_kernel_node_t)node; -} - -vsi_nn_spinst_t * vsi_nn_sp_exp_y_direction_inst - ( - vx_context context, - int32_t fifo_depth, - int32_t max_vector_depth - ) -{ - vsi_status status = VSI_FAILURE; - const int32_t spInitInstsNum = 2; - const int32_t spLoopInstsNum = fifo_depth > 3 ? 
4 : 8; - const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; - vsi_nn_spinst_t *spinst = NULL; - vsi_nn_spinst_inst_param sp_insts_param[10]; - vsi_nn_spinst_attr_t attr; - - memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); - vsi_nn_init_spinst_attr(&attr); - - /* init inst0: r8 = 0 */ - status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR8); - /* init inst1: r9 = 1 */ - status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 1, VSI_NN_SP_SR9); - CHECK_STATUS_FAIL_GOTO(status, final); - - if (fifo_depth > 3) - { - /* loop inst0: r2 = in - v11 | v11 = v11 */ - status = vsi_nn_sp_sub(&sp_insts_param[2], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR2); - status |= vsi_nn_sp_move(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_VR11); - /* loop inst1: r8 = v12 * r9 | r7 = r4 + r6 | out = r7 */ - status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_VR12, VSI_NN_SP_SR9, VSI_NN_SP_SR8); - status |= vsi_nn_sp_add(&sp_insts_param[3], VSI_NN_SP_SR4, VSI_NN_SP_SR6, VSI_NN_SP_SR7); - status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SR7, VSI_NN_SP_SROUT); - /* loop inst2: r6 = r5 * r2 | v12 = r7 + r8 | r4 = r3 */ - status |= vsi_nn_sp_mul(&sp_insts_param[4], VSI_NN_SP_SR5, VSI_NN_SP_SR2, VSI_NN_SP_SR6); - status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR7, VSI_NN_SP_SR8, VSI_NN_SP_VR12); - status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SR4); - /* loop inst3: r1 = setup(r2) | r5 = pwlMul * pwlMul | r2 = pwlAdd + pwlAdd | r3 = r1 */ - status |= vsi_nn_sp_pwl_setup0(&sp_insts_param[5], VSI_NN_SP_SR2, VSI_NN_SP_SR1); - status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR5); - status |= vsi_nn_sp_sub(&sp_insts_param[5], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR2); - status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR3); - CHECK_STATUS_FAIL_GOTO(status, final ); - - attr.flush_cycle_num = 18; - - attr.ignored_leading_outputs = 4; - attr.ignored_leading_v11_rd = 0; - attr.ignored_leading_v11_wr = 0; - attr.ignored_leading_v12_rd = fifo_depth + 3; - attr.ignored_leading_v12_wr = 4; - - attr.num_of_v12_rd_in_flush_cycle = 4; - attr.num_of_v12_wr_in_flush_cycle = 5; - } - else - { - /* loop inst0: r2 = in - v11 | v11 = v11 */ - status = vsi_nn_sp_sub(&sp_insts_param[2], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR2); - status |= vsi_nn_sp_move(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_VR11); - /* loop inst1: r6 = r5 * r2 | r4 = r3 */ - status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_SR5, VSI_NN_SP_SR2, VSI_NN_SP_SR6); - status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SR3, VSI_NN_SP_SR4); - /* loop inst2: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[4]); - /* loop inst3: r1 = setup(r2) */ - status |= vsi_nn_sp_pwl_setup0(&sp_insts_param[5], VSI_NN_SP_SR2, VSI_NN_SP_SR1); - /* loop inst4: r8 = v12 * r9 | r7 = r4 + r6 */ - status |= vsi_nn_sp_mul(&sp_insts_param[6], VSI_NN_SP_VR12, VSI_NN_SP_SR9, VSI_NN_SP_SR8); - status |= vsi_nn_sp_add(&sp_insts_param[6], VSI_NN_SP_SR4, VSI_NN_SP_SR6, VSI_NN_SP_SR7); - /* loop inst5: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[7]); - /* loop inst6: r5 = pwlMul * pwlMul | r2 = pwlAdd + pwlAdd | r3 = r1 */ - status |= vsi_nn_sp_mul(&sp_insts_param[8], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR5); - status |= vsi_nn_sp_sub(&sp_insts_param[8], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR2); - status |= vsi_nn_sp_move(&sp_insts_param[8], VSI_NN_SP_SR1, VSI_NN_SP_SR3); - /* loop inst7: v12 = r7 + r8 | out = 
r7 */ - status |= vsi_nn_sp_add(&sp_insts_param[9], VSI_NN_SP_SR7, VSI_NN_SP_SR8, VSI_NN_SP_VR12); - status |= vsi_nn_sp_move(&sp_insts_param[9], VSI_NN_SP_SR7, VSI_NN_SP_SROUT); - CHECK_STATUS_FAIL_GOTO(status, final ); - - attr.ignored_leading_outputs = 1; - attr.ignored_leading_v11_rd = 0; - attr.ignored_leading_v11_wr = 0; - attr.ignored_leading_v12_rd = fifo_depth + 1; - attr.ignored_leading_v12_wr = 1; - - attr.num_of_v12_rd_in_flush_cycle = 2; - attr.num_of_v12_wr_in_flush_cycle = 2; - - attr.flush_cycle_num = 15; - } - - attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE; - attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; - - attr.prog_init_instr_num = spInitInstsNum; - attr.prog_loop_instr_num = spLoopInstsNum; - attr.v12_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; - - attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY; - attr.split_tilex_equal_imgx = TRUE; - attr.split_max_vector_depth = max_vector_depth; - - spinst = vsi_nn_create_spinst_by_context(context); - CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); - status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); - status |= vsi_nn_set_spinst_attr(spinst, attr); - CHECK_STATUS_FAIL_GOTO(status, final ); - -final: - return spinst; -} - -DEF_SP_KERNEL_QUERY(softmax_z_direction_exp_query) - ( - vsi_nn_kernel_node_t node - ) -{ - vsi_status status = VSI_FAILURE; - vx_size index = 0; - vx_size tile_size[2] = {0}; - vsi_nn_spinst_t *spinst = NULL; - int32_t fifo_depth = 0; - int32_t max_vector_depth = 0; - vx_context ctx = vxGetContext((vx_reference)node); - vx_hardware_caps_params_ext2_t hw_param; - - memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t)); - status = vxQueryHardwareCaps(ctx, (vx_hardware_caps_params_t*)(&hw_param), sizeof(vx_hardware_caps_params_ext2_t)); - CHECK_STATUS_FAIL_GOTO( status, final ); - - status = vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size)); - CHECK_STATUS_FAIL_GOTO( status, final ); - status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index)); - CHECK_STATUS_FAIL_GOTO( status, final ); - - fifo_depth = (int32_t)ceil((float)(tile_size[0] * tile_size[1])/ (float)hw_param.streamProcessorExecCount); - max_vector_depth = hw_param.streamProcessorVectorSize; - - spinst = vsi_nn_sp_exp_y_direction_inst(ctx, fifo_depth, max_vector_depth); - - status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - if (spinst) - { - vsi_nn_release_spinst(&spinst); - } - - return status; -} - -vsi_nn_kernel_node_t vsi_nn_sp_softmax_z_direction_exp_node - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t * input0, - vsi_nn_tensor_t * input1, - vsi_nn_tensor_t * output0, - vsi_nn_tensor_t * output1, - float beta - ) -{ - const int32_t spInitInstsNum = 2; - const int32_t spLoopInstsNum = 4; - const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; - - const uint32_t input_count = 2; - const uint32_t output_count = 2; - vx_tensor inputs_tensor[2] = {NULL}; - vx_tensor outputs_tensor[2] = {NULL}; - vx_node node = NULL; - - vsi_nn_spinst_t *spinst = NULL; - vsi_nn_spinst_inst_param sp_insts_param[6]; - vsi_nn_spinst_attr_t attr; - - vsi_nn_sp_lut_params sp_lut_params; - vx_lut_params_s vx_lut_params; - - vsi_status status = VSI_FAILURE; - int32_t fifo_depth = 4; - int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; - - memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); - vsi_nn_init_spinst_attr(&attr); - 
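/* The exponential is evaluated through the PWL lookup table rather than by ALU instructions: vx_lut_params below is programmed with VSI_NN_SP_ACT_LINEAR_EXP, which folds the softmax beta into exp(beta * x). */ -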
memset(&sp_lut_params, 0, sizeof(vsi_nn_sp_lut_params)); - memset(&vx_lut_params, 0, sizeof(vx_lut_params_s)); - - /* init inst0: r8 = 0 */ - status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR8); - /* init inst1: r9 = 1 */ - status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 1, VSI_NN_SP_SR9); - CHECK_STATUS_FAIL_GOTO(status, final); - - /* loop inst0: r2 = in - v11 | v11 = v11 */ - status = vsi_nn_sp_sub(&sp_insts_param[2], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR2); - status |= vsi_nn_sp_move(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_VR11); - /* loop inst1: r8 = v12 * r9 | r7 = r4 + r6 | out = r7 */ - status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_VR12, VSI_NN_SP_SR9, VSI_NN_SP_SR8); - status |= vsi_nn_sp_add(&sp_insts_param[3], VSI_NN_SP_SR4, VSI_NN_SP_SR6, VSI_NN_SP_SR7); - status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SR7, VSI_NN_SP_SROUT); - /* loop inst2: r6 = r5 * r2 | v12 = r7 + r8 | r4 = r3 */ - status |= vsi_nn_sp_mul(&sp_insts_param[4], VSI_NN_SP_SR5, VSI_NN_SP_SR2, VSI_NN_SP_SR6); - status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR7, VSI_NN_SP_SR8, VSI_NN_SP_VR12); - status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SR4); - /* loop inst3: r1 = setup(r2) | r5 = pwlMul * pwlMul | r2 = pwlAdd + pwlAdd | r3 = r1 */ - status |= vsi_nn_sp_pwl_setup0(&sp_insts_param[5], VSI_NN_SP_SR2, VSI_NN_SP_SR1); - status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR5); - status |= vsi_nn_sp_sub(&sp_insts_param[5], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR2); - status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR3); - CHECK_STATUS_FAIL_GOTO(status, final ); - - attr.flush_cycle_num = 18; - - attr.ignored_leading_outputs = 4; - attr.ignored_leading_v11_rd = 0; - attr.ignored_leading_v11_wr = 0; - attr.ignored_leading_v12_rd = fifo_depth + 3; - attr.ignored_leading_v12_wr = 4; - - attr.num_of_v12_rd_in_flush_cycle = 4; - attr.num_of_v12_wr_in_flush_cycle = 5; - - attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE; - attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; - - attr.prog_init_instr_num = spInitInstsNum; - attr.prog_loop_instr_num = spLoopInstsNum; - attr.v12_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; - - attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY; - attr.split_tilex_equal_imgx = TRUE; - attr.split_max_vector_depth = max_vector_depth; - - attr.prog_init_instr_num = spInitInstsNum; - attr.prog_loop_instr_num = spLoopInstsNum; - - spinst = vsi_nn_create_spinst(graph); - CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); - status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); - status |= vsi_nn_set_spinst_attr(spinst, attr); - CHECK_STATUS_FAIL_GOTO(status, final ); - - vx_lut_params.lut_function = VX_NN_ACTIVATION_CUSTOM; - vx_lut_params.in_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE); - vx_lut_params.out_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE); - - sp_lut_params.act_type = VSI_NN_SP_ACT_LINEAR_EXP; - sp_lut_params.params[0] = beta; - sp_lut_params.params[1] = 0; - vsi_nn_sp_lut(vx_lut_params.in_lut, vx_lut_params.out_lut, &sp_lut_params); - - inputs_tensor[0] = input0->t; - inputs_tensor[1] = input1->t; - outputs_tensor[0] = output0->t; - outputs_tensor[1] = output1->t; - node = vxStreamProcessorNode( - graph->g, - inputs_tensor, - input_count, - outputs_tensor, - output_count, - spinst->sp, - &vx_lut_params); - -final: - if (node) - 
{ - vxAssignNodeQueryCallback(node, softmax_z_direction_exp_query); - } - - if (spinst) - { - vsi_nn_release_spinst(&spinst); - } - - if (vx_lut_params.in_lut) - { - vxReleaseLUT(&vx_lut_params.in_lut); - vx_lut_params.in_lut = NULL; - } - - if (vx_lut_params.out_lut) - { - vxReleaseLUT(&vx_lut_params.out_lut); - vx_lut_params.out_lut = NULL; - } - - return (vsi_nn_kernel_node_t)node; -} -vsi_nn_kernel_node_t vsi_nn_sp_rcp_node - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t * input, - vsi_nn_tensor_t * output, - float output_scale - ) -{ - const int32_t spLoopInstsNum = 3; - const int32_t spInstsNum = spLoopInstsNum; - - const uint32_t input_count = 1; - const uint32_t output_count = 1; - vx_tensor inputs_tensor[1] = {NULL}; - vx_tensor outputs_tensor[1] = {NULL}; - vx_node node = NULL; - int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; - - vsi_nn_spinst_t *spinst = NULL; - vsi_nn_spinst_inst_param sp_insts_param[3]; - vsi_nn_spinst_attr_t attr; - - vsi_nn_sp_lut_params sp_lut_params; - vx_lut_params_s vx_lut_params; - - vsi_status status = VSI_FAILURE; - - memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); - vsi_nn_init_spinst_attr(&attr); - memset(&sp_lut_params, 0, sizeof(vsi_nn_sp_lut_params)); - memset(&vx_lut_params, 0, sizeof(vx_lut_params_s)); - - /* loop inst0: r1 = pwlSetup(v12) | r5 = pwlMul() | r2 = pwlAdd() | r8 = r1 */ - status = vsi_nn_sp_pwl_setup0(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_SR1); - status |= vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR5); - status |= vsi_nn_sp_sub(&sp_insts_param[0], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR2); - status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_SR1, VSI_NN_SP_SR8); - /* loop inst1: r6 = r5 * r2 | r7 = r4 + r6 | r4 = r8 */ - status |= vsi_nn_sp_mul(&sp_insts_param[1], VSI_NN_SP_SR5, VSI_NN_SP_SR2, VSI_NN_SP_SR6); - status |= vsi_nn_sp_add(&sp_insts_param[1], VSI_NN_SP_SR4, VSI_NN_SP_SR6, VSI_NN_SP_SR7); - status |= vsi_nn_sp_move(&sp_insts_param[1], VSI_NN_SP_SR8, VSI_NN_SP_SR4); - /* loop inst2: v12 = r7 * r3 */ - status |= vsi_nn_sp_mul(&sp_insts_param[2], VSI_NN_SP_SR7, VSI_NN_SP_SR3, VSI_NN_SP_VR12); - CHECK_STATUS_FAIL_GOTO(status, final ); - - attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE; - - attr.input_setup = VSI_NN_SP_INPUT_SETUP_V12; - attr.prog_loop_instr_num = spLoopInstsNum; - attr.ignored_leading_v12_wr = 4; - attr.ignored_leading_v12_rd = 0; - attr.flush_cycle_num = 14; - - attr.num_of_v12_wr_in_flush_cycle = 5; - - attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_YZ; - attr.split_max_vector_depth = max_vector_depth; - - VSI_NN_SP_ATTR_SET_CONST_TO_SR3(attr, 1.0f / output_scale); - - spinst = vsi_nn_create_spinst(graph); - CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); - status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); - status |= vsi_nn_set_spinst_attr(spinst, attr); - CHECK_STATUS_FAIL_GOTO(status, final ); - - inputs_tensor[0] = input->t; - outputs_tensor[0] = output->t; - - vx_lut_params.lut_function = VX_NN_ACTIVATION_CUSTOM; - vx_lut_params.in_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE); - vx_lut_params.out_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE); - - sp_lut_params.act_type = VSI_NN_SP_ACT_RCP; - vsi_nn_sp_lut(vx_lut_params.in_lut, vx_lut_params.out_lut, &sp_lut_params); - - node = vxStreamProcessorNode( - graph->g, - inputs_tensor, - input_count, - outputs_tensor, - output_count, - spinst->sp, - 
&vx_lut_params); - -final: - if (spinst) - { - vsi_nn_release_spinst(&spinst); - } - - if (vx_lut_params.in_lut) - { - vxReleaseLUT(&vx_lut_params.in_lut); - vx_lut_params.in_lut = NULL; - } - if (vx_lut_params.out_lut) - { - vxReleaseLUT(&vx_lut_params.out_lut); - vx_lut_params.out_lut = NULL; - } - - return (vsi_nn_kernel_node_t)node; -} - -vsi_nn_spinst_t * vsi_nn_sp_times_inst - ( - vx_context context, - int32_t fifo_depth, - int32_t max_vector_depth - ) -{ - vsi_status status = VSI_FAILURE; - const int32_t spInitInstsNum = 0; - const int32_t spLoopInstsNum = fifo_depth > 4 ? 1 : fifo_depth > 1 ? 3 : 5; - const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; - vsi_nn_spinst_t *spinst = NULL; - vsi_nn_spinst_inst_param sp_insts_param[5]; - vsi_nn_spinst_attr_t attr; - - memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); - vsi_nn_init_spinst_attr(&attr); - - if (fifo_depth > 4) - { - /* loop inst0: out = v12 * in | v12 = v12 */ - status = vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_SRIN, VSI_NN_SP_SROUT); - status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_VR12); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if (fifo_depth > 1) - { - /* loop inst0: out = v12 * in | v12 = v12 */ - status = vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_SRIN, VSI_NN_SP_SROUT); - status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_VR12); - /* loop inst1: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[1]); - /* loop inst2: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[2]); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else - { - /* loop inst0: out = v12 * in | v12 = v12 */ - status = vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_SRIN, VSI_NN_SP_SROUT); - status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_VR12); - /* loop inst1: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[1]); - /* loop inst2: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[2]); - /* loop inst3: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[3]); - /* loop inst4: nop */ - status |= vsi_nn_sp_nop(&sp_insts_param[4]); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - - attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE; - attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; - - attr.prog_init_instr_num = spInitInstsNum; - attr.prog_loop_instr_num = spLoopInstsNum; - - attr.flush_cycle_num = 0; - - attr.ignored_leading_outputs = 0; - attr.ignored_leading_v11_rd = 0; - attr.ignored_leading_v11_wr = 0; - - attr.num_of_v11_rd_in_flush_cycle = 0; - attr.num_of_v11_wr_in_flush_cycle = 0; - - attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY; - attr.split_tilex_equal_imgx = TRUE; - attr.split_max_vector_depth = max_vector_depth; - - spinst = vsi_nn_create_spinst_by_context(context); - CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); - status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); - status |= vsi_nn_set_spinst_attr(spinst, attr); - CHECK_STATUS_FAIL_GOTO(status, final ); - -final: - return spinst; -} - -DEF_SP_KERNEL_QUERY(times_query) - ( - vsi_nn_kernel_node_t node - ) -{ - vsi_status status = VSI_FAILURE; - vx_size index = 0; - vx_size tile_size[2] = {0}; - vsi_nn_spinst_t *spinst = NULL; - int32_t fifo_depth = 0; - int32_t max_vector_depth = 0; - vx_context ctx = vxGetContext((vx_reference)node); - vx_hardware_caps_params_ext2_t hw_param; - - memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t)); - status = vxQueryHardwareCaps(ctx, 
(vx_hardware_caps_params_t*)(&hw_param), sizeof(vx_hardware_caps_params_ext2_t)); - CHECK_STATUS_FAIL_GOTO( status, final ); - - status = vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size)); - CHECK_STATUS_FAIL_GOTO( status, final ); - status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index)); - CHECK_STATUS_FAIL_GOTO( status, final ); - - fifo_depth = (int32_t)ceil((float)(tile_size[0] * tile_size[1]) / (float)hw_param.streamProcessorExecCount); - max_vector_depth = hw_param.streamProcessorVectorSize; - - spinst = vsi_nn_sp_times_inst(ctx, fifo_depth, max_vector_depth); - - status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp ); - CHECK_STATUS_FAIL_GOTO( status, final ); - -final: - if (spinst) - { - vsi_nn_release_spinst(&spinst); - } - - return status; -} - -vsi_nn_kernel_node_t vsi_nn_sp_softmax_z_direction_times_node - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t * input0, - vsi_nn_tensor_t * input1, - vsi_nn_tensor_t * output - ) -{ - const uint32_t input_count = 2; - const uint32_t output_count = 1; - vx_tensor inputs_tensor[2] = {NULL, NULL}; - vx_tensor outputs_tensor[1] = {NULL}; - vx_node node = NULL; - int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; - int32_t fifo_depth = 5; - - vsi_nn_spinst_t *spinst = NULL; - - spinst = vsi_nn_sp_times_inst(graph->ctx->c, fifo_depth, max_vector_depth); - - inputs_tensor[0] = input0->t; - inputs_tensor[1] = input1->t; - outputs_tensor[0] = output->t; - node = vxStreamProcessorNode( - graph->g, - inputs_tensor, - input_count, - outputs_tensor, - output_count, - spinst->sp, - NULL); - - if (node) - { - vxAssignNodeQueryCallback(node, times_query); - } - - if (spinst) - { - vsi_nn_release_spinst(&spinst); - } - - return (vsi_nn_kernel_node_t)node; -} - -/* -** This program requires sum operation in the z dimension. -** Instead of using the SUM Engine, the sum needs to be performed -** by Stream Processor instructions. 
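-** -** Roughly, the four SP nodes below compute, per (x, y) position: -** m = max_z(x[z]) -** e[z] = exp(beta * (x[z] - m)) -** out[z] = e[z] * (1 / sum_z(e[z])) -** max_axis2 tracks the running max, the exp node emits e and accumulates its sum, rcp inverts the sum through the SP LUT, and the times node applies it.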
-*/ -vsi_nn_kernel_node_t softmax_z_direction - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - const vsi_nn_kernel_param_t * params - ) -{ - vsi_nn_kernel_node_t node = NULL; - vsi_nn_tensor_attr_t attr; - vsi_nn_tensor_t * dummy_tensor[3] = {NULL}; - vsi_nn_tensor_t * output_tensor[2] = {NULL}; - int32_t axis = 2; - float beta = vsi_nn_kernel_param_get_float32( params, "beta" ); - float output_scale = vsi_nn_get_tensor_scale(outputs[0]); - - memcpy( &attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t) ); - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; - attr.is_const = FALSE; - attr.vtl = TRUE; - attr.is_dummy = TRUE; - attr.size[axis] = 1; - dummy_tensor[0] = vsi_nn_CreateTensor( graph, &attr ); - CHECK_PTR_FAIL_GOTO( dummy_tensor[0], "Create dummy_tensor fail.", final ); - dummy_tensor[1] = vsi_nn_CreateTensor( graph, &attr ); - CHECK_PTR_FAIL_GOTO( dummy_tensor[1], "Create dummy_tensor fail.", final ); - dummy_tensor[2] = vsi_nn_CreateTensor( graph, &attr ); - CHECK_PTR_FAIL_GOTO( dummy_tensor[2], "Create dummy_tensor fail.", final ); - - memcpy( &attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t) ); - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; - attr.is_const = FALSE; - attr.vtl = TRUE; - output_tensor[0] = vsi_nn_CreateTensor( graph, &attr ); - CHECK_PTR_FAIL_GOTO( output_tensor[0], "Create tensor fail.", final ); - output_tensor[1] = vsi_nn_CreateTensor( graph, &attr ); - CHECK_PTR_FAIL_GOTO( output_tensor[1], "Create tensor fail.", final ); - - node = vsi_nn_sp_max_axis2_node(graph, inputs[0], output_tensor[0], dummy_tensor[0]); - CHECK_PTR_FAIL_GOTO( node, "Create sp_max_axis2 fail.", final ); - node = vsi_nn_sp_softmax_z_direction_exp_node(graph, output_tensor[0], dummy_tensor[0], - output_tensor[1], dummy_tensor[1], beta); - CHECK_PTR_FAIL_GOTO( node, "Create exp_y_direction fail.", final ); - node = vsi_nn_sp_rcp_node(graph, dummy_tensor[1], dummy_tensor[2], output_scale); - CHECK_PTR_FAIL_GOTO( node, "Create sp_rcp fail.", final ); - node = vsi_nn_sp_softmax_z_direction_times_node(graph, output_tensor[1], dummy_tensor[2], outputs[0]); - CHECK_PTR_FAIL_GOTO( node, "Create softmax_times fail.", final ); - -final: - vsi_safe_release_tensor(dummy_tensor[0]); - vsi_safe_release_tensor(dummy_tensor[1]); - vsi_safe_release_tensor(dummy_tensor[2]); - vsi_safe_release_tensor(output_tensor[0]); - vsi_safe_release_tensor(output_tensor[1]); - - return node; -} /* softmax_z_direction() */ - -#endif diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index aa47362..aa05c35 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -36,6 +36,7 @@ #include "utils/vsi_nn_math.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_dtype_util.h" +#include "vsi_nn_tensor_util_prv.h" #include "libnnext/vsi_nn_libnnext_resource.h" #if VSI_USE_VXC_BINARY @@ -119,13 +120,6 @@ static void _kernel_clear_source static vsi_bool _check_shader_support(vsi_nn_graph_t* graph); -static vsi_bool _check_stream_process_support - ( - vsi_nn_graph_t* graph, - vsi_nn_tensor_t** inputs, - size_t input_num - ); - vsi_bool vsi_nn_kernel_is_supported_types ( vsi_nn_tensor_t** inputs, @@ -303,6 +297,9 @@ static const uint8_t* _load_internal_executable case VSI_NN_KERNEL_TYPE_CL: return _load_bin( source_name, size, vx_bin_resource_items_cl, vx_bin_resource_items_cl_cnt, 
"_cl" ); + default: + VSILOGE("Unsupported source format %d", type); + break; } #endif return NULL; @@ -321,7 +318,7 @@ static char* _load_source_code_from_file source = NULL; //TODO: Pack new name fp = vsi_nn_fopen( source_name, "rb" ); - if( NULL == fp ) + if ( NULL == fp ) { VSILOGE("Open program file %s fail.", source_name); *size = 0; @@ -330,17 +327,17 @@ static char* _load_source_code_from_file fseek( fp, 0, SEEK_END ); total_bytes = ftell( fp ); fseek( fp, 0, SEEK_SET ); - if( total_bytes == 0 ) + if ( total_bytes == 0 ) { VSILOGE("Program file %s is empty.", source_name); *size = 0; goto final; } source = (char*)malloc( total_bytes + 1 ); - if( source ) + if ( source ) { read_bytes = 0; - while( total_bytes - read_bytes > 0 ) + while ( total_bytes - read_bytes > 0 ) { read_bytes += fread( &source[read_bytes], 1, total_bytes - read_bytes, fp ); } @@ -348,7 +345,11 @@ static char* _load_source_code_from_file *size = read_bytes; } final: - if (fp) fclose( fp ); + if (fp) + { + fclose( fp ); + } + return source; } /* _load_source_code_from_file() */ @@ -1222,7 +1223,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector { uint32_t i; vsi_nn_kernel_type_e type; - vsi_nn_kernel_setup_func_t kernel_func = NULL;; + vsi_nn_kernel_setup_func_t kernel_func = NULL; for( i = 0; i < (uint32_t)selector.allow_kernel_num; i ++ ) { type = selector.pirority[i].kernel_type; @@ -1243,7 +1244,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector /* Skip StreamProcesor if not support */ if( type == VSI_NN_KERNEL_TYPE_SP && - _check_stream_process_support(graph, inputs, input_num) == FALSE ) + vsi_nn_is_stream_process_supported_types(graph, inputs, input_num) == FALSE ) { continue; } @@ -1457,7 +1458,7 @@ vsi_nn_kernel_tensor_attr_t * vsi_nn_kernel_tensor_attr_create attr->scale = attr->asymm.scale; attr->zero_point = attr->asymm.zero_point; } - break; + break; default: attr->scale = 1.0f; break; @@ -1468,21 +1469,21 @@ vsi_nn_kernel_tensor_attr_t * vsi_nn_kernel_tensor_attr_create void vsi_nn_kernel_tensor_attr_release ( vsi_nn_kernel_tensor_attr_t ** p_attr ) { - if( p_attr && *p_attr ) + if ( p_attr && *p_attr ) { vsi_nn_kernel_tensor_attr_t * attr = *p_attr; vsi_size_array_release( &attr->shape ); - if( attr->quant == VSI_NN_KERNEL_QUANT_ASYMM_PERCHANNEL ) + if ( attr->quant == VSI_NN_KERNEL_QUANT_ASYMM_PERCHANNEL ) { vsi_float_array_release( &attr->asymm_v.scale ); vsi_int_array_release( &attr->asymm_v.zero_point ); } - else if( attr->quant == VSI_NN_KERNEL_QUANT_SYMM_PERCHANNEL ) + else if ( attr->quant == VSI_NN_KERNEL_QUANT_SYMM_PERCHANNEL ) { //TODO: } - free( attr ); - *p_attr = NULL; + + vsi_nn_safe_free(*p_attr); } } /* vsi_nn_kernel_tensor_attr_release() */ @@ -1697,29 +1698,3 @@ vsi_bool vsi_nn_kernel_is_supported_types return TRUE; } - -static vsi_bool _check_stream_process_support - ( - vsi_nn_graph_t* graph, - vsi_nn_tensor_t** inputs, - size_t input_num - ) -{ - if ( graph->ctx->config.support_stream_processor == 0 ) - { - return FALSE; - } - - if ( graph->ctx->config.sp_exec_count == 0 ) - { - return FALSE; - } - - if (inputs && input_num > 0 && - inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) - { - return FALSE; - } - - return TRUE; -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c index d78769e..f3a8f4f 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c @@ -75,7 
+75,7 @@ static vsi_size_t element_fill_dim if (size_x == 1) return 0; - if ( size_x < GPU_TENSOR_MAX_WIDTH) + if ( size_x < max_rank) { shape_x[rank_x] = size_x; } @@ -83,9 +83,9 @@ static vsi_size_t element_fill_dim { vsi_size_t divisor = 0; vsi_size_t remainder = 0; - compute_gpu_divisor( size_x, GPU_TENSOR_MAX_WIDTH, 1, &divisor ); + compute_gpu_divisor( size_x, max_rank, 1, &divisor ); remainder = size_x / divisor; - if ( remainder > GPU_TENSOR_MAX_WIDTH || rank_x >= max_rank) + if ( remainder > max_rank || rank_x >= max_rank) { // Cannot optimize. shape_x[rank_x] = size_x; @@ -612,6 +612,41 @@ vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape return TRUE; } +static vsi_bool vsi_nn_kernel_optimize_element_shape_with_max_rank + ( + const vsi_size_t* shape_x, const vsi_size_t rank_x, + vsi_size_t* out_shape_x, vsi_size_t* out_rank_x, vsi_size_t max_rank + ) +{ + vsi_bool ret = TRUE; + uint32_t i = 0; + vsi_size_t rank_in = 0; + vsi_size_t element_num = 1; + + for (i = 0; i < rank_x; i++) + { + element_num *= shape_x[i]; + } + + rank_in += element_fill_dim(out_shape_x, rank_in, max_rank, element_num); + + if ( 0 == rank_in ) + { + out_shape_x[0] = 1; + out_shape_x[1] = 1; + rank_in = 2; + } + else if ( 1 == rank_in ) + { + out_shape_x[1] = 1; + rank_in = 2; + } + + *out_rank_x = (size_t)rank_in; + + return ret; +} /* vsi_nn_kernel_optimize_element_shape() */ + vsi_bool vsi_nn_kernel_optimize_group_norm_shape ( const vsi_size_t* shape, const uint32_t rank, int32_t groups, @@ -622,11 +657,20 @@ vsi_bool vsi_nn_kernel_optimize_group_norm_shape uint32_t i = 0; vsi_size_t out_rank = 0; vsi_size_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t max_rank = GPU_TENSOR_MAX_WIDTH; group_shape[0] = shape[0]; group_shape[1] = shape[1]; group_shape[2] = shape[2] / groups; - vsi_nn_kernel_optimize_element_shape( group_shape, 3, out_shape, &out_rank ); +#define NN_INPUT_SIZE_MAX ((1 << 13) - 1) + if (is_sp_kernel) + { + max_rank = NN_INPUT_SIZE_MAX; + } +#undef NN_INPUT_SIZE_MAX + + vsi_nn_kernel_optimize_element_shape_with_max_rank( group_shape, 3, + out_shape, &out_rank, max_rank); if (!is_sp_kernel && out_shape[1] == 1 && out_rank < 3) { diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c index dfdc3dd..b9f3ff2 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c @@ -221,7 +221,33 @@ static float linear_sigmoid_eval(float x, vsi_nn_kernel_lut_params *lut_param) float a = lut_param->params[0]; float b = lut_param->params[1]; - return 1.0f / (1 + expf(a * x + b));; + return 1.0f / (1 + expf(a * x + b)); +} + +static float atan_eval(float x) +{ + return atanf(x); +} + +static float atanh_eval(float x) +{ + return (log_eval(1 + x) - log_eval(1 - x)) / 2; +} + +static float acosh_eval(float x) +{ + return (log_eval(x + (float)sqrt(x * x - 1))); +} + +static float inverse_sigmoid_eval(float x, vsi_nn_kernel_lut_params *lut_param) +{ + float eps = lut_param->params[0]; + float x1, x2; + x = vsi_nn_clamp(x, 0, 1); + x1 = vsi_nn_clamp(x, eps, 1); + x2 = vsi_nn_clamp((1 - x), eps, 1); + + return log_eval(x1 / x2); } static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *lut_param) @@ -236,35 +262,27 @@ static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params * case VSI_NN_KERNEL_LUT_LOG: result = log_eval(data); break; - break; case VSI_NN_KERNEL_LUT_EXP: result = exp_eval(data); break; - break; case VSI_NN_KERNEL_LUT_SELU: result 
= selu_eval(data, lut_param); break; - break; case VSI_NN_KERNEL_LUT_NEG: result = neg_eval(data); break; - break; case VSI_NN_KERNEL_LUT_HSIGMOID: result = hsigmoid_eval(data, lut_param); break; - break; case VSI_NN_KERNEL_LUT_SOFT_PLUS: result = soft_plus_eval(data); break; - break; case VSI_NN_KERNEL_LUT_ERF: result = erf_eval(data); break; - break; case VSI_NN_KERNEL_LUT_GELU: result = gelu_eval(data); break; - break; case VSI_NN_KERNEL_LUT_HGELU: result = hgelu_eval(data); break; @@ -295,6 +313,18 @@ static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params * case VSI_NN_KERNEL_LUT_LINEAR_SIGMOID: result = linear_sigmoid_eval(data, lut_param); break; + case VSI_NN_KERNEL_LUT_ATAN: + result = atan_eval(data); + break; + case VSI_NN_KERNEL_LUT_ATANH: + result = atanh_eval(data); + break; + case VSI_NN_KERNEL_LUT_ACOSH: + result = acosh_eval(data); + break; + case VSI_NN_KERNEL_LUT_INVERSE_SIGMOID: + result = inverse_sigmoid_eval(data, lut_param); + break; default: VSILOGE( "unsupported activation function:%d", lut_param->act_type ); break; @@ -303,7 +333,7 @@ static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params * return result; } -vsi_status vsi_nn_kernel_lut +vsi_status vsi_nn_kernel_lut_positive ( vx_lut index_lut, vx_lut output_lut, @@ -313,6 +343,7 @@ vsi_status vsi_nn_kernel_lut vsi_status status = VSI_SUCCESS; vsi_nn_kernel_lut_t *lut = NULL; uint32_t i = 0; + float clamp_min = 0; float index[VSI_NN_KERNEL_LUT_MAX_SIZE] = {0}; float value[VSI_NN_KERNEL_LUT_MAX_SIZE] = {0}; @@ -323,32 +354,35 @@ vsi_status vsi_nn_kernel_lut lut = (vsi_nn_kernel_lut_t *)calloc(VSI_NN_KERNEL_LUT_MAX_SIZE, sizeof(vsi_nn_kernel_lut_t)); CHECK_PTR_FAIL_GOTO( lut, "Create LUT buffer fail.", final ); + memset(lut, 0, sizeof(vsi_nn_kernel_lut_t) * VSI_NN_KERNEL_LUT_MAX_SIZE); + + clamp_min = param->clamp_min; for ( i = 0; i < VSI_NN_KERNEL_LUT_MAX_SIZE; i++) { - int16_t val = (int16_t)(i << 6); - lut[i].index = fp16_to_fp32(val); + int16_t val = (int16_t)(i << 5); + float fidx = fp16_to_fp32(val); + + if (val < 0) + { + continue; + } + + if (param->pwl_sign_remove_support && fidx < clamp_min) + { + fidx = clamp_min; + } + + lut[i].index = fidx; lut[i].val = vsi_nn_kernel_lut_activation(lut[i].index, param); } - for (i = 0x0; i < 0x10; i++) - { - lut[i].index = 0; - lut[i].val = vsi_nn_kernel_lut_activation(lut[i].index, param); - } - - for (i = 0x1F0; i < 0x200; i++) + for (i = 992; i < VSI_NN_KERNEL_LUT_MAX_SIZE; i++) { lut[i].index = VSI_NN_KERNEL_LUT_FP16_MAX; lut[i].val = vsi_nn_kernel_lut_activation(lut[i].index, param); } - for (i = 0x3F0; i < 0x400; i++) - { - lut[i].index = VSI_NN_KERNEL_LUT_FP16_MIN; - lut[i].val = vsi_nn_kernel_lut_activation(lut[i].index, param); - } - qsort(lut, VSI_NN_KERNEL_LUT_MAX_SIZE, sizeof(vsi_nn_kernel_lut_t), _comparator); for ( i = 0; i < VSI_NN_KERNEL_LUT_MAX_SIZE; i++) @@ -364,3 +398,118 @@ final: return status; } + +vsi_status vsi_nn_kernel_lut_all + ( + vx_lut index_lut, + vx_lut output_lut, + vsi_nn_kernel_lut_params *param + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_lut_t *lut = NULL; + uint32_t i = 0; + float clamp_min = 0; + float index[VSI_NN_KERNEL_LUT_MAX_SIZE] = {0}; + float value[VSI_NN_KERNEL_LUT_MAX_SIZE] = {0}; + + if (index_lut == NULL || output_lut == NULL || param == NULL) + { + return VSI_FAILURE; + } + + lut = (vsi_nn_kernel_lut_t *)calloc(VSI_NN_KERNEL_LUT_MAX_SIZE, sizeof(vsi_nn_kernel_lut_t)); + CHECK_PTR_FAIL_GOTO( lut, "Create LUT buffer fail.", final ); + memset(lut, 0, 
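+        /* Table layout note: vsi_nn_kernel_lut_positive above strides the
+           fp16 bit patterns by (i << 5), staying in the non-negative half of
+           the int16 range at twice the resolution, while this function
+           strides by (i << 6) so the pattern wraps through the sign bit and
+           the same 1024 entries cover both signs. */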
sizeof(vsi_nn_kernel_lut_t) * VSI_NN_KERNEL_LUT_MAX_SIZE);
+
+    clamp_min = param->clamp_min;
+
+    for ( i = 0; i < VSI_NN_KERNEL_LUT_MAX_SIZE; i++)
+    {
+        int16_t val = (int16_t)(i << 6);
+        float fidx = fp16_to_fp32(val);
+        if (param->pwl_sign_remove_support && fidx < clamp_min)
+        {
+            fidx = clamp_min;
+        }
+
+        lut[i].index = fidx;
+        lut[i].val = vsi_nn_kernel_lut_activation(lut[i].index, param);
+    }
+
+    for (i = 0x0; i < 0x10; i++)
+    {
+        float fidx = 0;
+        if (param->pwl_sign_remove_support && fidx < clamp_min)
+        {
+            fidx = clamp_min;
+        }
+
+        lut[i].index = fidx;
+        lut[i].val = vsi_nn_kernel_lut_activation(lut[i].index, param);
+    }
+
+    for (i = 0x1F0; i < 0x200; i++)
+    {
+        lut[i].index = VSI_NN_KERNEL_LUT_FP16_MAX;
+        lut[i].val = vsi_nn_kernel_lut_activation(lut[i].index, param);
+    }
+
+    for (i = 0x3F0; i < 0x400; i++)
+    {
+        if (param->pwl_sign_remove_support)
+        {
+            lut[i].index = clamp_min;
+        }
+        else
+        {
+            lut[i].index = VSI_NN_KERNEL_LUT_FP16_MIN;
+        }
+
+        lut[i].val = vsi_nn_kernel_lut_activation(lut[i].index, param);
+    }
+
+    qsort(lut, VSI_NN_KERNEL_LUT_MAX_SIZE, sizeof(vsi_nn_kernel_lut_t), _comparator);
+
+    for ( i = 0; i < VSI_NN_KERNEL_LUT_MAX_SIZE; i++)
+    {
+        index[i] = lut[i].index;
+        value[i] = lut[i].val;
+    }
+
+    status = vxCopyLUT(index_lut, (void*)&index, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST);
+    status |= vxCopyLUT(output_lut, (void*)&value, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST);
+final:
+    vsi_nn_safe_free(lut);
+
+    return status;
+}
+
+vsi_status vsi_nn_kernel_lut
+    (
+    vx_lut index_lut,
+    vx_lut output_lut,
+    vsi_nn_kernel_lut_params *param
+    )
+{
+    vsi_status status = VSI_SUCCESS;
+    float clamp_min = 0;
+
+    if (param == NULL)
+    {
+        return VSI_FAILURE;
+    }
+
+    clamp_min = param->clamp_min;
+
+    if (param->pwl_sign_remove_support && clamp_min >= 0)
+    {
+        status = vsi_nn_kernel_lut_positive(index_lut, output_lut, param);
+    }
+    else
+    {
+        status = vsi_nn_kernel_lut_all(index_lut, output_lut, param);
+    }
+
+    return status;
+}
diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c
index d27a5f6..7b0c6ca 100644
--- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c
+++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c
@@ -137,5 +137,9 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(rcp)
REGISTER_VX_FIRST_KERNEL_SELECTOR(softsign)
REGISTER_VX_FIRST_KERNEL_SELECTOR(resize_bilinear)
REGISTER_VX_FIRST_KERNEL_SELECTOR(resize_nearest)
+REGISTER_VX_FIRST_KERNEL_SELECTOR(atan)
+REGISTER_VX_FIRST_KERNEL_SELECTOR(atanh)
+REGISTER_VX_FIRST_KERNEL_SELECTOR(acosh)
+REGISTER_VX_FIRST_KERNEL_SELECTOR(inverse_sigmoid)

__END_DECLS
diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c
index a7cc925..a1680ed 100644
--- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c
+++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c
@@ -821,3 +821,24 @@ vsi_nn_tensor_t* vsi_nn_merge_input_zeropoint_to_bias

    return new_bias;
}
+
+vsi_status vsi_nn_set_sp_kernel_name
+    (
+    vsi_nn_kernel_node_t node,
+    char* kernel_name
+    )
+{
+    vsi_status status = VSI_SUCCESS;
+
+    if (node == NULL || kernel_name == NULL)
+    {
+        return VSI_FAILURE;
+    }
+
+#if VX_STREAM_PROCESSOR_SUPPORT
+    /* pass the full string: sizeof(kernel_name) would only be the pointer size */
+    status = vxSetNodeAttribute((vx_node)node, VX_NODE_SP_NAME, kernel_name, strlen(kernel_name) + 1);
+#endif
+
+    return status;
+}
+
diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c
index fffb3aa..9e299da 100644
--- 
a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c @@ -53,6 +53,8 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_lut_params lut_param; + memset(&lut_param, 0, sizeof(lut_param)); + lut_param.act_type = lut_type; if (lut_type == VSI_NN_KERNEL_LUT_RELU_KERAS) { @@ -74,6 +76,11 @@ static vsi_nn_kernel_node_t _setup { lut_param.params[0] = vsi_nn_kernel_param_get_float32( params, "alpha" ); } + else if (lut_type == VSI_NN_KERNEL_LUT_ACOSH) + { + lut_param.pwl_sign_remove_support = TRUE; + lut_param.clamp_min = 0; + } if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) @@ -148,6 +155,7 @@ REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( clip, VSI_NN_KERNEL_LUT_CLIP ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( celu, VSI_NN_KERNEL_LUT_CELU ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( rcp, VSI_NN_KERNEL_LUT_RCP ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( softsign, VSI_NN_KERNEL_LUT_SOFTSIGN ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( atan, VSI_NN_KERNEL_LUT_ATAN ) #undef REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/avg_pool3d.cl b/src/tim/vx/internal/src/libnnext/ops/cl/avg_pool3d.cl new file mode 100644 index 0000000..960f29f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/avg_pool3d.cl @@ -0,0 +1,160 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + +#define TENSOR_AVG_POOL3D(src_name, dst_name, src_type, dst_type,\ + readimage_type, conv_mode, writeimage_type) \ +__kernel void avg_pool3d_##src_name##to##dst_name ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int ksize_x, \ + int ksize_y, \ + int ksize_z, \ + int stride_x, \ + int stride_y, \ + int stride_z, \ + int pad_left, \ + int pad_top, \ + int pad_front, \ + int width, \ + int height, \ + int depth_in, \ + int depth_out, \ + float inputScale, \ + float inputTail, \ + float outputScale, \ + float outputTail, \ + int count_include_pad) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int offsetz = get_global_id(2); \ + int offsetz2 = offsetz / depth_out * depth_in; \ + int d, d2, h, w, count; \ + float sum = 0; \ + dst_type out_data = (dst_type)(0); \ + src_type in_data; \ + float in_f32, out_f32; \ + int wstart = gidx * stride_x - pad_left; \ + int hstart = gidy * stride_y - pad_top; \ + int wend = min(wstart + ksize_x, width); \ + int hend = min(hstart + ksize_y, height); \ + int dstart, dend; \ + int4 coord_in, coord_out; \ + wstart = max(wstart, 0); \ + hstart = max(hstart, 0); \ + for (d2 = 0; d2 < depth_out; d2++) \ + { \ + dstart = d2 * stride_z - pad_front; \ + dend = min(dstart + ksize_z, depth_in); \ + dstart = max(dstart, 0); \ + coord_out = (int4)(gidx, gidy, offsetz + d2, 0); \ + sum = 0; \ + count = 0; \ + for (d = dstart; d < dend; d++) \ + { \ + for (h = hstart; h < hend; h++) \ + { \ + for (w = wstart; w < wend; w++) \ + { \ + coord_in = (int4)(w, h, d + offsetz2, 0); \ + in_data = readimage_type(input, coord_in).x; \ + in_f32 = convert_float(in_data) * inputScale + inputTail; \ + sum += in_f32; \ + count++; \ + } \ + } \ + } \ + if (count_include_pad == 1) \ + { \ + count = ksize_x * ksize_y * ksize_z; \ + } \ + out_f32 = (sum / count) * outputScale + outputTail; \ + out_data.x = conv_mode(out_f32); \ + writeimage_type(output, coord_out, out_data); \ + } \ +} + +TENSOR_AVG_POOL3D(F32, F32, float, float4, 
read_imagef, convert_float, write_imagef) +TENSOR_AVG_POOL3D(F32, U32, float, uint4, read_imagef, convert_uint, write_imageui) +TENSOR_AVG_POOL3D(F32, I32, float, int4, read_imagef, convert_int, write_imagei) + +TENSOR_AVG_POOL3D(U32, U32, uint, uint4, read_imageui, convert_uint, write_imageui) +TENSOR_AVG_POOL3D(U32, F32, uint, float4, read_imageui, convert_float, write_imagef) +TENSOR_AVG_POOL3D(U32, I32, uint, int4, read_imageui, convert_int, write_imagei) + +TENSOR_AVG_POOL3D(I32, I32, int, int4, read_imagei, convert_int, write_imagei) +TENSOR_AVG_POOL3D(I32, F32, int, float4, read_imagei, convert_float, write_imagef) +TENSOR_AVG_POOL3D(I32, U32, int, uint4, read_imagei, convert_uint, write_imageui) + +__kernel void avg_pool3d_BF16toBF16 ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int ksize_x, + int ksize_y, + int ksize_z, + int stride_x, + int stride_y, + int stride_z, + int pad_left, + int pad_top, + int pad_front, + int width, + int height, + int depth_in, + int depth_out, + float inputScale, + float inputTail, + float outputScale, + float outputTail, + int count_include_pad) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int offsetz = get_global_id(2); + int offsetz2 = offsetz / depth_out * depth_in; + int d, d2, h, w, count; + float sum = 0; + uint4 out_data = (uint4)(0); + uint4 in_data; + float in_f32, out_f32; + int wstart = gidx * stride_x - pad_left; + int hstart = gidy * stride_y - pad_top; + int wend = min(wstart + ksize_x, width); + int hend = min(hstart + ksize_y, height); + int dstart, dend; + int4 coord_in, coord_out; + wstart = max(wstart, 0); + hstart = max(hstart, 0); + for (d2 = 0; d2 < depth_out; d2++) + { + dstart = d2 * stride_z - pad_front; + dend = min(dstart + ksize_z, depth_in); + dstart = max(dstart, 0); + coord_out = (int4)(gidx, gidy, offsetz + d2, 0); + sum = 0; + count = 0; + for (d = dstart; d < dend; d++) + { + for (h = hstart; h < hend; h++) + { + for (w = wstart; w < wend; w++) + { + coord_in = (int4)(w, h, d + offsetz2, 0); + in_data = read_imageui(input, coord_in).x; + in_data = in_data << 16; + _viv_asm(COPY, in_f32, in_data, 16); + sum += in_f32; + count++; + } + } + } + if (count_include_pad == 1) + { + count = ksize_x * ksize_y * ksize_z; + } + out_f32 = sum / count; + _viv_asm(COPY, out_data, out_f32, 4); + out_data.x = out_data.x >> 16; + write_imageui(output, coord_out, out_data); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/bilinear_grid_sample.cl b/src/tim/vx/internal/src/libnnext/ops/cl/bilinear_grid_sample.cl new file mode 100644 index 0000000..0d5d645 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/bilinear_grid_sample.cl @@ -0,0 +1,111 @@ +__kernel void bilinear_grid_sample_F32_F32toF32( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float half_input0_w, + float half_input0_h, + float add_float_value_w, + float add_float_value_h, + int depth + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1)); + int2 coord_add = (int2)(-1, 1); + + float fx = read_imagef(input1, coord_in1).x; + coord_in1.x = coord_in1.x + 1; + float fy = read_imagef(input1, coord_in1).x; + + fx = fx * half_input0_w + add_float_value_w; + fy = fy * half_input0_h + add_float_value_h; + float x_f = floor(fx); + float y_f = floor(fy); + float x_lerp = fx - x_f; + float y_lerp = fy - y_f; + int x_index = 
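+    /* Two-stage lerp, matching the algebra below:
+       top    = top_l + x_lerp * (top_r - top_l)
+       bottom = bottom_l + x_lerp * (bottom_r - bottom_l)
+       dst    = top + y_lerp * (bottom - top)
+       The subtract-then-multiply-add ordering in the loop is this exact
+       factorization. */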
convert_int(x_f); + int y_index = convert_int(y_f); + int4 coord_in = (int4)(x_index, y_index, 0, 0); + + float4 top_l, top_r, bottom_l, bottom_r, top, bottom, dst; + + while (coord_in.z < depth){ + top_l = read_imagef(input0, coord_in); + coord_in.y++; + bottom_l = read_imagef(input0, coord_in); + coord_in.x++; + bottom_r = read_imagef(input0, coord_in); + coord_in.y--; + top_r = read_imagef(input0, coord_in); + top_r = top_r - top_l; + top = top_l + x_lerp * top_r; + bottom_r = bottom_r - bottom_l; + bottom = bottom_l + x_lerp * bottom_r; + bottom = bottom - top; + dst = top + y_lerp * bottom; + write_imagef(output, coord_out, dst); + coord_in.xz = coord_in.xz + coord_add; + coord_out.z++; + } +} + + +__kernel void bilinear_grid_sample_U8_U8toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float half_input0_w, + float half_input0_h, + float add_float_value_w, + float add_float_value_h, + int depth, + float in0_scale, + float in0_tail, + float in1_scale, + float in1_tail, + float out_scale, + float out_tail + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1)); + int2 coord_add = (int2)(-1, 1); + + float fx = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail; + coord_in1.x = coord_in1.x + 1; + float fy = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail; + + fx = fx * half_input0_w + add_float_value_w; + fy = fy * half_input0_h + add_float_value_h; + float x_f = floor(fx); + float y_f = floor(fy); + float x_lerp = fx - x_f; + float y_lerp = fy - y_f; + int x_index = convert_int(x_f); + int y_index = convert_int(y_f); + int4 coord_in = (int4)(x_index, y_index, 0, 0); + + float4 top_l, top_r, bottom_l, bottom_r, top, bottom; + uint4 dst; + + while (coord_in.z < depth){ + top_l = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail; + coord_in.y++; + bottom_l = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail; + coord_in.x++; + bottom_r = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail; + coord_in.y--; + top_r = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail; + top_r = top_r - top_l; + top = top_l + x_lerp * top_r; + bottom_r = bottom_r - bottom_l; + bottom = bottom_l + x_lerp * bottom_r; + bottom = bottom - top; + top = top + y_lerp * bottom; + dst = convert_uint4_rte(top * out_scale + out_tail); + write_imageui(output, coord_out, dst); + coord_in.xz = coord_in.xz + coord_add; + coord_out.z++; + } + +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl index 0372981..de65186 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl @@ -67,96 +67,101 @@ __kernel void cumsum_F32toF32_axis2( } } -__kernel void cumsum_U8toU8_axis2( - __read_only image2d_array_t input, - __write_only image2d_array_t output, - int axis, - int exclusive, - int rev, - int width, - int height, - int channel, - int input_zp, - float in_out_scale, - float in_out_zp_scale, - float output_zp - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - int4 coord_out = coord; - - uint4 sum = (uint4)(0); - uint4 dst = (uint4)(0); - - float cnt = 0.0f; - - if(exclusive && rev) - { - coord_out.z = channel - 1; - write_imageui(output, coord_out, dst); - for(coord.z = 
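+/* Requantization sketch (assuming the host derives the parameters this way):
+   with u8 inputs q[k] = x[k]/s_in + zp_in, the float prefix sum after n
+   elements is s_in * (sum_q - n * zp_in), so the u8 result is
+       dst = sum_q * (s_in/s_out) + n * (-zp_in * s_in/s_out) + zp_out,
+   which is exactly sum.x * in_out_scale + cnt * in_out_zp_scale + output_zp
+   in the kernels of this file. */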
channel - 1; coord.z > 0; coord.z--) - { - uint4 data = read_imageui(input, coord); - coord_out.z--; - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord_out, dst); - } - } - else if(exclusive) - { - coord_out.z = 0; - write_imageui(output, coord_out, dst); - for(coord.z = 0; coord.z < channel - 1; coord.z++) - { - uint4 data = read_imageui(input, coord); - coord_out.z++; - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord_out, dst); - } - } - else if(rev) - { - for(coord.z = channel - 1; coord.z >= 0; coord.z--) - { - uint4 data = read_imageui(input, coord); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord, dst); - } - } - else - { - for(coord.z = 0; coord.z < channel; coord.z++) - { - uint4 data = read_imageui(input, coord); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord, dst); - } - } +#define CUMSUM_toU8_AXIS2_SH(name, src_type, read_image_type) \ +__kernel void cumsum_##name##toU8_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, \ + int exclusive, \ + int rev, \ + int width, \ + int height, \ + int channel, \ + int input_zp, \ + float in_out_scale, \ + float in_out_zp_scale, \ + float output_zp \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_out = coord; \ + \ + src_type sum = (src_type)(0); \ + uint4 dst = (uint4)(0); \ + \ + float cnt = 0.0f; \ + \ + if(exclusive && rev) \ + { \ + coord_out.z = channel - 1; \ + write_imageui(output, coord_out, dst); \ + for(coord.z = channel - 1; coord.z > 0; coord.z--) \ + { \ + src_type data = read_image_type(input, coord); \ + coord_out.z--; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = (uint)convert_int_rte(tmpSum); \ + write_imageui(output, coord_out, dst); \ + } \ + } \ + else if(exclusive) \ + { \ + coord_out.z = 0; \ + write_imageui(output, coord_out, dst); \ + for(coord.z = 0; coord.z < channel - 1; coord.z++) \ + { \ + src_type data = read_image_type(input, coord); \ + coord_out.z++; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = (uint)convert_int_rte(tmpSum); \ + write_imageui(output, coord_out, dst); \ + } \ + } \ + else if(rev) \ + { \ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \ + { \ + src_type data = read_image_type(input, coord); \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = (uint)convert_int_rte(tmpSum); \ + write_imageui(output, coord, dst); \ + } \ + } \ + else \ + { \ + for(coord.z = 0; coord.z < channel; coord.z++) \ + { \ + src_type data = read_image_type(input, coord); \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * 
in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = (uint)convert_int_rte(tmpSum); \ + write_imageui(output, coord, dst); \ + } \ + } \ } +CUMSUM_toU8_AXIS2_SH(U8,uint4,read_imageui) +CUMSUM_toU8_AXIS2_SH(F32,float4,read_imagef) + + __kernel void cumsum_F32toF32_axis1( __read_only image2d_array_t input, @@ -226,97 +231,101 @@ __kernel void cumsum_F32toF32_axis1( } } -__kernel void cumsum_U8toU8_axis1( - __read_only image2d_array_t input, - __write_only image2d_array_t output, - int axis, - int exclusive, - int rev, - int width, - int height, - int channel, - int input_zp, - float in_out_scale, - float in_out_zp_scale, - float output_zp - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - int4 coord_out = coord; - - uint4 sum = (uint4)(0); - uint4 dst = (uint4)(0); - - float cnt = 0; - - if(exclusive && rev) - { - coord_out.y = height - 1; - write_imageui(output, coord_out, dst); - - for(coord.y = height - 1; coord.y > 0; coord.y--) - { - uint4 data = read_imageui(input, coord); - cnt += 1.0f; - coord_out.y--; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord_out, dst); - } - } - else if(exclusive) - { - coord_out.y = 0; - write_imageui(output, coord_out, dst); - for(coord.y = 0; coord.y < height - 1; coord.y++) - { - uint4 data = read_imageui(input, coord); - cnt += 1.0f; - coord_out.y++; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord_out, dst); - } - } - else if(rev) - { - for(coord.y = height - 1; coord.y >= 0; coord.y--) - { - uint4 data = read_imageui(input, coord); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord, dst); - } - } - else - { - for(coord.y = 0; coord.y < height; coord.y++) - { - uint4 data = read_imageui(input, coord); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord, dst); - } - } +#define CUMSUM_toU8_AXIS1_SH(name, src_type, read_image_type) \ +__kernel void cumsum_##name##toU8_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, \ + int exclusive, \ + int rev, \ + int width, \ + int height, \ + int channel, \ + int input_zp, \ + float in_out_scale, \ + float in_out_zp_scale, \ + float output_zp \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_out = coord; \ + \ + src_type sum = (src_type)(0); \ + uint4 dst = (uint4)(0); \ + \ + float cnt = 0; \ + \ + if(exclusive && rev) \ + { \ + coord_out.y = height - 1; \ + write_imageui(output, coord_out, dst); \ + \ + for(coord.y = height - 1; coord.y > 0; coord.y--) \ + { \ + src_type data = read_image_type(input, coord); \ + cnt += 1.0f; \ + coord_out.y--; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = (uint)convert_int_rte(tmpSum); \ + write_imageui(output, coord_out, dst); \ + } \ + } \ + else if(exclusive) \ + { \ + coord_out.y = 0; \ + 
write_imageui(output, coord_out, dst); \ + for(coord.y = 0; coord.y < height - 1; coord.y++) \ + { \ + src_type data = read_image_type(input, coord); \ + cnt += 1.0f; \ + coord_out.y++; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = (uint)convert_int_rte(tmpSum); \ + write_imageui(output, coord_out, dst); \ + } \ + } \ + else if(rev) \ + { \ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \ + { \ + src_type data = read_image_type(input, coord); \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = (uint)convert_int_rte(tmpSum); \ + write_imageui(output, coord, dst); \ + } \ + } \ + else \ + { \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + src_type data = read_image_type(input, coord); \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = (uint)convert_int_rte(tmpSum); \ + write_imageui(output, coord, dst); \ + } \ + } \ } +CUMSUM_toU8_AXIS1_SH(U8,uint4,read_imageui) +CUMSUM_toU8_AXIS1_SH(F32,float4,read_imagef) + __kernel void cumsum_F32toF32_axis0( __read_only image2d_array_t input, @@ -386,93 +395,96 @@ __kernel void cumsum_F32toF32_axis0( } } -__kernel void cumsum_U8toU8_axis0( - __read_only image2d_array_t input, - __write_only image2d_array_t output, - int axis, - int exclusive, - int rev, - int width, - int height, - int channel, - int input_zp, - float in_out_scale, - float in_out_zp_scale, - float output_zp - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - int4 coord_out = coord; - - uint4 sum = (uint4)(0); - uint4 dst = (uint4)(0); - - float cnt = 0; - - if(exclusive && rev) - { - coord_out.x = width - 1; - write_imageui(output, coord_out, dst); - for(coord.x = width - 1; coord.x > 0; coord.x--) - { - uint4 data = read_imageui(input, coord); - coord_out.x--; - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord_out, dst); - } - } - else if(exclusive) - { - coord_out.x = 0; - write_imageui(output, coord_out, dst); - for(coord.x = 0; coord.x < width - 1; coord.x++) - { - uint4 data = read_imageui(input, coord); - coord_out.x++; - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord_out, dst); - } - } - else if(rev) - { - for(coord.x = width - 1; coord.x >= 0; coord.x--) - { - uint4 data = read_imageui(input, coord); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord, dst); - } - } - else - { - for(coord.x = 0; coord.x < width; coord.x++) - { - uint4 data = read_imageui(input, coord); - cnt += 1.0f; - sum += data; - - float tmpAlpha = cnt * in_out_zp_scale + output_zp; - float tmpSum = sum.x * in_out_scale + tmpAlpha; - - dst.x = (uint)convert_int_rte(tmpSum); - write_imageui(output, coord, dst); - } - } +#define CUMSUM_toU8_AXIS0_SH(name, src_type, read_image_type) \ +__kernel void cumsum_##name##toU8_axis0( \ + __read_only image2d_array_t input, \ + 
__write_only image2d_array_t output, \ + int axis, \ + int exclusive, \ + int rev, \ + int width, \ + int height, \ + int channel, \ + int input_zp, \ + float in_out_scale, \ + float in_out_zp_scale, \ + float output_zp \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_out = coord; \ + \ + src_type sum = (src_type)(0); \ + uint4 dst = (uint4)(0); \ + \ + float cnt = 0; \ + \ + if(exclusive && rev) \ + { \ + coord_out.x = width - 1; \ + write_imageui(output, coord_out, dst); \ + for(coord.x = width - 1; coord.x > 0; coord.x--) \ + { \ + src_type data = read_image_type(input, coord); \ + coord_out.x--; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = (uint)convert_int_rte(tmpSum); \ + write_imageui(output, coord_out, dst); \ + } \ + } \ + else if(exclusive) \ + { \ + coord_out.x = 0; \ + write_imageui(output, coord_out, dst); \ + for(coord.x = 0; coord.x < width - 1; coord.x++) \ + { \ + src_type data = read_image_type(input, coord); \ + coord_out.x++; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = (uint)convert_int_rte(tmpSum); \ + write_imageui(output, coord_out, dst); \ + } \ + } \ + else if(rev) \ + { \ + for(coord.x = width - 1; coord.x >= 0; coord.x--) \ + { \ + src_type data = read_image_type(input, coord); \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = (uint)convert_int_rte(tmpSum); \ + write_imageui(output, coord, dst); \ + } \ + } \ + else \ + { \ + for(coord.x = 0; coord.x < width; coord.x++) \ + { \ + src_type data = read_image_type(input, coord); \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \ + \ + dst.x = (uint)convert_int_rte(tmpSum); \ + write_imageui(output, coord, dst); \ + } \ + } \ } +CUMSUM_toU8_AXIS0_SH(U8,uint4,read_imageui) +CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl index caced34..5fec847 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl @@ -154,6 +154,95 @@ __kernel void cumsum_U8toU8_axis1_2D( } } +__kernel void cumsum_F32toU8_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + float4 sum = (float4)(0); + uint4 dst = (uint4)(0); + + float cnt = 0; + + if(exclusive && rev) + { + coord.w = height - 1; + write_imageui(output, coord.zw, dst); + for(coord.y = height - 1; coord.y > 0; coord.y--) + { + float4 data = read_imagef(input, coord.xy); + cnt += 1.0f; + coord.w--; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.zw, dst); + } + } + else if(exclusive) + { + write_imageui(output, coord.zw, dst); + for(coord.y = 0; coord.y < 
height - 1; coord.y++) + { + float4 data = read_imagef(input, coord.xy); + cnt += 1.0f; + coord.w++; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.zw, dst); + } + } + else if(rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + float4 data = read_imagef(input, coord.xy); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.xy, dst); + } + } + else + { + for(coord.y = 0; coord.y < height; coord.y++) + { + float4 data = read_imagef(input, coord.xy); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.xy, dst); + } + } +} + __kernel void cumsum_F32toF32_axis0_2D( __read_only image2d_t input, __write_only image2d_t output, @@ -312,3 +401,94 @@ __kernel void cumsum_U8toU8_axis0_2D( } } } + +__kernel void cumsum_F32toU8_axis0_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + float4 sum = (float4)(0); + uint4 dst = (uint4)(0); + + float cnt = 0.0f; + + if(exclusive && rev) + { + coord.x = width - 1; + coord.z = coord.x; + write_imageui(output, coord.zw, dst); + for(; coord.x > 0; coord.x--) + { + float4 data = read_imagef(input, coord.xy); + coord.z--; + cnt += 1.0; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.zw, dst); + } + } + else if(exclusive) + { + coord.z = 0; + write_imageui(output, coord.zw, dst); + for(coord.x = 0; coord.x < width - 1; coord.x++) + { + float4 data = read_imagef(input, coord.xy); + cnt += 1.0f; + coord.z++; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.zw, dst); + } + } + else if(rev) + { + for(coord.x = width - 1; coord.x >= 0; coord.x--) + { + float4 data = read_imagef(input, coord.xy); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.xy, dst); + } + } + else + { + for(coord.x = 0; coord.x < width; coord.x++) + { + float4 data = read_imagef(input, coord.xy); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.xy, dst); + } + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl index 65be20e..e836a48 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl @@ -151,6 +151,32 @@ float eltwise_unary_softsign(float val, float alpha, float 
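+/* The four helpers added below keep the (value, alpha, beta) contract of the
+   existing eltwise_unary_* evaluators; alpha and beta are ignored except by
+   inverse_sigmoid, which uses alpha as the eps floor of its logit. */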
rcp_alpha)
    return val / (1.0f + fabs(val));
}

+float eltwise_unary_atan(float x, float alpha, float beta)
+{
+    return atan(x);
+}
+
+float eltwise_unary_atanh(float x, float alpha, float beta)
+{
+    return atanh(x);
+}
+
+float eltwise_unary_acosh(float x, float alpha, float beta)
+{
+    return acosh(x);
+}
+
+float eltwise_unary_inverse_sigmoid(float x, float alpha, float beta)
+{
+    float x1, x2;
+    x = clamp(x, 0.0f, 1.0f);
+    x1 = x > alpha ? x : alpha;
+    x2 = 1 - x;
+    x2 = x2 > alpha ? x2 : alpha;
+    return log(x1 / x2);
+}
+
+
#define ELTWISE_UNARY_F32_2D(func_name) \
__kernel void func_name##_F32toF32_2D \
    ( \
@@ -188,6 +214,10 @@ ELTWISE_UNARY_F32_2D(celu)
ELTWISE_UNARY_F32_2D(rcp)
ELTWISE_UNARY_F32_2D(sign)
ELTWISE_UNARY_F32_2D(softsign)
+ELTWISE_UNARY_F32_2D(atan)
+ELTWISE_UNARY_F32_2D(atanh)
+ELTWISE_UNARY_F32_2D(acosh)
+ELTWISE_UNARY_F32_2D(inverse_sigmoid)

#define ELTWISE_UNARY_U8_2D(func_name) \
__kernel void func_name##_U8toU8_2D \
@@ -227,6 +257,52 @@ ELTWISE_UNARY_U8_2D(celu)
ELTWISE_UNARY_U8_2D(rcp)
ELTWISE_UNARY_U8_2D(sign)
ELTWISE_UNARY_U8_2D(softsign)
+ELTWISE_UNARY_U8_2D(atan)
+ELTWISE_UNARY_U8_2D(atanh)
+ELTWISE_UNARY_U8_2D(acosh)
+ELTWISE_UNARY_U8_2D(inverse_sigmoid)
+
+#define ELTWISE_UNARY_U8toF32_2D(func_name) \
+__kernel void func_name##_U8toF32_2D \
+    ( \
+    __read_only  image2d_t input, \
+    __write_only image2d_t output, \
+    float inputScale, \
+    float inputTail, \
+    float outputScale, \
+    float outputZP, \
+    float alpha, \
+    float beta \
+    ) \
+{ \
+    int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
+ \
+    uint4 src = read_imageui(input, coord); \
+    float4 dst = convert_float4(src) * inputScale - inputTail; \
+ \
+    dst.x = eltwise_unary_##func_name(dst.x, alpha, beta); \
+ \
+    write_imagef(output, coord, dst); \
+}
+ELTWISE_UNARY_U8toF32_2D(sin)
+ELTWISE_UNARY_U8toF32_2D(cos)
+ELTWISE_UNARY_U8toF32_2D(exp)
+ELTWISE_UNARY_U8toF32_2D(log)
+ELTWISE_UNARY_U8toF32_2D(neg)
+ELTWISE_UNARY_U8toF32_2D(mish)
+ELTWISE_UNARY_U8toF32_2D(hard_sigmoid)
+ELTWISE_UNARY_U8toF32_2D(round)
+ELTWISE_UNARY_U8toF32_2D(gelu)
+ELTWISE_UNARY_U8toF32_2D(hard_gelu)
+ELTWISE_UNARY_U8toF32_2D(selu)
+ELTWISE_UNARY_U8toF32_2D(celu)
+ELTWISE_UNARY_U8toF32_2D(rcp)
+ELTWISE_UNARY_U8toF32_2D(sign)
+ELTWISE_UNARY_U8toF32_2D(softsign)
+ELTWISE_UNARY_U8toF32_2D(atan)
+ELTWISE_UNARY_U8toF32_2D(atanh)
+ELTWISE_UNARY_U8toF32_2D(acosh)
+ELTWISE_UNARY_U8toF32_2D(inverse_sigmoid)

__kernel void neg_I32toI32_2D
    (
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl
index 5a21ad8..2adf398 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl
@@ -151,6 +151,30 @@ float eltwise_unary_softsign(float val, float alpha, float rcp_alpha)
    return val / (1.0f + fabs(val));
}

+float eltwise_unary_atan(float x, float alpha, float beta)
+{
+    return atan(x);
+}
+
+float eltwise_unary_atanh(float x, float alpha, float beta)
+{
+    return atanh(x);
+}
+float eltwise_unary_acosh(float x, float alpha, float beta)
+{
+    return acosh(x);
+}
+
+float eltwise_unary_inverse_sigmoid(float x, float alpha, float beta)
+{
+    float x1, x2;
+    x = clamp(x, 0.0f, 1.0f);
+    x1 = x > alpha ? x : alpha;
+    x2 = 1 - x;
+    x2 = x2 > alpha ?
x2 : alpha; + return log(x1 / x2); +} + #define ELTWISE_UNARY_F32(func_name) \ __kernel void func_name##_F32toF32 \ ( \ @@ -188,6 +212,10 @@ ELTWISE_UNARY_F32(celu) ELTWISE_UNARY_F32(rcp) ELTWISE_UNARY_F32(sign) ELTWISE_UNARY_F32(softsign) +ELTWISE_UNARY_F32(atan) +ELTWISE_UNARY_F32(atanh) +ELTWISE_UNARY_F32(acosh) +ELTWISE_UNARY_F32(inverse_sigmoid) #define ELTWISE_UNARY_U8(func_name) \ __kernel void func_name##_U8toU8 \ @@ -227,6 +255,52 @@ ELTWISE_UNARY_U8(celu) ELTWISE_UNARY_U8(rcp) ELTWISE_UNARY_U8(sign) ELTWISE_UNARY_U8(softsign) +ELTWISE_UNARY_U8(atan) +ELTWISE_UNARY_U8(atanh) +ELTWISE_UNARY_U8(acosh) +ELTWISE_UNARY_U8(inverse_sigmoid) + +#define ELTWISE_UNARY_U8toF32(func_name) \ +__kernel void func_name##_U8toF32 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float inputScale, \ + float inputTail, \ + float outputScale, \ + float outputZP, \ + float alpha, \ + float beta \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + uint4 src = read_imageui(input, coord); \ + float4 dst = convert_float4(src) * inputScale - inputTail; \ + \ + dst.x = eltwise_unary_##func_name(dst.x, alpha, beta); \ + \ + write_imagef(output, coord, dst); \ +} +ELTWISE_UNARY_U8toF32(sin) +ELTWISE_UNARY_U8toF32(cos) +ELTWISE_UNARY_U8toF32(exp) +ELTWISE_UNARY_U8toF32(log) +ELTWISE_UNARY_U8toF32(neg) +ELTWISE_UNARY_U8toF32(mish) +ELTWISE_UNARY_U8toF32(hard_sigmoid) +ELTWISE_UNARY_U8toF32(round) +ELTWISE_UNARY_U8toF32(gelu) +ELTWISE_UNARY_U8toF32(hard_gelu) +ELTWISE_UNARY_U8toF32(selu) +ELTWISE_UNARY_U8toF32(celu) +ELTWISE_UNARY_U8toF32(rcp) +ELTWISE_UNARY_U8toF32(sign) +ELTWISE_UNARY_U8toF32(softsign) +ELTWISE_UNARY_U8toF32(atan) +ELTWISE_UNARY_U8toF32(atanh) +ELTWISE_UNARY_U8toF32(acosh) +ELTWISE_UNARY_U8toF32(inverse_sigmoid) __kernel void neg_I32toI32 ( diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather_array.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather_array.cl new file mode 100644 index 0000000..15a4664 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather_array.cl @@ -0,0 +1,111 @@ +__kernel void gather_array_U8toU8( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num, + int indices_num, + int batch + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + int4 indice = read_imagei(input1, coord_in.xy); + coord_in.w = gidz * axis_num + indice.x; + + Image img1 = create_image_from_image2d(input0, 1); + Image img2 = create_image_from_image2d(output, 1); + __global uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); + uchar data = input_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + __global uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + output_ptr[0] = data; +} + +__kernel void gather_array_F16toF16( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num, + int indices_num, + int batch + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + int4 indice = read_imagei(input1, coord_in.xy); + coord_in.w = gidz * axis_num + indice.x; + + Image img1 = create_image_from_image2d(input0, 2); + Image img2 = 
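+    /* create_image_from_image2d(img, n) wraps an image for raw pointer access
+       with an n-byte element stride (1 for U8, 2 for F16, 4 for I32/F32), and
+       get_image_ptr_from_coord() resolves an (x, y) coordinate to a __global
+       pointer, so one element moves without a read_image/write_image pair. */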
create_image_from_image2d(output, 2); + __global short* input_ptr = (__global short*)get_image_ptr_from_coord(img1, coord_in.zw); + short data = input_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + __global short* output_ptr = (__global short*)get_image_ptr_from_coord(img2, coord); + output_ptr[0] = data; +} + +__kernel void gather_array_I32toI32( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num, + int indices_num, + int batch + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + int4 indice = read_imagei(input1, coord_in.xy); + coord_in.w = gidz * axis_num + indice.x; + + Image img1 = create_image_from_image2d(input0, 4); + Image img2 = create_image_from_image2d(output, 4); + __global int* input_ptr = (__global int*)get_image_ptr_from_coord(img1, coord_in.zw); + int data = input_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + __global int* output_ptr = (__global int*)get_image_ptr_from_coord(img2, coord); + output_ptr[0] = data; +} + +__kernel void gather_array_F32toF32( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num, + int indices_num, + int batch + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + int4 indice = read_imagei(input1, coord_in.xy); + coord_in.w = gidz * axis_num + indice.x; + + Image img1 = create_image_from_image2d(input0, 4); + Image img2 = create_image_from_image2d(output, 4); + __global float* input_ptr = (__global float*)get_image_ptr_from_coord(img1, coord_in.zw); + float data = input_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + __global float* output_ptr = (__global float*)get_image_ptr_from_coord(img2, coord); + output_ptr[0] = data; +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd_batch.cl b/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd_batch.cl new file mode 100644 index 0000000..02e4309 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/gather_nd_batch.cl @@ -0,0 +1,124 @@ +__kernel void gather_nd_batch_U8toU8_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // batch_num + + int4 coord = (int4)(gidx, gidy, 0, 0); + int4 indice = read_imagei(input1, coord.wy); + coord.z = indice.x * block_size + gidx; + + uint4 data = read_imageui(input0, coord.zy); + write_imageui(output, coord.xy, data); +} + +__kernel void gather_nd_batch_F16toF16_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // batch_num + + int4 coord = (int4)(gidx, gidy, 0, 0); + int4 indice = read_imagei(input1, coord.wy); + coord.z = indice.x * block_size + gidx; + + float4 data = read_imagef(input0, coord.zy); + write_imagef(output, coord.xy, data); +} + +__kernel void gather_nd_batch_I8toI8_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int 
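+    /* For these *_1D kernels coord_dim is 1: each batch row of input1 holds a
+       single index, and coord.z = indice.x * block_size + gidx linearizes
+       (index, element) into the flattened x axis of input0. */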
coord_dim
+    )
+{
+    int gidx = get_global_id(0);  // block_size
+    int gidy = get_global_id(1);  // batch_num
+
+    int4 coord = (int4)(gidx, gidy, 0, 0);
+    int4 indice = read_imagei(input1, coord.wy);
+    coord.z = indice.x * block_size + gidx;
+
+    int4 data = read_imagei(input0, coord.zy);
+    write_imagei(output, coord.xy, data);
+}
+
+//2D
+__kernel void gather_nd_batch_U8toU8_2D(
+    __read_only image2d_array_t input0,
+    __read_only image2d_t input1,
+    __write_only image2d_t output,
+    int block_size,
+    int coord_dim
+    )
+{
+    int gidx = get_global_id(0);  // block_size
+    int gidy = get_global_id(1);  // batch_num
+
+    int4 coord = (int4)(0, gidy, gidx, 1);
+    int4 indice = read_imagei(input1, coord.xy);
+    int4 indice1 = read_imagei(input1, coord.wy);
+    indice.x = indice.x * block_size + gidx;
+    indice.y = indice1.x;
+    indice.zw = coord.yx;
+
+    uint4 data = read_imageui(input0, indice);
+    write_imageui(output, coord.zy, data);
+}
+
+__kernel void gather_nd_batch_F16toF16_2D(
+    __read_only image2d_array_t input0,
+    __read_only image2d_t input1,
+    __write_only image2d_t output,
+    int block_size,
+    int coord_dim
+    )
+{
+    int gidx = get_global_id(0);  // block_size
+    int gidy = get_global_id(1);  // batch_num
+
+    int4 coord = (int4)(0, gidy, gidx, 1);
+    int4 indice = read_imagei(input1, coord.xy);
+    int4 indice1 = read_imagei(input1, coord.wy);
+    indice.x = indice.x * block_size + gidx;
+    indice.y = indice1.x;
+    indice.zw = coord.yx;
+
+    float4 data = read_imagef(input0, indice);
+    write_imagef(output, coord.zy, data);
+}
+
+__kernel void gather_nd_batch_I8toI8_2D(
+    __read_only image2d_array_t input0,
+    __read_only image2d_t input1,
+    __write_only image2d_t output,
+    int block_size,
+    int coord_dim
+    )
+{
+    int gidx = get_global_id(0);  // block_size
+    int gidy = get_global_id(1);  // batch_num
+
+    int4 coord = (int4)(0, gidy, gidx, 1);
+    int4 indice = read_imagei(input1, coord.xy);
+    int4 indice1 = read_imagei(input1, coord.wy);
+    indice.x = indice.x * block_size + gidx;
+    indice.y = indice1.x;
+    indice.zw = coord.yx;
+
+    int4 data = read_imagei(input0, indice);
+    write_imagei(output, coord.zy, data);
+}
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/globallppool.cl b/src/tim/vx/internal/src/libnnext/ops/cl/globallppool.cl
new file mode 100644
index 0000000..30771a9
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/globallppool.cl
@@ -0,0 +1,89 @@
+
+#define GLOBALLPPOOL_PROCESS(src_type, dst_type, readimage_type, conv_mode, writeimage_type) \
+    int gidx = get_global_id(0); \
+    int4 coord_out = (int4)(0, 0, gidx, 0); \
+    int4 coord_in = coord_out; \
+    int h, w; \
+    float sum_of_pow = 0; \
+    dst_type out_data = (dst_type)(0); \
+    src_type in_data; \
+    float in_f32, out_f32; \
+    for (h = 0; h < height; h++) \
+    { \
+        for (w = 0; w < width; w++) \
+        { \
+            coord_in.xy = (int2)(w, h); \
+            in_data = readimage_type(input, coord_in).x; \
+            in_f32 = convert_float(in_data) * inputScale + inputTail; \
+            sum_of_pow += pow(fabs(in_f32),p); \
+        } \
+    } \
+    out_f32 = pow(sum_of_pow, 1.0f / p) * outputScale + outputTail; \
+    out_data.x = conv_mode(out_f32); \
+    writeimage_type(output, coord_out, out_data); \
+
+#define TENSOR_GLOBALLPPOOL(src_name, dst_name, src_type, dst_type, readimage_type, conv_mode, writeimage_type) \
+__kernel void globallppool_##src_name##to##dst_name ( \
+    __read_only image2d_array_t input, \
+    __write_only image2d_array_t output, \
+    int p, \
+    int width, \
+    int height, \
+    float inputScale, \
+    float inputTail, \
+    float outputScale, \
+    float outputTail) \
+{ \
+    GLOBALLPPOOL_PROCESS(src_type, dst_type, readimage_type, conv_mode, writeimage_type); \
+}
+
+TENSOR_GLOBALLPPOOL(F32, F32, float, float4, read_imagef, convert_float, write_imagef)
+TENSOR_GLOBALLPPOOL(F32, U32, float, uint4, read_imagef, convert_uint, write_imageui)
+TENSOR_GLOBALLPPOOL(F32, I32, float, int4, read_imagef, convert_int, write_imagei)
+
+TENSOR_GLOBALLPPOOL(U32, U32, uint, uint4, read_imageui, convert_uint, write_imageui)
+TENSOR_GLOBALLPPOOL(U32, F32, uint, float4, read_imageui, convert_float, write_imagef)
+TENSOR_GLOBALLPPOOL(U32, I32, uint, int4, read_imageui, convert_int, write_imagei)
+
+TENSOR_GLOBALLPPOOL(I32, I32, int, int4, read_imagei, convert_int, write_imagei)
+TENSOR_GLOBALLPPOOL(I32, F32, int, float4, read_imagei, convert_float, write_imagef)
+TENSOR_GLOBALLPPOOL(I32, U32, int, uint4, read_imagei, convert_uint, write_imageui)
+
+__kernel void globallppool_BF16toBF16(
+    __read_only image2d_array_t input,
+    __write_only image2d_array_t output,
+    int p,
+    int width,
+    int height,
+    float inputScale,
+    float inputTail,
+    float outputScale,
+    float outputTail)
+{
+    int gidx = get_global_id(0);
+    int4 coord_out = (int4)(0, 0, gidx, 0);
+    int4 coord_in = coord_out;
+    int h, w;
+    float sum_of_pow = 0;
+    float out_data_f32 = 0;
+    uint4 dst = (uint4)(0);
+    float4 data_f32 = (float4)(0);
+    uint4 data;
+
+    for (h = 0; h < height; h++)
+    {
+        for (w = 0; w < width; w++)
+        {
+            coord_in.xy = (int2)(w, h);
+            data = read_imageui(input, coord_in);
+            data = data << 16;
+            _viv_asm(COPY, data_f32, data, 16);
+            sum_of_pow += pow(fabs(data_f32.x),p);
+        }
+    }
+    out_data_f32 = pow(sum_of_pow, 1.0f / p);
+    _viv_asm(COPY, dst, out_data_f32, 4);
+    dst.x = dst.x >> 16;
+    write_imageui(output, coord_out, dst);
+}
+
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_i32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_i32.cl
index 12b6243..e6f4b3f 100644
--- a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_i32.cl
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_i32.cl
@@ -14,7 +14,6 @@ __kernel void instance_norm_sums_I32(
    int4 coord = (int4)(gidx, 0, gidz, 0);
    int4 data;
    float2 sum_x_x2 = 0;
-    int2 _sum_x_x2 = 0;

    __local float lcl_sum[16];
    __local float lcl_sqr[16];
@@ -25,10 +24,10 @@ __kernel void instance_norm_sums_I32(
        {
            data = read_imagei(input, coord);
            coord.y++;
-            _sum_x_x2.x = _sum_x_x2.x + data.x;
-            _sum_x_x2.y = _sum_x_x2.y + data.x * data.x;
+            float in = convert_float(data.x);
+            sum_x_x2.x = sum_x_x2.x + in;
+            sum_x_x2.y = sum_x_x2.y + in * in;
        }
-        sum_x_x2 = convert_float2(_sum_x_x2);
    }
    lcl_sum[lidx] = sum_x_x2.x;
    lcl_sqr[lidx] = sum_x_x2.y;
@@ -74,7 +73,6 @@ __kernel void instance_norm_sums_I32_2D(
    int2 coord = (int2)(gidx, gidy);
    int4 data;
    float2 sum_x_x2 = 0;
-    int2 _sum_x_x2 = 0;

    __local float lcl_sum[16];
    __local float lcl_sqr[16];
@@ -86,10 +84,10 @@ __kernel void instance_norm_sums_I32_2D(
        {
            data = read_imagei(input, coord);
            coord.y++;
-            _sum_x_x2.x = _sum_x_x2.x + data.x;
-            _sum_x_x2.y = _sum_x_x2.y + data.x * data.x;
+            float in = convert_float(data.x);
+            sum_x_x2.x = sum_x_x2.x + in;
+            sum_x_x2.y = sum_x_x2.y + in * in;
        }
-        sum_x_x2 = convert_float2(_sum_x_x2);
    }
    lcl_sum[lidx] = sum_x_x2.x;
    lcl_sqr[lidx] = sum_x_x2.y;
diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/l1norm.cl b/src/tim/vx/internal/src/libnnext/ops/cl/l1norm.cl
new file mode 100644
index 0000000..d8758c6
--- /dev/null
+++ b/src/tim/vx/internal/src/libnnext/ops/cl/l1norm.cl
@@ -0,0 +1,265 @@
+#define eps 1e-12f
+
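+/* Reduction pattern shared by every kernel in this file: a 16-lane workgroup
+ * strides the normalization axis accumulating |x - zp|, lanes park partial
+ * sums in __local lcl_sum[16], and the 16 partials fold into four float4 adds
+ * plus one dot() against (1,1,1,1). Scalar reference for one line of the
+ * tensor (a sketch of the math only, not the kernel):
+ *
+ *     float l1 = 0.0f;
+ *     for (int i = 0; i < axis_size; i++) l1 += fabs(in[i] - inputZp);
+ *     float r = 1.0f / (l1 + eps);
+ *     for (int i = 0; i < axis_size; i++)
+ *         out[i] = (in[i] - inputZp) * r * outputscale + outputtail;
+ */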
+#define TENSOR_L1NORM_axis0(src_name, dst_name, src_type, dst_type, \ + readimage_type, conv_mode, writeimage_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l1norm_##src_name##to##dst_name##_axis0( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float inputZp, \ + float outputscale, \ + float outputtail, \ + int axis, \ + int axis_size) \ +{ \ + int lidx = get_local_id(0); \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + src_type src; \ + dst_type dst; \ + float4 src_f, dst_f; \ + float sum = 0; \ + float rcp_sum = 0; \ + int4 coord= (int4)(gidx, gidy, gidz, 0); \ + __local float lcl_sum[16]; \ + for (; coord.x < axis_size; coord.x += 16) \ + { \ + src = readimage_type(input, coord); \ + src_f = convert_float4(src) - inputZp; \ + sum += fabs(src_f.x); \ + } \ + lcl_sum[lidx] = sum; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \ + float4 one = (float4)(1, 1, 1, 1); \ + float4 data0; \ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \ + rcp_sum = 1 / (dot(data0, one) + eps); \ + for (coord.x = gidx; coord.x < axis_size; coord.x += 16) \ + { \ + src = readimage_type(input, coord); \ + src_f = convert_float4(src) - inputZp; \ + dst_f = src_f * rcp_sum; \ + dst = conv_mode(dst_f * outputscale + outputtail); \ + writeimage_type(output, coord, dst); \ + } \ +} + +TENSOR_L1NORM_axis0(F32,F32,float4,float4,read_imagef, convert_float4,write_imagef) +TENSOR_L1NORM_axis0(U32,U32,uint4, uint4, read_imageui,convert_uint4, write_imageui) +TENSOR_L1NORM_axis0(I32,I32,int4, int4, read_imagei, convert_int4, write_imagei) +TENSOR_L1NORM_axis0(F32,U32,float4,uint4, read_imagef, convert_uint4, write_imageui) +TENSOR_L1NORM_axis0(F32,I32,float4,int4, read_imagef, convert_int4, write_imagei) +TENSOR_L1NORM_axis0(U32,F32,uint4, float4,read_imageui,convert_float4,write_imagef) +TENSOR_L1NORM_axis0(I32,F32,int4, float4,read_imagei, convert_float4,write_imagef) + +#define TENSOR_L1NORM_axis1(src_name, dst_name, src_type, dst_type, \ + readimage_type, conv_mode, writeimage_type) \ +__kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l1norm_##src_name##to##dst_name##_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float inputZp, \ + float outputscale, \ + float outputtail, \ + int axis, \ + int axis_size) \ +{ \ + int lidy = get_local_id(1); \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + src_type src; \ + dst_type dst; \ + float4 src_f, dst_f; \ + float sum = 0; \ + float rcp_sum = 0; \ + int4 coord= (int4)(gidx, gidy, gidz, 0); \ + __local float lcl_sum[16]; \ + for (; coord.y < axis_size; coord.y += 16) \ + { \ + src = readimage_type(input, coord); \ + src_f = convert_float4(src) - inputZp; \ + sum += fabs(src_f.x); \ + } \ + lcl_sum[lidy] = sum; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \ + float4 one = (float4)(1, 1, 1, 1); \ + float4 data0; \ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \ + rcp_sum = 1 / (dot(data0, one) + eps); \ + for (coord.y = gidy; coord.y < axis_size; coord.y += 16) \ + { \ + src = readimage_type(input, coord); \ + src_f = convert_float4(src) - inputZp; \ + dst_f = src_f * rcp_sum; \ + dst = conv_mode(dst_f * outputscale + outputtail); \ + writeimage_type(output, coord, dst); \ + } \ +} + +TENSOR_L1NORM_axis1(F32,F32,float4,float4,read_imagef, 
convert_float4,write_imagef) +TENSOR_L1NORM_axis1(U32,U32,uint4, uint4, read_imageui,convert_uint4, write_imageui) +TENSOR_L1NORM_axis1(I32,I32,int4, int4, read_imagei, convert_int4, write_imagei) +TENSOR_L1NORM_axis1(F32,U32,float4,uint4, read_imagef, convert_uint4, write_imageui) +TENSOR_L1NORM_axis1(F32,I32,float4,int4, read_imagef, convert_int4, write_imagei) +TENSOR_L1NORM_axis1(U32,F32,uint4, float4,read_imageui,convert_float4,write_imagef) +TENSOR_L1NORM_axis1(I32,F32,int4, float4,read_imagei, convert_float4,write_imagef) + +#define TENSOR_L1NORM_axis2(src_name, dst_name, src_type, dst_type, \ + readimage_type, conv_mode, writeimage_type) \ +__kernel __attribute__((reqd_work_group_size(1, 1, 16))) void l1norm_##src_name##to##dst_name##_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float inputZp, \ + float outputscale, \ + float outputtail, \ + int axis, \ + int axis_size) \ +{ \ + int lidz = get_local_id(2); \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + src_type src; \ + dst_type dst; \ + float4 src_f, dst_f; \ + float sum = 0; \ + float rcp_sum = 0; \ + int4 coord= (int4)(gidx, gidy, gidz, 0); \ + __local float lcl_sum[16]; \ + for (; coord.z < axis_size; coord.z += 16) \ + { \ + src = readimage_type(input, coord); \ + src_f = convert_float4(src) - inputZp; \ + sum += fabs(src_f.x); \ + } \ + lcl_sum[lidz] = sum; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \ + float4 one = (float4)(1, 1, 1, 1); \ + float4 data0; \ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \ + rcp_sum = 1 / (dot(data0, one) + eps); \ + for (coord.z = gidz; coord.z < axis_size; coord.z += 16) \ + { \ + src = readimage_type(input, coord); \ + src_f = convert_float4(src) - inputZp; \ + dst_f = src_f * rcp_sum; \ + dst = conv_mode(dst_f * outputscale + outputtail); \ + writeimage_type(output, coord, dst); \ + } \ +} + +TENSOR_L1NORM_axis2(F32,F32,float4,float4,read_imagef, convert_float4,write_imagef) +TENSOR_L1NORM_axis2(U32,U32,uint4, uint4, read_imageui,convert_uint4, write_imageui) +TENSOR_L1NORM_axis2(I32,I32,int4, int4, read_imagei, convert_int4, write_imagei) +TENSOR_L1NORM_axis2(F32,U32,float4,uint4, read_imagef, convert_uint4, write_imageui) +TENSOR_L1NORM_axis2(F32,I32,float4,int4, read_imagef, convert_int4, write_imagei) +TENSOR_L1NORM_axis2(U32,F32,uint4, float4,read_imageui,convert_float4,write_imagef) +TENSOR_L1NORM_axis2(I32,F32,int4, float4,read_imagei, convert_float4,write_imagef) + +#define TENSOR_L1NORM_2D_axis0(src_name, dst_name, src_type, dst_type,\ + readimage_type, conv_mode, writeimage_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l1norm_##src_name##to##dst_name##_2D_axis0( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + float inputZp, \ + float outputscale, \ + float outputtail, \ + int axis, \ + int axis_size) \ +{ \ + int lidx = get_local_id(0); \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + src_type src; \ + dst_type dst; \ + float4 src_f, dst_f; \ + float sum = 0; \ + float rcp_sum = 0; \ + int2 coord = (int2)(gidx, gidy); \ + __local float lcl_sum[16]; \ + for (; coord.x < axis_size; coord.x += 16) \ + { \ + src = readimage_type(input, coord); \ + src_f = convert_float4(src) - inputZp; \ + sum += fabs(src_f.x); \ + } \ + lcl_sum[lidx] = sum; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \ + float4 one = (float4)(1, 1, 1, 
1); \ + float4 data0; \ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \ + rcp_sum = 1 / (dot(data0, one) + eps); \ + for (coord.x = gidx; coord.x < axis_size; coord.x += 16) \ + { \ + src = readimage_type(input, coord); \ + src_f = convert_float4(src) - inputZp; \ + dst_f = src_f * rcp_sum; \ + dst = conv_mode(dst_f * outputscale + outputtail); \ + writeimage_type(output, coord, dst); \ + } \ +} + +TENSOR_L1NORM_2D_axis0(F32,F32,float4,float4,read_imagef, convert_float4,write_imagef) +TENSOR_L1NORM_2D_axis0(U32,U32,uint4, uint4, read_imageui,convert_uint4, write_imageui) +TENSOR_L1NORM_2D_axis0(I32,I32,int4, int4, read_imagei, convert_int4, write_imagei) +TENSOR_L1NORM_2D_axis0(F32,U32,float4,uint4, read_imagef, convert_uint4, write_imageui) +TENSOR_L1NORM_2D_axis0(F32,I32,float4,int4, read_imagef, convert_int4, write_imagei) +TENSOR_L1NORM_2D_axis0(U32,F32,uint4, float4,read_imageui,convert_float4,write_imagef) +TENSOR_L1NORM_2D_axis0(I32,F32,int4, float4,read_imagei, convert_float4,write_imagef) + + +#define TENSOR_L1NORM_2D_axis1(src_name, dst_name, src_type, dst_type,\ + readimage_type, conv_mode, writeimage_type) \ +__kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l1norm_##src_name##to##dst_name##_2D_axis1( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + float inputZp, \ + float outputscale, \ + float outputtail, \ + int axis, \ + int axis_size) \ +{ \ + int lidy = get_local_id(1); \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + src_type src; \ + dst_type dst; \ + float4 src_f, dst_f; \ + float sum = 0; \ + float rcp_sum = 0; \ + int2 coord = (int2)(gidx, gidy); \ + __local float lcl_sum[16]; \ + for (; coord.y < axis_size; coord.y += 16) \ + { \ + src = readimage_type(input, coord); \ + src_f = convert_float4(src) - inputZp; \ + sum += fabs(src_f.x); \ + } \ + lcl_sum[lidy] = sum; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \ + float4 one = (float4)(1, 1, 1, 1); \ + float4 data0; \ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \ + rcp_sum = 1 / (dot(data0, one) + eps); \ + for (coord.y = gidy; coord.y < axis_size; coord.y += 16) \ + { \ + src = readimage_type(input, coord); \ + src_f = convert_float4(src) - inputZp; \ + dst_f = src_f * rcp_sum; \ + dst = conv_mode(dst_f * outputscale + outputtail); \ + writeimage_type(output, coord, dst); \ + } \ +} + +TENSOR_L1NORM_2D_axis1(F32,F32,float4,float4,read_imagef, convert_float4,write_imagef) +TENSOR_L1NORM_2D_axis1(U32,U32,uint4, uint4, read_imageui,convert_uint4, write_imageui) +TENSOR_L1NORM_2D_axis1(I32,I32,int4, int4, read_imagei, convert_int4, write_imagei) +TENSOR_L1NORM_2D_axis1(F32,U32,float4,uint4, read_imagef, convert_uint4, write_imageui) +TENSOR_L1NORM_2D_axis1(F32,I32,float4,int4, read_imagef, convert_int4, write_imagei) +TENSOR_L1NORM_2D_axis1(U32,F32,uint4, float4,read_imageui,convert_float4,write_imagef) +TENSOR_L1NORM_2D_axis1(I32,F32,int4, float4,read_imagei, convert_float4,write_imagef) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis0.cl index 8a8b113..af5fa52 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis0.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis0.cl @@ -34,6 +34,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_a rsqrt_sum = (sum == 0 ? 
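For reference, the four l1norm variants above all implement the same two-pass scheme: 16 work-items accumulate strided partial sums of |x - inputZp| along the normalized axis, the partials meet in __local memory behind a barrier and are folded with a float4 dot product, and a second pass multiplies every element by 1/(sum + eps) before requantizing. A single-threaded C sketch of that arithmetic (hypothetical helper name; eps is the same epsilon the kernels add before taking the reciprocal):

#include <math.h>
#include <stddef.h>

/* Host-side reference for the l1norm kernels: scale `n` values along one
 * axis so their absolute values sum to 1, then requantize the result. */
static void l1norm_ref(const float *src, float *dst, size_t n,
                       float input_zp, float out_scale, float out_tail,
                       float eps)
{
    float sum = 0.0f;
    for (size_t i = 0; i < n; ++i)
        sum += fabsf(src[i] - input_zp);        /* pass 1: reduce */
    float rcp_sum = 1.0f / (sum + eps);
    for (size_t i = 0; i < n; ++i)              /* pass 2: scale  */
        dst[i] = (src[i] - input_zp) * rcp_sum * out_scale + out_tail;
}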
rsEps : rsqrt(sum)); for (coord.x = gidx; coord.x < axis_size; coord.x += 16) { + coord_scale.x = coord.x; src = read_imagef(input, coord); scale_value = read_imagef(scale, coord_scale); result = src * rsqrt_sum * scale_value; @@ -76,6 +77,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_a rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum)); for (coord.x = gidx; coord.x < axis_size; coord.x += 16) { + coord_scale.x = coord.x; src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail; scale_value = read_imagef(scale, coord_scale); result = src * rsqrt_sum * scale_value; @@ -119,6 +121,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_a rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum)); for (coord.x = gidx; coord.x < axis_size; coord.x += 16) { + coord_scale.x = coord.x; src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail; scale_value = read_imagef(scale, coord_scale); result = src * rsqrt_sum * scale_value; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis1.cl index 9bfc07e..15dc380 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis1.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/l2normalizescale_axis1.cl @@ -34,6 +34,7 @@ __kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_a rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum)); for (coord.y = gidy; coord.y < axis_size; coord.y += 16) { + coord_scale.x = coord.y; src = read_imagef(input, coord); scale_value = read_imagef(scale, coord_scale); result = src * rsqrt_sum * scale_value; @@ -76,6 +77,7 @@ __kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_a rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum)); for (coord.y = gidy; coord.y < axis_size; coord.y += 16) { + coord_scale.x = coord.y; src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail; scale_value = read_imagef(scale, coord_scale); result = src * rsqrt_sum * scale_value; @@ -119,6 +121,7 @@ __kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_a rsqrt_sum = (sum == 0 ? 
rsEps : rsqrt(sum)); for (coord.y = gidy; coord.y < axis_size; coord.y += 16) { + coord_scale.x = coord.y; src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail; scale_value = read_imagef(scale, coord_scale); result = src * rsqrt_sum * scale_value; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl index 9efcd9e..8f601c6 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl @@ -221,3 +221,190 @@ __kernel void gemm_transb_F32I8toF32_3D( coord_a.z = get_global_id(2); write_imagef(output, coord_a, sum); } + +#define GEMM_2D(name, dst_type, read_image_type, convert_type, write_image_type) \ +__kernel void gemm_##name##_2D( \ + __read_only image2d_t inputA, \ + __read_only image2d_t inputB, \ + __write_only image2d_t output, \ + int M, \ + int K, \ + int N, \ + int ac2zero, \ + int bc2zero, \ + float scale_a, \ + float zp_a, \ + float scale_b, \ + float zp_b, \ + float scale_out, \ + float zp_out \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + float4 sum = (float4)(0); \ + dst_type dst; \ +\ + for(; coord.z < K;) \ + { \ + float4 tempA0; \ + float4 tempB0; \ + tempA0 = convert_float4(read_image_type(inputA, coord.zy)); \ + tempB0 = convert_float4(read_image_type(inputB, coord.xz)); \ + tempA0.x = (tempA0.x - zp_a) * scale_a; \ + tempB0.x = (tempB0.x - zp_b) * scale_b; \ + coord.z++; \ + sum = sum + tempA0 * tempB0; \ + } \ + sum.x = sum.x * scale_out + zp_out; \ + dst = convert_type(sum); \ + \ + write_image_type(output, coord.xy, dst); \ +} +GEMM_2D(I8I8toI8,int4,read_imagei,convert_int4,write_imagei); +GEMM_2D(U8U8toU8,uint4,read_imageui,convert_uint4,write_imageui); +GEMM_2D(U8U8toF32,float4,read_imageui,convert_float4,write_imagef); + + +#define GEMM_3D(name, dst_type, read_image_type, convert_type, write_image_type) \ +__kernel void gemm_##name##_3D( \ + __read_only image2d_array_t inputA, \ + __read_only image2d_array_t inputB, \ + __write_only image2d_array_t output, \ + int M, \ + int K, \ + int N, \ + int ac2zero, \ + int bc2zero, \ + float scale_a, \ + float zp_a, \ + float scale_b, \ + float zp_b, \ + float scale_out, \ + float zp_out \ + ) \ +{ \ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
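The one-line additions to the l2normalizescale hunks above all address the same defect: each work-item walks the axis in strides of 16, but the scale tensor was read at the work-item's starting coordinate on every iteration. Assigning coord_scale from the loop coordinate keeps the per-element scale in lockstep with the element being normalized. A scalar C model of the intended arithmetic (hypothetical names; rs_eps stands in for the kernels' precomputed rsEps):

#include <math.h>
#include <stddef.h>

/* Reference for l2normalizescale: y[i] = x[i] * scale[i] / ||x||_2,
 * falling back to a precomputed rsqrt(eps) when the axis is all zero. */
static void l2norm_scale_ref(const float *x, const float *scale, float *y,
                             size_t n, float rs_eps)
{
    float sum = 0.0f;
    for (size_t i = 0; i < n; ++i)
        sum += x[i] * x[i];
    float rs = (sum == 0.0f) ? rs_eps : 1.0f / sqrtf(sum);
    for (size_t i = 0; i < n; ++i)
        y[i] = x[i] * rs * scale[i];    /* scale indexed per element */
}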
0 : get_global_id(2)), 0); \ + float4 sum = (float4)(0); \ + dst_type dst; \ + \ + for(; coord_a.x < K;) \ + { \ + float4 tempA0; \ + float4 tempB0; \ + \ + tempA0 = convert_float4(read_image_type(inputA, coord_a)); \ + tempB0 = convert_float4(read_image_type(inputB, coord_b)); \ + tempA0.x = (tempA0.x - zp_a) * scale_a; \ + tempB0.x = (tempB0.x - zp_b) * scale_b; \ + \ + coord_a.x++; \ + coord_b.y++; \ + \ + sum = sum + tempA0 * tempB0; \ + } \ + sum.x = sum.x * scale_out + zp_out; \ + dst = convert_type(sum); \ + \ + coord_b.y = get_global_id(1); \ + coord_b.z = get_global_id(2); \ + write_image_type(output, coord_b, dst); \ +} +GEMM_3D(I8I8toI8,int4,read_imagei,convert_int4,write_imagei); +GEMM_3D(U8U8toU8,uint4,read_imageui,convert_uint4,write_imageui); +GEMM_3D(U8U8toF32,float4,read_imageui,convert_float4,write_imagef); + +#define GEMM_TRANSB_2D(name, dst_type, read_image_type, convert_type, write_image_type) \ +__kernel void gemm_transb_##name##_2D( \ + __read_only image2d_t inputA, \ + __read_only image2d_t inputB, \ + __write_only image2d_t output, \ + int M, \ + int K, \ + int N, \ + int ac2zero, \ + int bc2zero, \ + float scale_a, \ + float zp_a, \ + float scale_b, \ + float zp_b, \ + float scale_out, \ + float zp_out \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + float4 sum = (float4)(0); \ + dst_type dst; \ + \ + for(; coord.z < K;) \ + { \ + float4 tempA0; \ + float4 tempB0; \ + \ + tempA0 = convert_float4(read_image_type(inputA, coord.zy)); \ + tempB0 = convert_float4(read_image_type(inputB, coord.zx)); \ + tempA0.x = (tempA0.x - zp_a) * scale_a; \ + tempB0.x = (tempB0.x - zp_b) * scale_b; \ + coord.z++; \ + \ + sum = sum + tempA0 * tempB0; \ + } \ + sum.x = sum.x * scale_out + zp_out; \ + dst = convert_type(sum); \ + \ + write_image_type(output, coord.xy, dst); \ +} +GEMM_TRANSB_2D(I8I8toI8,int4,read_imagei,convert_int4,write_imagei); +GEMM_TRANSB_2D(U8U8toU8,uint4,read_imageui,convert_uint4,write_imageui); +GEMM_TRANSB_2D(U8U8toF32,float4,read_imageui,convert_float4,write_imagef); + + +#define GEMM_TRANSB_3D(name, dst_type, read_image_type, convert_type, write_image_type) \ +__kernel void gemm_transb_##name##_3D( \ + __read_only image2d_array_t inputA, \ + __read_only image2d_array_t inputB, \ + __write_only image2d_array_t output, \ + int M, \ + int K, \ + int N, \ + int ac2zero, \ + int bc2zero, \ + float scale_a, \ + float zp_a, \ + float scale_b, \ + float zp_b, \ + float scale_out, \ + float zp_out \ + ) \ +{ \ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0); \ + int4 coord_b = (int4)(0, get_global_id(0), (bc2zero ? 
0 : get_global_id(2)), 0); \ + \ + float4 sum = (float4)(0); \ + dst_type dst; \ + \ + for(; coord_a.x < K;) \ + { \ + float4 tempA0; \ + float4 tempB0; \ + \ + tempA0 = convert_float4(read_image_type(inputA, coord_a)); \ + tempB0 = convert_float4(read_image_type(inputB, coord_b)); \ + tempA0.x = (tempA0.x - zp_a) * scale_a; \ + tempB0.x = (tempB0.x - zp_b) * scale_b; \ + coord_a.x++; \ + coord_b.x++; \ + \ + sum = sum + tempA0 * tempB0; \ + } \ + sum.x = sum.x * scale_out + zp_out; \ + dst = convert_type(sum); \ + \ + coord_a.x = get_global_id(0); \ + coord_a.z = get_global_id(2); \ + write_image_type(output, coord_a, dst); \ +} +GEMM_TRANSB_3D(I8I8toI8,int4,read_imagei,convert_int4,write_imagei); +GEMM_TRANSB_3D(U8U8toU8,uint4,read_imageui,convert_uint4,write_imageui); +GEMM_TRANSB_3D(U8U8toF32,float4,read_imageui,convert_float4,write_imagef); + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_transA.cl b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_transA.cl index b7bc8ee..accfe8f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_transA.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_transA.cl @@ -74,3 +74,99 @@ __kernel void gemm_transa_F32F32toF32_3D( coord_b.z = get_global_id(2); write_imagef(output, coord_b, sum); } + +#define GEMM_TRANSA_2D(name, dst_type, read_image_type, convert_type, write_image_type) \ +__kernel void gemm_transa_##name##_2D( \ + __read_only image2d_t inputA, \ + __read_only image2d_t inputB, \ + __write_only image2d_t output, \ + int M, \ + int K, \ + int N, \ + int ac2zero, \ + int bc2zero, \ + float scale_a, \ + float zp_a, \ + float scale_b, \ + float zp_b, \ + float scale_out, \ + float zp_out \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + float4 sum = (float4)(0); \ + dst_type dst; \ + \ + for(; coord.z < K;) \ + { \ + float4 tempA0; \ + float4 tempB0; \ + \ + tempA0 = convert_float4(read_image_type(inputA, coord.yz)); \ + tempB0 = convert_float4(read_image_type(inputB, coord.xz)); \ + tempA0.x = (tempA0.x - zp_a) * scale_a; \ + tempB0.x = (tempB0.x - zp_b) * scale_b; \ + coord.z++; \ + \ + sum = sum + tempA0 * tempB0; \ + } \ + sum.x = sum.x * scale_out + zp_out; \ + dst = convert_type(sum); \ + \ + write_image_type(output, coord.xy, dst); \ +} +GEMM_TRANSA_2D(I8I8toI8,int4,read_imagei,convert_int4,write_imagei); +GEMM_TRANSA_2D(U8U8toU8,uint4,read_imageui,convert_uint4,write_imageui); +GEMM_TRANSA_2D(U8U8toF32,float4,read_imageui,convert_float4,write_imagef); + +#define GEMM_TRANSA_3D(name, dst_type, read_image_type, convert_type, write_image_type) \ +__kernel void gemm_transa_##name##_3D( \ + __read_only image2d_array_t inputA, \ + __read_only image2d_array_t inputB, \ + __write_only image2d_array_t output, \ + int M, \ + int K, \ + int N, \ + int ac2zero, \ + int bc2zero, \ + float scale_a, \ + float zp_a, \ + float scale_b, \ + float zp_b, \ + float scale_out, \ + float zp_out \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int4 coord_a = (int4)(gidy, 0, (ac2zero ? 0 : get_global_id(2)), 0); \ + int4 coord_b = (int4)(gidx, 0, (bc2zero ? 
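The GEMM_2D, GEMM_3D and GEMM_TRANSB_* macros above share one quantization recipe: each operand is dequantized as (q - zp) * scale, products accumulate in float, and the accumulator is requantized with scale_out and zp_out on the way out; the transb variants merely index B along the other dimension. A scalar C sketch of that recipe for the U8U8toU8 case (hypothetical helper; the clamp is added here for illustration, the kernels rely on convert_* semantics and truncate on conversion):

#include <stddef.h>

/* Scalar model of the quantized GEMM kernels: C = requant(deq(A) * deq(B)).
 * A is MxK, B is KxN, both row-major. */
static void qgemm_ref(const unsigned char *A, const unsigned char *B,
                      unsigned char *C, size_t M, size_t K, size_t N,
                      float sa, float za, float sb, float zb,
                      float so, float zo)
{
    for (size_t m = 0; m < M; ++m)
        for (size_t n = 0; n < N; ++n) {
            float acc = 0.0f;
            for (size_t k = 0; k < K; ++k)
                acc += ((float)A[m * K + k] - za) * sa *
                       (((float)B[k * N + n] - zb) * sb);
            float v = acc * so + zo;                 /* requantize */
            C[m * N + n] = (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
        }
}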
0 : get_global_id(2)), 0); \ + \ + float4 sum = (float4)(0); \ + dst_type dst; \ + \ + for(; coord_a.y < K;) \ + { \ + float4 tempA0; \ + float4 tempB0; \ + \ + tempA0 = convert_float4(read_image_type(inputA, coord_a)); \ + tempB0 = convert_float4(read_image_type(inputB, coord_b)); \ + tempA0.x = (tempA0.x - zp_a) * scale_a; \ + tempB0.x = (tempB0.x - zp_b) * scale_b; \ + coord_a.y++; \ + coord_b.y++; \ + \ + sum = sum + tempA0 * tempB0; \ + } \ + sum.x = sum.x * scale_out + zp_out; \ + dst = convert_type(sum); \ + \ + coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ + write_image_type(output, coord_b, dst); \ +} +GEMM_TRANSA_3D(I8I8toI8,int4,read_imagei,convert_int4,write_imagei); +GEMM_TRANSA_3D(U8U8toU8,uint4,read_imageui,convert_uint4,write_imageui); +GEMM_TRANSA_3D(U8U8toF32,float4,read_imageui,convert_float4,write_imagef); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/maxunpool.cl b/src/tim/vx/internal/src/libnnext/ops/cl/maxunpool.cl new file mode 100644 index 0000000..e1a2ebd --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/maxunpool.cl @@ -0,0 +1,132 @@ + +#define MAXUNPOOL(name, read_type, read_image_type, write_type, convert_type, writeimage_type) \ +__kernel void maxunpool_##name( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output, \ + int width_nopad, \ + int height_nopad, \ + int width_in, \ + int height_in, \ + int batch, \ + int pad_left, \ + int pad_top, \ + float inputScale, \ + float inputTail, \ + float outputScale, \ + float outputTail \ + ) \ +{ \ + uint gidx = get_global_id(0); \ + uint gidy = get_global_id(1); \ + uint gidz = get_global_id(2); \ + int gidx_in, gidy_in, gidz_in; \ + int4 coord_out = (int4)(gidx, gidy, gidz, 0); \ + write_type dst = (write_type)(0); \ + float4 dst_temp = (float4)(0); \ + int i,j,k; \ + if (gidx < pad_left || gidx >= width_nopad + pad_left || \ + gidy < pad_top || gidy >= height_nopad + pad_top) \ + { \ + dst_temp.x = outputTail; \ + dst = convert_type(dst_temp); \ + writeimage_type(output, coord_out, dst); \ + return; \ + } \ + gidx_in = gidx - pad_left; \ + gidy_in = gidy - pad_top; \ + gidz_in = gidz; \ + int index = gidz_in * height_nopad * width_nopad + gidy_in * width_nopad + gidx_in; \ + for (k = 0;k < batch;k++) \ + { \ + for (j = 0;j < height_in; j++) \ + { \ + for (i = 0;i < width_in; i++) \ + { \ + int index_useful = read_imagei(input1, (int4)(i,j,k,0)).x; \ + if (index_useful == index) \ + { \ + read_type src = read_image_type(input0, (int4)(i,j,k,0)); \ + dst_temp = convert_float4(src) * inputScale + inputTail; \ + dst = convert_type(dst_temp * outputScale + outputTail); \ + writeimage_type(output, coord_out, dst); \ + return; \ + } \ + } \ + } \ + } \ + dst_temp.x = outputTail; \ + dst = convert_type(dst_temp); \ + writeimage_type(output, coord_out, dst); \ +} +MAXUNPOOL(F32toF32,float4,read_imagef,float4,convert_float4,write_imagef) +MAXUNPOOL(F32toU32,float4,read_imagef,uint4, convert_uint4, write_imageui) +MAXUNPOOL(F32toI32,float4,read_imagef,int4, convert_int4, write_imagei) + +MAXUNPOOL(U32toU32,uint4,read_imageui,uint4, convert_uint4, write_imageui) +MAXUNPOOL(U32toF32,uint4,read_imageui,float4,convert_float4,write_imagef) +MAXUNPOOL(U32toI32,uint4,read_imageui,int4, convert_int4, write_imagei) + +MAXUNPOOL(I32toU32,int4,read_imagei,uint4, convert_uint4, write_imageui) +MAXUNPOOL(I32toF32,int4,read_imagei,float4,convert_float4,write_imagef) +MAXUNPOOL(I32toI32,int4,read_imagei,int4, convert_int4, write_imagei) + +__kernel 
void maxunpool_BF16toBF16( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int width_nopad, + int height_nopad, + int width_in, + int height_in, + int batch, + int pad_left, + int pad_top, + float inputScale, + float inputTail, + float outputScale, + float outputTail + ) +{ + uint gidx = get_global_id(0); + uint gidy = get_global_id(1); + uint gidz = get_global_id(2); + int gidx_in, gidy_in, gidz_in; + int4 coord_out = (int4)(gidx, gidy, gidz, 0); + uint4 dst = (uint4)(0); + float4 dst_temp = (float4)(0); + int i,j,k; + if (gidx < pad_left || gidx >= width_nopad + pad_left || + gidy < pad_top || gidy >= height_nopad + pad_top) + { + dst_temp.x = 0; + _viv_asm(COPY, dst, dst_temp, 16); + dst.x = dst.x >> 16; + write_imageui(output, coord_out, dst); + return; + } + gidx_in = gidx - pad_left; + gidy_in = gidy - pad_top; + gidz_in = gidz; + int index = gidz_in * height_nopad * width_nopad + gidy_in * width_nopad + gidx_in; + for (k = 0;k < batch;k++) + { + for (j = 0;j < height_in; j++) + { + for (i = 0;i < width_in; i++) + { + int index_useful = read_imagei(input1, (int4)(i,j,k,0)).x; + if (index_useful == index) + { + uint4 src = read_imageui(input0, (int4)(i,j,k,0)); + write_imageui(output, coord_out, src); + return; + } + } + } + } + dst_temp.x = 0; + _viv_asm(COPY, dst, dst_temp, 16); + dst.x = dst.x >> 16; + write_imageui(output, coord_out, dst); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/pow.cl b/src/tim/vx/internal/src/libnnext/ops/cl/pow.cl index a2ee944..083f172 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/pow.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/pow.cl @@ -2,7 +2,11 @@ __kernel void pow_FP32FP32toFP32 ( __read_only image2d_array_t input0, __read_only image2d_array_t input1, - __write_only image2d_array_t output + __write_only image2d_array_t output, + float inputScale, + float inputTail, + float outputScale, + float outputTail ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); @@ -15,7 +19,8 @@ __kernel void pow_FP32FP32toFP32 float4 s0 = sign(src0); int4 t0 = convert_int4(src1) & 1; s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - dst.x = (src0.x == 0 && src1.x == 0) ? 1.0f : (src0.x != 0 ? (s0.x * exp2(src1.x * log2(fabs(src0.x)))) : 0.0f); + dst.x = (src0.x == 0 && src1.x == 0) ? 1.0f : (src0.x != 0 ? + (s0.x * exp2(src1.x * log2(fabs(src0.x)))) : 0.0f); write_imagef(output, coord, dst); } @@ -24,7 +29,11 @@ __kernel void pow_FP32FP32toFP32_2D ( __read_only image2d_t input0, __read_only image2d_t input1, - __write_only image2d_t output + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputTail ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); @@ -38,7 +47,69 @@ __kernel void pow_FP32FP32toFP32_2D int4 t0 = convert_int4(src1) & 1; s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - dst.x = (src0.x == 0 && src1.x == 0) ? 1.0f : (src0.x != 0 ? (s0.x * exp2(src1.x * log2(fabs(src0.x)))) : 0.0f); + dst.x = (src0.x == 0 && src1.x == 0) ? 1.0f : (src0.x != 0 ? 
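The maxunpool kernels above invert max-pooling: input1 carries the flat argmax index recorded by the pooling pass, and each output element (once padding is stripped) is produced by scanning the pooled tensor for an index equal to its own flat position, writing the zero-point otherwise. A scalar C model of that lookup (hypothetical names; the kernels perform the same search per work-item):

#include <stddef.h>

/* Reference for maxunpool: out[p] = pooled[q] when indices[q] == p for
 * some q, else 0. p is the flat index z*H*W + y*W + x of the unpooled
 * (pre-padding) tensor. */
static void maxunpool_ref(const float *pooled, const int *indices,
                          float *out, size_t out_elems, size_t pooled_elems)
{
    for (size_t p = 0; p < out_elems; ++p) {
        out[p] = 0.0f;
        for (size_t q = 0; q < pooled_elems; ++q)
            if ((size_t)indices[q] == p) { out[p] = pooled[q]; break; }
    }
}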
+ (s0.x * exp2(src1.x * log2(fabs(src0.x)))) : 0.0f); write_imagef(output, coord, dst); } + +__kernel void pow_U32F32toU32( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float inputScale, + float inputTail, + float outputScale, + float outputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + uint4 src0, dst; + float4 src0_f, src1, dst_f; + READ_IMAGEUI_2DARRAY(src0, input0, coord); + READ_IMAGEF_2DARRAY(src1, input1, coord); + + src0_f = convert_float4(src0) * inputScale + inputTail; + float4 s0 = sign(src0_f); + int4 t0 = convert_int4(src1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + dst_f.x = (src0.x == 0 && src1.x == 0) ? 1.0f : (src0.x != 0 ? + (s0.x * exp2(src1.x * log2(fabs(src0.x)))) : 0.0f); + dst.x = convert_uint(dst_f.x * outputScale + outputTail); + + write_imageui(output, coord, dst); +} + +__kernel void pow_U32F32toU32_2D + ( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputTail + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + uint4 src0 = read_imageui(input0, coord); + float4 src1 = read_imagef(input1, coord); + + float4 src0_f = (float4)(0); + float4 dst_f = (float4)(0); + uint4 dst = (uint4)(0); + + src0_f.x = convert_float(src0.x) * inputScale + inputTail; + float4 s0 = sign(src0_f); + int4 t0 = convert_int4(src1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + + dst_f.x = (src0.x == 0 && src1.x == 0) ? 1.0f : (src0.x != 0 ? + (s0.x * exp2(src1.x * log2(fabs(src0.x)))) : 0.0f); + dst.x = convert_uint(dst_f.x * outputScale + outputTail); + + write_imageui(output, coord, dst); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/reversesequence.cl b/src/tim/vx/internal/src/libnnext/ops/cl/reversesequence.cl new file mode 100644 index 0000000..94a3aaf --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/reversesequence.cl @@ -0,0 +1,154 @@ +#define REVERSESEQUENCE_axis2(name,src_type,readimage_type,\ + convert_type,dst_type,writeimage_type) \ +__kernel void reversesequence_##name( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_array_t output, \ + float inoutScale, \ + float inoutTail \ + ) \ +{ \ + uint gidx = get_global_id(0); \ + uint gidy = get_global_id(1); \ + uint gidz = get_global_id(2); \ +\ + int4 coord_in = (int4)(gidx, gidy, gidz, 0); \ + int4 coord_out = coord_in; \ + src_type src = readimage_type(input0, coord_in); \ + int src_index = read_imagei(input1, (int2)(gidz, 0)).x; \ + float4 src_temp = convert_float4(src); \ + dst_type dst = convert_type(src_temp * inoutScale + inoutTail); \ + if (gidy >= src_index) \ + { \ + writeimage_type(output, coord_out, dst); \ + } \ + else \ + { \ + coord_out.y = src_index - 1 - coord_out.y; \ + writeimage_type(output, coord_out, dst); \ + } \ +} +REVERSESEQUENCE_axis2(F32toF32_axis2,float4,read_imagef,\ + convert_float4,float4,write_imagef) +REVERSESEQUENCE_axis2(F32toU32_axis2,float4,read_imagef,\ + convert_uint4, uint4, write_imageui) +REVERSESEQUENCE_axis2(F32toI32_axis2,float4,read_imagef,\ + convert_int4, int4, write_imagei) +REVERSESEQUENCE_axis2(I32toF32_axis2,int4, read_imagei,\ + convert_float4,float4,write_imagef) +REVERSESEQUENCE_axis2(I32toU32_axis2,int4, read_imagei,\ + convert_uint4, uint4, write_imageui) 
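The pow kernels above sidestep pow() for negative bases by handling the sign explicitly: s0 is -1 only when the base is negative and the truncated exponent is odd, the magnitude comes from exp2(y * log2(|x|)), and 0^0 == 1 and 0^y == 0 are special-cased. A scalar C restatement of the same math (hypothetical name):

#include <math.h>

/* Reference for the pow kernels: x^y via sign-aware exp2/log2, with
 * 0^0 defined as 1 and 0^y (y != 0) defined as 0, matching the kernels. */
static float pow_ref(float x, float y)
{
    if (x == 0.0f)
        return (y == 0.0f) ? 1.0f : 0.0f;
    float s = 1.0f;
    if (x < 0.0f && ((int)y & 1))   /* odd integer exponent keeps the sign */
        s = -1.0f;
    return s * exp2f(y * log2f(fabsf(x)));
}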
+REVERSESEQUENCE_axis2(I32toI32_axis2,int4, read_imagei,\ + convert_int4, int4, write_imagei) +REVERSESEQUENCE_axis2(U32toF32_axis2,uint4, read_imageui,\ + convert_float4,float4,write_imagef) +REVERSESEQUENCE_axis2(U32toU32_axis2,uint4, read_imageui,\ + convert_uint4, uint4, write_imageui) +REVERSESEQUENCE_axis2(U32toI32_axis2,uint4, read_imageui,\ + convert_int4, int4, write_imagei) + +__kernel void reversesequence_BF16toBF16_axis2( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float inoutScale, + float inoutTail + ) +{ + uint gidx = get_global_id(0); + uint gidy = get_global_id(1); + uint gidz = get_global_id(2); + + int4 coord_in = (int4)(gidx, gidy, gidz, 0); + int4 coord_out = coord_in; + uint4 src = read_imageui(input0, coord_in); + int src_index = read_imagei(input1, (int2)(gidz, 0)).x; + uint4 dst = src; + if (gidy >= src_index) + { + write_imageui(output, coord_out, dst); + } + else + { + coord_out.y = src_index - 1 - coord_out.y; + write_imageui(output, coord_out, dst); + } +} + + +#define REVERSESEQUENCE_axis1(name,src_type,readimage_type,\ + convert_type,dst_type,writeimage_type) \ +__kernel void reversesequence_##name( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_array_t output, \ + float inoutScale, \ + float inoutTail \ + ) \ +{ \ + uint gidx = get_global_id(0); \ + uint gidy = get_global_id(1); \ + uint gidz = get_global_id(2); \ +\ + int4 coord_in = (int4)(gidx, gidy, gidz, 0); \ + int4 coord_out = coord_in; \ + src_type src = readimage_type(input0, coord_in); \ + int src_index = read_imagei(input1, (int2)(gidy, 0)).x; \ + float4 src_temp = convert_float4(src); \ + dst_type dst = convert_type(src_temp * inoutScale + inoutTail ); \ + if (gidz >= src_index) \ + { \ + writeimage_type(output, coord_out, dst); \ + } \ + else \ + { \ + coord_out.z = src_index - 1 - coord_out.z; \ + writeimage_type(output, coord_out, dst); \ + } \ +} +REVERSESEQUENCE_axis1(F32toF32_axis1,float4,read_imagef,\ + convert_float4,float4,write_imagef) +REVERSESEQUENCE_axis1(F32toU32_axis1,float4,read_imagef,\ + convert_uint4, uint4, write_imageui) +REVERSESEQUENCE_axis1(F32toI32_axis1,float4,read_imagef,\ + convert_int4, int4, write_imagei) +REVERSESEQUENCE_axis1(I32toF32_axis1,int4, read_imagei,\ + convert_float4,float4,write_imagef) +REVERSESEQUENCE_axis1(I32toU32_axis1,int4, read_imagei,\ + convert_uint4, uint4, write_imageui) +REVERSESEQUENCE_axis1(I32toI32_axis1,int4, read_imagei,\ + convert_int4, int4, write_imagei) +REVERSESEQUENCE_axis1(U32toF32_axis1,uint4, read_imageui,\ + convert_float4,float4,write_imagef) +REVERSESEQUENCE_axis1(U32toU32_axis1,uint4, read_imageui,\ + convert_uint4, uint4, write_imageui) +REVERSESEQUENCE_axis1(U32toI32_axis1,uint4, read_imageui,\ + convert_int4, int4, write_imagei) + +__kernel void reversesequence_BF16toBF16_axis1( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float inoutScale, + float inoutTail + ) +{ + uint gidx = get_global_id(0); + uint gidy = get_global_id(1); + uint gidz = get_global_id(2); + + int4 coord_in = (int4)(gidx, gidy, gidz, 0); + int4 coord_out = coord_in; + uint4 src = read_imageui(input0, coord_in); + int src_index = read_imagei(input1, (int2)(gidy, 0)).x; + uint4 dst = src; + if (gidz >= src_index) + { + write_imageui(output, coord_out, dst); + } + else + { + coord_out.z = src_index - 1 - coord_out.z; + write_imageui(output, coord_out, dst); + } +} diff 
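reversesequence.cl above implements ReverseSequence semantics: input1 holds one sequence length per batch entry, only the first seq_len positions along the time axis are mirrored (position t maps to seq_len - 1 - t), and everything past the length is copied through; the _axis1/_axis2 families differ only in which coordinate plays time. A minimal C sketch (hypothetical names):

#include <stddef.h>

/* Reference for reversesequence: reverse the first seq_len[b] entries of
 * each batch row along the time axis, pass the tail through unchanged. */
static void reverse_sequence_ref(const float *src, float *dst,
                                 size_t batch, size_t time,
                                 const int *seq_len)
{
    for (size_t b = 0; b < batch; ++b)
        for (size_t t = 0; t < time; ++t) {
            size_t d = (t < (size_t)seq_len[b])
                     ? (size_t)seq_len[b] - 1 - t : t;
            dst[b * time + d] = src[b * time + t];
        }
}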
--git a/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl b/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl index 91b10d9..58d7273 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl @@ -1,3 +1,4 @@ +#define VSI_NN_ROI_ALIGN_ANDROID 0 inline float roi_align_1x1 ( @@ -8,7 +9,8 @@ inline float roi_align_1x1 int2 grid_size, float2 rcp_of_grid_size, int pz, - int4 max_spatial_dims + int4 max_spatial_dims, + int platform_type ) { float sum = 0; @@ -23,10 +25,21 @@ inline float roi_align_1x1 int2 xy_low = convert_int2(pos); int2 xy_high = xy_low + 1; - if (xy_low.x > max_spatial_dims.x || max_spatial_dims.x < -1 || - xy_low.y > max_spatial_dims.y || max_spatial_dims.y < -1 ) + if (VSI_NN_ROI_ALIGN_ANDROID == platform_type) { - continue; + if (xy_low.x > max_spatial_dims.x || xy_low.x < -1 || + xy_low.y > max_spatial_dims.y || xy_low.y < -1 ) + { + continue; + } + } + else + { + if (pos.x > max_spatial_dims.x || pos.x < -1 || + pos.y > max_spatial_dims.y || pos.y < -1 ) + { + continue; + } } float2 lxy = pos - floor(pos); @@ -76,7 +89,8 @@ __kernel void roi_align_F32_F32toF32 float sampling_x_ratio, float sampling_y_ratio, int depth, - int dtype + int dtype, + int platform_type ) { int px = get_global_id(0); @@ -122,7 +136,8 @@ __kernel void roi_align_F32_F32toF32 grid_size_xy, rcp_of_grid_size, kz, - max_spatial_dims); + max_spatial_dims, + platform_type); if (dtype == TYPE_FLOAT16) { @@ -138,10 +153,9 @@ __kernel void roi_align_F32_F32toF32 } else { - Tensor out_t = create_tensor_from_image2d_array(output, 4); - float *output_ptr = (float *)get_tensor_ptr_from_coord(out_t, (int4)(px, py, kz1, 0)); - - output_ptr[0] = interp.x; + float4 dst = (float4)(interp.x,0,0,0); + int4 coord_dst = (int4)(px, py, kz1, 0); + write_imagef(output,coord_dst,dst); } } } @@ -157,7 +171,8 @@ inline float roi_align_1x1_U8toF32 int2 grid_size, float2 rcp_of_grid_size, int pz, - int4 max_spatial_dims + int4 max_spatial_dims, + int platform_type ) { float sum = 0; @@ -168,41 +183,52 @@ inline float roi_align_1x1_U8toF32 { float2 ixy = (float2)(ix + 0.5f, iy + 0.5f); float2 pos = region_start + ixy * bin_size * rcp_of_grid_size; - + int2 xy_low = convert_int2(pos); int2 xy_high = xy_low + 1; - + float2 lxy = pos - floor(pos); float2 zero = 0; - - if (xy_low.x > max_spatial_dims.x || max_spatial_dims.x < -1 || - xy_low.y > max_spatial_dims.y || max_spatial_dims.y < -1 ) + + if (VSI_NN_ROI_ALIGN_ANDROID == platform_type) { - continue; + if (xy_low.x > max_spatial_dims.x || xy_low.x < -1 || + xy_low.y > max_spatial_dims.y || xy_low.y < -1 ) + { + continue; + } } - + else + { + if (pos.x > max_spatial_dims.x || pos.x < -1 || + pos.y > max_spatial_dims.y || pos.y < -1 ) + { + continue; + } + } + lxy = xy_low >= max_spatial_dims.zw ? 
0.0 : lxy; - + float hy = 1.0f - lxy.y; float hx = 1.0f - lxy.x; - + float w1 = hy * hx; float w2 = lxy.x - lxy.x * lxy.y; float w3 = lxy.y - lxy.x * lxy.y; float w4 = lxy.y * lxy.x; - + uint4 data; data.x = read_imageui(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x; data.y = read_imageui(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x; data.z = read_imageui(input, (int4)(xy_low.x, xy_high.y, pz, 0)).x; data.w = read_imageui(input, (int4)(xy_high.x, xy_high.y, pz, 0)).x; - + float4 value = convert_float4(data) * input_scale + input_tail; - + sum = sum + w1 * value.x + w2 * value.y + w3 * value.z + w4 * value.w; } } - + return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y); } @@ -226,7 +252,8 @@ __kernel void roi_align_U8_U16toU8 float sampling_x_ratio, float sampling_y_ratio, int depth, - int dtype + int dtype, + int platform_type ) { int px = get_global_id(0); @@ -274,7 +301,8 @@ __kernel void roi_align_U8_U16toU8 grid_size_xy, rcp_of_grid_size, kz, - max_spatial_dims); + max_spatial_dims, + platform_type); uchar dst; interp.x = interp.x * output_scale + output_zp; @@ -283,7 +311,7 @@ __kernel void roi_align_U8_U16toU8 Tensor out_t = create_tensor_from_image2d_array(output, 1); uchar *output_ptr = (uchar *)get_tensor_ptr_from_coord(out_t, (int4)(px, py, kz1, 0)); - + output_ptr[0] = dst; } } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/swish.cl b/src/tim/vx/internal/src/libnnext/ops/cl/swish.cl index 0a6035c..95254d2 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/swish.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/swish.cl @@ -47,7 +47,7 @@ __kernel void swish_F32toF32_2D( src = convert_float4(src0) * inputScale - inputTail; \ tmp.x = sigmoid_(src.x * beta, logE); \ data.x = src.x * tmp.x; \ - uint4 dst = convert_uint4(data * outputScale + outputZP); \ + uint4 dst = convert_uint4_rte(data * outputScale + outputZP); \ write_imageui(output, coord, dst); __kernel void swish_U8toU8( @@ -115,3 +115,39 @@ __kernel void swish_I32toI32_2D( int2 coord = (int2)(get_global_id(0), get_global_id(1)); SWISH_I32_I32_PROCESS() } + +#define SWISH_F32_U8_PROCESS() \ + float4 src, tmp, data; \ + src = read_imagef(input, coord); \ + tmp.x = sigmoid_(src.x * beta, logE); \ + data.x = src.x * tmp.x; \ + uint4 dst = convert_uint4(data * outputScale + outputZP); \ + write_imageui(output, coord, dst); + +__kernel void swish_F32toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP, + float beta, + float logE) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + SWISH_F32_U8_PROCESS() +} + +__kernel void swish_F32toU8_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP, + float beta, + float logE) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + SWISH_F32_U8_PROCESS() +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl index 2596e66..0e6166c 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl @@ -4,6 +4,10 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag __read_only image2d_t input, \ __write_only image2d_t output, \ __write_only image2d_t indices, \ + float input_scale, \ + float input_tail, \ + float output_scale, \ + float output_tail, \ 
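The roi_align changes above make the out-of-range test selectable: with VSI_NN_ROI_ALIGN_ANDROID the integer low corner xy_low is range-checked, otherwise the floating-point sample position itself is checked. Within a bin each sample is an ordinary bilinear blend; a scalar sketch of the weights the kernels compute (hypothetical name, with neighbor clamping added here for safety, whereas the kernels skip out-of-range samples via continue):

#include <math.h>

/* Bilinear blend used by roi_align_1x1: mix the four neighbors of a
 * fractional sample position (x, y) in a w-by-h plane. */
static float bilinear_ref(const float *img, int w, int h, float x, float y)
{
    int x0 = (int)floorf(x), y0 = (int)floorf(y);
    int x1 = x0 + 1 < w ? x0 + 1 : w - 1;
    int y1 = y0 + 1 < h ? y0 + 1 : h - 1;
    float lx = x - floorf(x), ly = y - floorf(y);
    float w1 = (1 - ly) * (1 - lx), w2 = (1 - ly) * lx;   /* top pair    */
    float w3 = ly * (1 - lx),       w4 = ly * lx;         /* bottom pair */
    return w1 * img[y0 * w + x0] + w2 * img[y0 * w + x1] +
           w3 * img[y1 * w + x0] + w4 * img[y1 * w + x1];
}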
int num_stages, \ int width \ ) \ @@ -88,6 +92,10 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag __read_only image2d_t input, \ __write_only image2d_t output, \ __write_only image2d_t indices, \ + float input_scale, \ + float input_tail, \ + float output_scale, \ + float output_tail, \ int num_stages, \ int width \ ) \ @@ -172,6 +180,10 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag __read_only image2d_t input, \ __write_only image2d_t output, \ __write_only image2d_t indices, \ + float input_scale, \ + float input_tail, \ + float output_scale, \ + float output_tail, \ int num_stages, \ int width \ ) \ @@ -249,3 +261,179 @@ TOPK_I32(1 << 3, 3) TOPK_I32(1 << 4, 4) TOPK_I32(1 << 5, 5) TOPK_I32(1 << 6, 6) + +#define TOPK_F32toU32(LOCAL_SIZE0, STAGES) \ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toU32_I32 \ + ( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + __write_only image2d_t indices, \ + float input_scale, \ + float input_tail, \ + float output_scale, \ + float output_tail, \ + int num_stages, \ + int width \ + ) \ + { \ + uint local_id = get_local_id(0); \ + uint work_group_size = get_local_size(0); \ + uint offset = 0; \ + \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + __local float local_data[128]; \ + __local uint local_indices[128]; \ + \ + float left = read_imagef(input, coord.xy).x; \ + coord.z += work_group_size; \ + float data = read_imagef(input, coord.zy).x; \ + float right = coord.z < width ? data : -2147483647.0f; \ + \ + local_data[local_id] = left; \ + local_indices[local_id] = local_id; \ + local_data[local_id + work_group_size] = right; \ + local_indices[local_id + work_group_size] = local_id + work_group_size; \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + for (uint stage = 0; stage < num_stages + 1; ++stage) \ + { \ + uint signo = (local_id >> stage) & 1; \ + \ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + uint postShift = (stage - passOfStage); \ + uint pairDistance = 1 << postShift; \ + \ + uint left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \ + uint right_id = left_id + pairDistance; \ + \ + uint left_idx = local_indices[left_id]; \ + uint right_idx = local_indices[right_id]; \ + \ + float left_elem = local_data[left_id]; \ + float right_elem = local_data[right_id]; \ + \ + if ((left_elem < right_elem) ^ signo) \ + { \ + local_data[left_id] = right_elem; \ + local_data[right_id] = left_elem; \ + \ + local_indices[left_id] = right_idx; \ + local_indices[right_id] = left_idx; \ + } \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + } \ + } \ + \ + uint4 dst; \ + dst.x = convert_uint(local_data[local_id] * output_scale + output_tail); \ + dst.y = convert_uint(local_data[local_id + work_group_size] * output_scale + output_tail); \ + write_imageui(output, coord.xy, dst.xxxx); \ + write_imageui(output, coord.zy, dst.yyyy); \ + \ + int4 index; \ + index.x = ((int*)local_indices)[local_id]; \ + index.y = ((int*)local_indices)[local_id + work_group_size]; \ + \ + write_imagei(indices, coord.xy, index.xxxx); \ + write_imagei(indices, coord.zy, index.yyyy); \ + } + +TOPK_F32toU32(1 << 0, 0) +TOPK_F32toU32(1 << 1, 1) +TOPK_F32toU32(1 << 2, 2) +TOPK_F32toU32(1 << 3, 3) +TOPK_F32toU32(1 << 4, 4) +TOPK_F32toU32(1 << 5, 5) +TOPK_F32toU32(1 << 6, 6) + +#define TOPK_F32toI32(LOCAL_SIZE0, STAGES) \ +__kernel 
__attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toI32_I32 \ + ( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + __write_only image2d_t indices, \ + float input_scale, \ + float input_tail, \ + float output_scale, \ + float output_tail, \ + int num_stages, \ + int width \ + ) \ + { \ + uint local_id = get_local_id(0); \ + uint work_group_size = get_local_size(0); \ + uint offset = 0; \ + \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + __local float local_data[128]; \ + __local uint local_indices[128]; \ + \ + float left = read_imagef(input, coord.xy).x; \ + coord.z += work_group_size; \ + float data = read_imagef(input, coord.zy).x; \ + float right = coord.z < width ? data : -2147483647.0f; \ + \ + local_data[local_id] = left; \ + local_indices[local_id] = local_id; \ + local_data[local_id + work_group_size] = right; \ + local_indices[local_id + work_group_size] = local_id + work_group_size; \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + for (uint stage = 0; stage < num_stages + 1; ++stage) \ + { \ + uint signo = (local_id >> stage) & 1; \ + \ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + uint postShift = (stage - passOfStage); \ + uint pairDistance = 1 << postShift; \ + \ + uint left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \ + uint right_id = left_id + pairDistance; \ + \ + uint left_idx = local_indices[left_id]; \ + uint right_idx = local_indices[right_id]; \ + \ + float left_elem = local_data[left_id]; \ + float right_elem = local_data[right_id]; \ + \ + if ((left_elem < right_elem) ^ signo) \ + { \ + local_data[left_id] = right_elem; \ + local_data[right_id] = left_elem; \ + \ + local_indices[left_id] = right_idx; \ + local_indices[right_id] = left_idx; \ + } \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + } \ + } \ + \ + int4 dst; \ + dst.x = convert_int(local_data[local_id] * output_scale + output_tail); \ + dst.y = convert_int(local_data[local_id + work_group_size] * output_scale + output_tail); \ + write_imagei(output, coord.xy, dst.xxxx); \ + write_imagei(output, coord.zy, dst.yyyy); \ + \ + int4 index; \ + index.x = ((int*)local_indices)[local_id]; \ + index.y = ((int*)local_indices)[local_id + work_group_size]; \ + \ + write_imagei(indices, coord.xy, index.xxxx); \ + write_imagei(indices, coord.zy, index.yyyy); \ + } + +TOPK_F32toI32(1 << 0, 0) +TOPK_F32toI32(1 << 1, 1) +TOPK_F32toI32(1 << 2, 2) +TOPK_F32toI32(1 << 3, 3) +TOPK_F32toI32(1 << 4, 4) +TOPK_F32toI32(1 << 5, 5) +TOPK_F32toI32(1 << 6, 6) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort.cl index 0b8f988..beaaccb 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort.cl @@ -6,6 +6,10 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd image2d_t indices_t, __write_only image2d_t output, __write_only image2d_t indices, + float input_scale, + float input_tail, + float output_scale, + float output_tail, int width ) { @@ -115,6 +119,10 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd image2d_t indices_t, __write_only image2d_t output, __write_only image2d_t indices, + float input_scale, + float input_tail, + float output_scale, + float output_tail, int width ) { @@ -224,6 
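The new topk_stage* variants above follow the existing bitonic pattern: each work-item loads two elements work_group_size apart into local memory (padding out-of-range slots with -2147483647), the stage/pass loops compare-and-swap pairs at distance 2^(stage - pass) while carrying the source indices along, and only the final requantize-and-write step differs per type combination. A compact C model of the same compare-exchange network over a power-of-two array (illustrative only, sequential rather than work-group parallel):

#include <stddef.h>

/* Descending bitonic sort of n = 2^k values with index tracking, the
 * network the topk_stage kernels unroll across a work-group. */
static void bitonic_sort_desc(float *v, int *idx, size_t n)
{
    for (size_t size = 2; size <= n; size <<= 1)              /* stages */
        for (size_t stride = size >> 1; stride; stride >>= 1) /* passes */
            for (size_t i = 0; i < n; ++i) {
                size_t j = i ^ stride;
                if (j > i) {
                    int desc = (i & size) == 0;   /* block direction */
                    if ((v[i] < v[j]) == desc) {
                        float tv = v[i]; v[i] = v[j]; v[j] = tv;
                        int ti = idx[i]; idx[i] = idx[j]; idx[j] = ti;
                    }
                }
            }
}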
+232,10 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd image2d_t indices_t, __write_only image2d_t output, __write_only image2d_t indices, + float input_scale, + float input_tail, + float output_scale, + float output_tail, int width ) { @@ -324,4 +336,4 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(output, coord.xy, data); write_imagei(indices, coord.xy, index); } -} \ No newline at end of file +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort2.cl new file mode 100644 index 0000000..976da20 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort2.cl @@ -0,0 +1,230 @@ +#define LOCAL_SIZE_X (32) +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_F32toU32_I32 + ( + __read_only image2d_t input, + image2d_t input_t, + image2d_t indices_t, + __write_only image2d_t output, + __write_only image2d_t indices, + float input_scale, + float input_tail, + float output_scale, + float output_tail, + int width + ) + { + uint lid = get_local_id(0); + uint work_group_size = get_local_size(0); + uint offset = 0; + + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X) + { + float4 data = read_imagef(input, coord.xy); + + write_imagef(input_t, coord.xy, data); + write_imagei(indices_t, coord.xy, coord.xxxx); + } + + __local int sorted[1]; + int width_minus_one = width - 1; + int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X; + num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1); + + int x_start = lid * num_pixels_per_thread; + int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one); + + sorted[0] = 0; + + while (1) + { + if (lid == 0) + { + *sorted = 0; + } + int swapped = 0; + barrier(CLK_GLOBAL_MEM_FENCE); + + // odd-even + coord.x = x_start; + coord.z = x_start + 1; + for (; coord.x < x_end; ) + { + float4 left = read_imagef(input_t, coord.xy); + float4 right = read_imagef(input_t, coord.zy); + + if (left.x < right.x) + { + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); + swapped = 1; + + write_imagef(input_t, coord.xy, right); + write_imagef(input_t, coord.zy, left); + + write_imagei(indices_t, coord.xy, r_index); + write_imagei(indices_t, coord.zy, l_index); + } + + coord.xz = coord.xz + 2; + } + + // even-odd + coord.x = x_start + 1; + coord.z = x_start + 2; + for (; coord.x < x_end; ) + { + float4 left = read_imagef(input_t, coord.xy); + float4 right = read_imagef(input_t, coord.zy); + + if (left.x < right.x) + { + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); + swapped = 1; + + write_imagef(input_t, coord.xy, right); + write_imagef(input_t, coord.zy, left); + + write_imagei(indices_t, coord.xy, r_index); + write_imagei(indices_t, coord.zy, l_index); + } + + coord.xz = coord.xz + 2; + } + + atomic_add(sorted, swapped); + barrier(CLK_GLOBAL_MEM_FENCE); + + if (*sorted == 0) + break; + barrier(CLK_GLOBAL_MEM_FENCE); + } + + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X) + { + float4 data = read_imagef(input_t, coord.xy); + int4 index = read_imagei(indices_t, coord.xy); + + uint4 dst; + dst = convert_uint4(data * output_scale + output_tail); + write_imageui(output, 
coord.xy, dst); + write_imagei(indices, coord.xy, index); + } +} + +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_F32toI32_I32 + ( + __read_only image2d_t input, + image2d_t input_t, + image2d_t indices_t, + __write_only image2d_t output, + __write_only image2d_t indices, + float input_scale, + float input_tail, + float output_scale, + float output_tail, + int width + ) + { + uint lid = get_local_id(0); + uint work_group_size = get_local_size(0); + uint offset = 0; + + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X) + { + float4 data = read_imagef(input, coord.xy); + + write_imagef(input_t, coord.xy, data); + write_imagei(indices_t, coord.xy, coord.xxxx); + } + + __local int sorted[1]; + int width_minus_one = width - 1; + int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X; + num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1); + + int x_start = lid * num_pixels_per_thread; + int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one); + + sorted[0] = 0; + + while (1) + { + if (lid == 0) + { + *sorted = 0; + } + int swapped = 0; + barrier(CLK_GLOBAL_MEM_FENCE); + + // odd-even + coord.x = x_start; + coord.z = x_start + 1; + for (; coord.x < x_end; ) + { + float4 left = read_imagef(input_t, coord.xy); + float4 right = read_imagef(input_t, coord.zy); + + if (left.x < right.x) + { + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); + swapped = 1; + + write_imagef(input_t, coord.xy, right); + write_imagef(input_t, coord.zy, left); + + write_imagei(indices_t, coord.xy, r_index); + write_imagei(indices_t, coord.zy, l_index); + } + + coord.xz = coord.xz + 2; + } + + // even-odd + coord.x = x_start + 1; + coord.z = x_start + 2; + for (; coord.x < x_end; ) + { + float4 left = read_imagef(input_t, coord.xy); + float4 right = read_imagef(input_t, coord.zy); + + if (left.x < right.x) + { + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); + swapped = 1; + + write_imagef(input_t, coord.xy, right); + write_imagef(input_t, coord.zy, left); + + write_imagei(indices_t, coord.xy, r_index); + write_imagei(indices_t, coord.zy, l_index); + } + + coord.xz = coord.xz + 2; + } + + atomic_add(sorted, swapped); + barrier(CLK_GLOBAL_MEM_FENCE); + + if (*sorted == 0) + break; + barrier(CLK_GLOBAL_MEM_FENCE); + } + + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X) + { + float4 data = read_imagef(input_t, coord.xy); + int4 index = read_imagei(indices_t, coord.xy); + + int4 dst; + dst = convert_int4(data * output_scale + output_tail); + write_imagei(output, coord.xy, dst); + write_imagei(indices, coord.xy, index); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_BF16_to_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_BF16_to_BF16.vx new file mode 100644 index 0000000..cf895d8 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_BF16_to_BF16.vx @@ -0,0 +1,159 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniConvBF16toF32_even_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_odd_2x8; + +_viv_uniform VXC_512Bits uniBF16toFp32_part0_2x8; +_viv_uniform VXC_512Bits 
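topk_odd_even_sort2.cl above is the fallback for widths that exceed the bitonic network: a parallel odd-even transposition sort that alternates (odd, even) and (even, odd) neighbor exchanges until a full pass makes no swap, with the per-thread swap flags merged through atomic_add into a shared counter. Its sequential skeleton is the classic loop below (illustrative C):

#include <stddef.h>

/* Sequential skeleton of odd-even transposition sort (descending), the
 * algorithm the topk_odd_even_sort kernels parallelize per row. */
static void odd_even_sort_desc(float *v, int *idx, size_t n)
{
    int swapped = 1;
    while (swapped) {
        swapped = 0;
        for (size_t phase = 0; phase < 2; ++phase)     /* odd, then even */
            for (size_t i = phase; i + 1 < n; i += 2)
                if (v[i] < v[i + 1]) {
                    float tv = v[i]; v[i] = v[i + 1]; v[i + 1] = tv;
                    int ti = idx[i]; idx[i] = idx[i + 1]; idx[i + 1] = ti;
                    swapped = 1;
                }
    }
}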
uniBF16toFp32_part1_2x8; + +#define GRID_SAMPLE_BF16_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + float4 x_f = floor(in_x); \ + float4 x_lerp = in_x - x_f; \ + int4 x_idx = convert_int4(x_f); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + float4 y_f = floor(in_y); \ + float4 y_lerp = in_y - y_f; \ + int4 y_idx = convert_int4(y_f); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 top; \ + vxc_short8 bottom; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + float4 left4; \ + float4 right4; \ + float4 top4; \ + float4 bottom4; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + vxc_ushort8 tmp, dst; \ + while (coord_in.z < loop) \ + { \ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \ + _viv_asm(COPY, right4, src, 16); \ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \ + _viv_asm(COPY, left4, src, 16); \ + right4 -= left4; \ + top4 = right4 * x_lerp + left4; \ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \ + _viv_asm(COPY, right4, src, 16); \ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \ + _viv_asm(COPY, left4, src, 16); \ + right4 -= left4; \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + _viv_asm(COPY, tmp, dst4, 16); \ + dst.s0123 = tmp.s1357; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + 
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \ + _viv_asm(COPY, right4, src, 16); \ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \ + _viv_asm(COPY, left4, src, 16); \ + right4 -= left4; \ + top4 = right4 * x_lerp + left4; \ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \ + _viv_asm(COPY, right4, src, 16); \ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \ + _viv_asm(COPY, left4, src, 16); \ + right4 -= left4; \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + _viv_asm(COPY, tmp, dst4, 16); \ + dst.s0123 = tmp.s1357; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + + +__kernel void bilinear_grid_sample_BF16_BF16toBF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + + float4 fxy0; + float4 fxy1; + + vxc_short8 src; + VXC_DP2x8(src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part0_2x8); + _viv_asm(COPY, fxy0, src, 16); + VXC_DP2x8(src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part1_2x8); + _viv_asm(COPY, fxy1, src, 16); + + + + GRID_SAMPLE_BF16_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_F16_to_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_F16_to_F16.vx new file mode 100644 index 0000000..c0634f6 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_F16_to_F16.vx @@ -0,0 +1,205 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4; +_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4; +_viv_uniform VXC_512Bits uniExtactHalf8_2x8; + +#define GRID_SAMPLE_F16_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * 
half_input0_wh.xyxy + add_float_value.xyxy; \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + float4 x_f = floor(in_x); \ + float4 x_lerp = in_x - x_f; \ + int4 x_idx = convert_int4(x_f); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + float4 y_f = floor(in_y); \ + float4 y_lerp = in_y - y_f; \ + int4 y_idx = convert_int4(y_f); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 t0; \ + vxc_short8 b0; \ + vxc_half8 top; \ + vxc_half8 bottom; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, top, t0, 16); \ + _viv_asm(COPY, bottom, b0, 16); \ + float4 left4; \ + float4 right4; \ + float4 top4; \ + float4 bottom4; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + half4 tmp; \ + _viv_asm(CONV, tmp, dst4); \ + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); \ + vxc_short4 result; \ + _viv_asm(COPY, result, top, 8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, t0, 
input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, top, t0, 16); \ + _viv_asm(COPY, bottom, b0, 16); \ + } \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + half4 tmp; \ + _viv_asm(CONV, tmp, dst4); \ + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); \ + vxc_short4 result; \ + _viv_asm(COPY, result, top, 8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + +__kernel void bilinear_grid_sample_F16_F32toF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + coord_in1.z = coord_in1.z + 4; + + float4 fxy0 = read_imagef(input1, coord_in1.xy); + float4 fxy1 = read_imagef(input1, coord_in1.zw); + + GRID_SAMPLE_F16_PROCESS(); + +} + +_viv_uniform int input1_ZP; +_viv_uniform float input1Scale; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4; + +__kernel void bilinear_grid_sample_F16_U8toF16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + coord_in1.xz = coord_in1.xz * 2; + vxc_uchar16 read_coord; + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 fxy0; + float4 fxy1; + unsigned char input1ZP; + _viv_asm(COPY, input1ZP, input1_ZP, 4); + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); + fxy0 = fxy0 * input1Scale; + fxy1 = fxy1 * input1Scale; + + GRID_SAMPLE_F16_PROCESS(); + +} + + +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4; +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4; + +__kernel void bilinear_grid_sample_F16_F16toF16( + __read_only image2d_array_t input0, + __read_only 
image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_half8 read_coord; + + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, read_coord, read_val, 16); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + + GRID_SAMPLE_F16_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_F16_to_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_F16_to_U8.vx new file mode 100644 index 0000000..d462cf3 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_F16_to_U8.vx @@ -0,0 +1,212 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4; +_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4; +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform float uint8Scale; +_viv_uniform float output_ZP; + +#define GRID_SAMPLE_F16_to_U8_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + float4 x_f = floor(in_x); \ + float4 x_lerp = in_x - x_f; \ + int4 x_idx = convert_int4(x_f); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + float4 y_f = floor(in_y); \ + float4 y_lerp = in_y - y_f; \ + int4 y_idx = convert_int4(y_f); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 t0; \ + vxc_short8 b0; \ + vxc_uchar16 result; \ + vxc_half8 top; \ + vxc_half8 bottom; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, top, t0, 16); \ + _viv_asm(COPY, bottom, b0, 16); \ + float4 left4; \ + float4 right4; \ + float4 top4; \ + float4 bottom4; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, 
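Note: the DP4x4 pairs below implement the horizontal lerp without ever unpacking the taps: uniEvenBintoFp32_4x4 extracts the even lanes (the left pixels) to float, while uniOddSubEvenBin_4x4 produces odd minus even (right minus left), so a single multiply-add finishes each row, and the same trick is applied once more vertically. A scalar C sketch of the identity being used:

    /* Bilinear interpolation in lerp form, matching the shader's
     * left + (right - left) * x_lerp, then top + (bottom - top) * y_lerp. */
    static float bilinear_lerp(float tl, float tr, float bl, float br,
                               float x_lerp, float y_lerp)
    {
        float top    = tl + (tr - tl) * x_lerp;
        float bottom = bl + (br - bl) * x_lerp;
        return top + (bottom - top) * y_lerp;
    }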
sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * uint8Scale + output_ZP; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, top, t0, 16); \ + _viv_asm(COPY, bottom, b0, 16); \ + } \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * uint8Scale + output_ZP; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + +__kernel void bilinear_grid_sample_F16_F32toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); 
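Note: the only change from the F16 output path is the requantization dst4 * uint8Scale + output_ZP ahead of the convert_int4_rte and the saturating VXC_DP2x8 pack. Assuming the usual affine scheme q = x / scale + zp, uint8Scale would be 1 / output_scale on the host. A minimal C sketch of the same step:

    #include <math.h>
    #include <stdint.h>

    /* Affine float -> u8 requantization: round to nearest (ties to even
     * under the default rounding mode) and saturate, as the pack does. */
    static uint8_t quantize_u8(float x, float uint8Scale, float output_ZP)
    {
        long r = lrintf(x * uint8Scale + output_ZP);
        if (r < 0)   r = 0;
        if (r > 255) r = 255;
        return (uint8_t)r;
    }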
+ int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + coord_in1.z = coord_in1.z + 4; + + float4 fxy0 = read_imagef(input1, coord_in1.xy); + float4 fxy1 = read_imagef(input1, coord_in1.zw); + GRID_SAMPLE_F16_to_U8_PROCESS(); + +} + +_viv_uniform int input1_ZP; +_viv_uniform float input1Scale; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4; + + +__kernel void bilinear_grid_sample_F16_U8toU8( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_uchar16 read_coord; + + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + unsigned char input1ZP; + _viv_asm(COPY, input1ZP, input1_ZP, 4); + + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1Scale; + fxy1 = fxy1 * input1Scale; + + GRID_SAMPLE_F16_to_U8_PROCESS(); + +} + +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4; +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4; + +__kernel void bilinear_grid_sample_F16_F16toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_half8 read_coord; + + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, read_coord, read_val, 16); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + + GRID_SAMPLE_F16_to_U8_PROCESS(); + +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_I16_to_I16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_I16_to_I16.vx new file mode 100644 index 0000000..23d34e7 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_I16_to_I16.vx @@ -0,0 +1,148 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniRightSubLeft_4x4; +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4; +_viv_uniform float input1_scale; +_viv_uniform float dfpScale; + +#define GRID_SAMPLE_I16_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + float4 x_f = floor(in_x); \ + float4 x_lerp = in_x - x_f; \ + int4 x_idx = convert_int4(x_f); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + float4 y_f = floor(in_y); \ + float4 y_lerp = in_y - y_f; \ + int4 y_idx = convert_int4(y_f); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ 
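Note: bilinear_grid_sample_F16_U8toU8 above consumes a quantized grid: the two uniU8SubZPtoFp32_part*_4x4 dot products subtract input1_ZP lane-wise and the input1Scale multiply finishes the dequantization before the shared coordinate math; the F16-grid variant does the same with a plain fp16 to fp32 widen. The scalar equivalent:

    #include <stdint.h>

    /* Affine u8 -> float dequantization of one grid coordinate. */
    static float dequantize_u8(uint8_t q, int input1_ZP, float input1Scale)
    {
        return (float)(q - input1_ZP) * input1Scale;
    }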
+ int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_short8 top; \ + vxc_short8 bottom; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + float4 left4; \ + float4 right4; \ + float4 top4; \ + float4 bottom4; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * dfpScale; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), 
VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * dfpScale; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + +__kernel void bilinear_grid_sample_I16_I16toI16( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + vxc_short8 read_coord; + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1_scale; + fxy1 = fxy1 * input1_scale; + + GRID_SAMPLE_I16_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_I8_to_I8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_I8_to_I8.vx new file mode 100644 index 0000000..ea1b600 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_I8_to_I8.vx @@ -0,0 +1,148 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniRightSubLeft_4x4; +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4; +_viv_uniform float input1_scale; +_viv_uniform float dfpScale; + +#define GRID_SAMPLE_I8_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + float4 x_f = floor(in_x); \ + float4 x_lerp = in_x - x_f; \ + int4 x_idx = convert_int4(x_f); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + float4 y_f = floor(in_y); \ + float4 y_lerp = in_y - y_f; \ + int4 y_idx = convert_int4(y_f); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_char16 top; \ + vxc_char16 bottom; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 
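Note: the I16 path here works in dynamic fixed point: uniDFPtoFp32_left_4x4 widens the raw int16 taps, and because bilinear interpolation is linear, the input dequantize and output requantize collapse into the single dfpScale multiply at the end. Presumably dfpScale = 2^(out_fl - in_fl) on the host; a sketch with hypothetical fraction-length names:

    #include <math.h>
    #include <stdint.h>

    /* DFP rescale: value = q * 2^-in_fl, re-coded as round(value * 2^out_fl),
     * folded into one multiply exactly like the shader's dfpScale. */
    static int16_t dfp_rescale(int16_t q, int in_fl, int out_fl)
    {
        float dfpScale = ldexpf(1.0f, out_fl - in_fl);
        long r = lrintf((float)q * dfpScale);
        if (r >  32767) r =  32767;
        if (r < -32768) r = -32768;
        return (int16_t)r;
    }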
0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + float4 left4; \ + float4 right4; \ + float4 top4; \ + float4 bottom4; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * dfpScale; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, 
coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * dfpScale; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + + +__kernel void bilinear_grid_sample_I8_I8toI8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + vxc_char16 read_coord; + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1_scale; + fxy1 = fxy1 * input1_scale; + + GRID_SAMPLE_I8_PROCESS(); + +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_U8_to_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_U8_to_U8.vx new file mode 100644 index 0000000..c8e330b --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bilinear_grid_sample_U8_to_U8.vx @@ -0,0 +1,213 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float2 half_input0_wh; +_viv_uniform float2 add_float_value; +_viv_uniform int depth; + +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4; +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform int input_ZP; +_viv_uniform float uint8Scale; +_viv_uniform float output_ZP; +_viv_uniform int input1_ZP; +_viv_uniform float input1Scale; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4; +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4; + + +#define GRID_SAMPLE_U8_PROCESS() \ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \ + float4 x_f = floor(in_x); \ + float4 x_lerp = in_x - x_f; \ + int4 x_idx = convert_int4(x_f); \ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \ + float4 y_f = floor(in_y); \ + float4 y_lerp = in_y - y_f; \ + int4 y_idx = convert_int4(y_f); \ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \ + int baseAddr = input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + vxc_uchar16 top; \ + vxc_uchar16 bottom; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, 
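Note: every *_PROCESS macro in these files shares the same software-pipelined structure: the eight taps for slice 0 are loaded before the loop, each iteration interpolates and stores slice z while already issuing the loads for slice z + 1, and the last slice is peeled after the while exit. Reduced to C control flow (helper names hypothetical):

    #include <stdio.h>

    /* Stand-ins for the img_load_3d group and the DP4x4 + img_store_3d group. */
    static void load_taps(int z)      { printf("load taps for slice %d\n", z); }
    static void lerp_and_store(int z) { printf("store slice %d\n", z); }

    /* Software-pipelined depth loop: the math for slice z overlaps the loads
     * for slice z + 1; the final slice is peeled after the
     * while (coord_in.z < depth - 1) loop. */
    static void grid_sample_depth_loop(int depth)
    {
        load_taps(0);
        for (int z = 0; z < depth - 1; ++z) {
            lerp_and_store(z);
            load_taps(z + 1);
        }
        lerp_and_store(depth - 1);
    }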
VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + float4 left4; \ + float4 right4; \ + float4 top4; \ + float4 bottom4; \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + int loop = depth - 1; \ + while (coord_in.z < loop) \ + { \ + unsigned char inputZP; \ + _viv_asm(COPY, inputZP, input_ZP, 4); \ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * uint8Scale + output_ZP; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \ + coord_in.x = x_idx.x; \ + coord_in.y = y_idx.x; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.y; \ + coord_in.y = y_idx.y; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.z; \ + coord_in.y = y_idx.z; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = x_idx.w; \ + coord_in.y = y_idx.w; \ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, 
VXC_RM_TowardZero, 0)); \ + } \ + unsigned char inputZP; \ + _viv_asm(COPY, inputZP, input_ZP, 4); \ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * uint8Scale + output_ZP; \ + int4 dst = convert_int4_rte(dst4); \ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + + +__kernel void bilinear_grid_sample_U8_F32toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + coord_in1.z = coord_in1.z + 4; + + float4 fxy0 = read_imagef(input1, coord_in1.xy); + float4 fxy1 = read_imagef(input1, coord_in1.zw); + GRID_SAMPLE_U8_PROCESS(); + +} + + +__kernel void bilinear_grid_sample_U8_U8toU8( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_uchar16 read_coord; + + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + float4 fxy0; + float4 fxy1; + + unsigned char input1ZP; + _viv_asm(COPY, input1ZP, input1_ZP, 4); + + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4); + + fxy0 = fxy0 * input1Scale; + fxy1 = fxy1 * input1Scale; + + GRID_SAMPLE_U8_PROCESS(); + +} + +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4; +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4; + +__kernel void bilinear_grid_sample_U8_F16toU8( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int align_corners) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_in1 = coord_out.xyxy; + + coord_in1.xz = coord_in1.xz * 2; + + vxc_short8 read_val; + vxc_half8 read_coord; + + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, read_coord, read_val, 16); + + float4 fxy0; + float4 fxy1; + + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4); + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4); + + GRID_SAMPLE_U8_PROCESS(); + +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_0.vx index 69c3ede..5443a7f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_0.vx +++ 
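Note: in the U8 input path above, only the left taps go through the zero-point subtraction (uniU8SubZPtoFp32_left_4x4); uniU8RightSubLeft_4x4 can work on the raw codes because the zero point cancels in the difference, which saves a subtraction per lane. Scalar form of the horizontal step:

    #include <stdint.h>

    /* With x = (q - zp) * s: (q_r - zp) - (q_l - zp) = q_r - q_l, so only
     * 'left' needs the zero point; the scale is folded into uint8Scale later. */
    static float horizontal_lerp_codes(uint8_t q_l, uint8_t q_r, int zp,
                                       float x_lerp)
    {
        float left  = (float)(q_l - zp);
        float delta = (float)(q_r - q_l);
        return delta * x_lerp + left;
    }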
b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_0.vx @@ -101,6 +101,15 @@ float4 eltwise_unary_hard_gelu(float4 x) return x * cdf; } +float4 eltwise_unary_inverse_sigmoid(float4 x) +{ + float4 x1, x2; + x = clamp(x, 0, 1); + x1 = clamp(x, alpha, 1); + x2 = clamp((1 - x), alpha, 1); + return log(x1 / x2); +} + _viv_uniform float inputScale; _viv_uniform float inputTail; _viv_uniform float outputScale; @@ -200,6 +209,17 @@ ELTSISE_UNARY_2D(hard_gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, ELTSISE_UNARY_2D(hard_gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) ELTSISE_UNARY_2D(hard_gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) ELTSISE_UNARY_2D(hard_gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//INVERSE_SIGMOID +ELTSISE_UNARY_2D(inverse_sigmoid, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(inverse_sigmoid, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(inverse_sigmoid, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(inverse_sigmoid, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(inverse_sigmoid, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(inverse_sigmoid, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(inverse_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(inverse_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(inverse_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(inverse_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; @@ -244,3 +264,5 @@ ELTSISE_UNARY_BF16_2D(round) ELTSISE_UNARY_BF16_2D(gelu) //HARD_GELU ELTSISE_UNARY_BF16_2D(hard_gelu) +//INVERSE_SIGMOID +ELTSISE_UNARY_BF16_2D(inverse_sigmoid) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx index 8b7a639..9a6a9fe 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx @@ -66,6 +66,21 @@ float4 eltwise_unary_softsign(float4 val) return val * _rcp; } +float4 eltwise_unary_atan(float4 val) +{ + return atan(val); +} + +float4 eltwise_unary_atanh(float4 val) +{ + return atanh(val); +} + +float4 eltwise_unary_acosh(float4 val) +{ + return acosh(val); +} + _viv_uniform float inputScale; _viv_uniform float inputTail; _viv_uniform float outputScale; @@ -176,4 +191,11 @@ ADD_ELTSISE_UNARY_2D(rcp) //SIGN ADD_ELTSISE_UNARY_2D(sign) //SOFTSIGN -ADD_ELTSISE_UNARY_2D(softsign) \ No newline at end of file +ADD_ELTSISE_UNARY_2D(softsign) +//ATAN +ADD_ELTSISE_UNARY_2D(atan) +//ATANH +ADD_ELTSISE_UNARY_2D(atanh) +//ACOSH +ADD_ELTSISE_UNARY_2D(acosh) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_0.vx index d04ec5a..edaa1fe 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_0.vx @@ -101,6 +101,15 @@ float4 eltwise_unary_hard_gelu(float4 x) return x * cdf; } +float4 eltwise_unary_inverse_sigmoid(float4 x) +{ + float4 x1, x2; + x = clamp(x, 0, 1); + 
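Note: eltwise_unary_inverse_sigmoid (added identically to the 2D and 3D files) is the logit, log(x / (1 - x)), with both the numerator and the denominator clamped to [alpha, 1] so the log never sees zero; alpha is presumably a small positive _viv_uniform declared elsewhere in these files. The atan/atanh/acosh additions are straight wrappers over the OpenCL builtins. C reference:

    #include <math.h>

    /* Clamped logit: clamp x to [0,1], then log(max(x,a) / max(1-x,a)). */
    static float inverse_sigmoid_ref(float x, float alpha)
    {
        x = fminf(fmaxf(x, 0.0f), 1.0f);
        float num = fminf(fmaxf(x, alpha), 1.0f);
        float den = fminf(fmaxf(1.0f - x, alpha), 1.0f);
        return logf(num / den);
    }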
x1 = clamp(x, alpha, 1); + x2 = clamp((1 - x), alpha, 1); + return log(x1 / x2); +} + _viv_uniform float inputScale; _viv_uniform float inputTail; _viv_uniform float outputScale; @@ -200,6 +209,17 @@ ELTSISE_UNARY_3D(hard_gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, ELTSISE_UNARY_3D(hard_gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) ELTSISE_UNARY_3D(hard_gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) ELTSISE_UNARY_3D(hard_gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//INVERSE_SIGMOID +ELTSISE_UNARY_3D(inverse_sigmoid, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(inverse_sigmoid, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(inverse_sigmoid, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(inverse_sigmoid, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(inverse_sigmoid, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(inverse_sigmoid, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(inverse_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(inverse_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(inverse_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(inverse_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; @@ -242,4 +262,6 @@ ELTSISE_UNARY_BF16(round) //GELU ELTSISE_UNARY_BF16(gelu) //HARD_GELU -ELTSISE_UNARY_BF16(hard_gelu) \ No newline at end of file +ELTSISE_UNARY_BF16(hard_gelu) +//INVERSE_SIGMOID +ELTSISE_UNARY_BF16(inverse_sigmoid) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx index df52777..f53c3ff 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx @@ -66,6 +66,21 @@ float4 eltwise_unary_softsign(float4 val) return val * _rcp; } +float4 eltwise_unary_atan(float4 val) +{ + return atan(val); +} + +float4 eltwise_unary_atanh(float4 val) +{ + return atanh(val); +} + +float4 eltwise_unary_acosh(float4 val) +{ + return acosh(val); +} + _viv_uniform float inputScale; _viv_uniform float inputTail; _viv_uniform float outputScale; @@ -176,3 +191,9 @@ ADD_ELTSISE_UNARY_3D(rcp) ADD_ELTSISE_UNARY_3D(sign) //SOFTSIGN ADD_ELTSISE_UNARY_3D(softsign) +//ATAN +ADD_ELTSISE_UNARY_3D(atan) +//ATANH +ADD_ELTSISE_UNARY_3D(atanh) +//ACOSH +ADD_ELTSISE_UNARY_3D(acosh) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx new file mode 100644 index 0000000..c479a3b --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx @@ -0,0 +1,94 @@ +#include "cl_viv_vx_ext.h" + +__kernel void gather_nd_batch_I8toI8_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // batch + + int4 coord = (int4)(gidx, gidy, 0, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy); + 
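Note: the 1D batch gather kernels added here all follow one pattern: read a single int32 index for batch row b from input1, then copy a block_size-wide run, so output[b][x] = input0[b][idx[b] * block_size + x]. Only the element type differs between the four variants. Row-major C reference (memory-layout assumptions mine):

    #include <stdint.h>

    /* gather_nd batch 1D reference: one index per batch row, block copy. */
    static void gather_nd_batch_1d_ref(const int8_t *input0, const int32_t *idx,
                                       int8_t *output, int batch,
                                       int block_size, int in_width)
    {
        for (int b = 0; b < batch; ++b)
            for (int x = 0; x < block_size; ++x)
                output[b * block_size + x] =
                    input0[b * in_width + idx[b] * block_size + x];
    }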
int4 indice = ((int4 *)indice_ptr)[0]; + + coord.z = indice.x * block_size + gidx; + + vxc_char16 src; + VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_nd_batch_U8toU8_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // batch num + + int4 coord = (int4)(gidx, gidy, 0, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy); + int4 indice = ((int4 *)indice_ptr)[0]; + + coord.z = indice.x * block_size + gidx; + + vxc_uchar16 src; + VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_nd_batch_I16toI16_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // batch num + + int4 coord = (int4)(gidx, gidy, 0, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy); + int4 indice = ((int4 *)indice_ptr)[0]; + + coord.z = indice.x * block_size + gidx; + + vxc_short8 src; + VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_nd_batch_F16toF16_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // batch num + + int4 coord = (int4)(gidx, gidy, 0, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy); + int4 indice = ((int4 *)indice_ptr)[0]; + + coord.z = indice.x * block_size + gidx; + + vxc_short8 src; + VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx new file mode 100644 index 0000000..acc6c4c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx @@ -0,0 +1,98 @@ +#include "cl_viv_vx_ext.h" + +__kernel void gather_nd_batch_I8toI8_2D( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // batch num + + int4 coord = (int4)(gidx, 0, gidy, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.zw = coord.zw; + + vxc_char16 src; + VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_nd_batch_U8toU8_2D( + __read_only image2d_array_t input0,
+ __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // batch num + + int4 coord = (int4)(gidx, 0, gidy, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.zw = coord.zw; + + vxc_uchar16 src; + VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_nd_batch_I16toI16_2D( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // batch num + + int4 coord = (int4)(gidx, 0, gidy, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.zw = coord.zw; + + vxc_short8 src; + VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void gather_nd_batch_F16toF16_2D( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // batch num + + int4 coord = (int4)(gidx, 0, gidy, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.zw = coord.zw; + + vxc_short8 src; + VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/l1norm_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/l1norm_axis0.vx new file mode 100644 index 0000000..9e7b5e1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/l1norm_axis0.vx @@ -0,0 +1,166 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable + +#include "cl_viv_vx_ext.h" + +#define epsilon 1e-12 + +_viv_uniform VXC_512Bits uniExtract8Bin_2x8; +_viv_uniform VXC_512Bits ExtractBin_part0_4x4; +_viv_uniform VXC_512Bits ExtractBin_part1_4x4; + + + +#define L1_NORM_AXIS0_SH_2D(name0, name1, src_type, conv_type0, conv_type1, dst_type, save_type) \ +__kernel void l1norm_##name0##to##name1##_2D_axis0 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float inputZp, \ + float outputscale, \ + float outputtail, \ + int axis, \ + int axis_size) \ +{ \ + int2 coord = (int2)(0, get_global_id(0)); \ + src_type v0, v1; \ + conv_type0 src0, src1; \ + conv_type1 dst0, dst1; \ + dst_type dst; \ + save_type out; \ + float4 src0_f, src1_f, src2_f, src3_f; \ + \ + float4 sum = 0; \ + float4 total = 0; \ + float4 rcp_total = 0; \ + half4 rcp_total_half = 0; \ + float4 one4 = (float4)(1.0f, 1.0f, 1.0f, 1.0f); \ + do \ + { \ + VXC_ReadImage(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, v0, 16); \ + VXC_ReadImage(v1,
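Note: the 2D batch gather kernels read an index pair per batch: the first component picks the block column (scaled by block_size), the second the row, and the batch id lands in the image-array z coordinate. In flat C terms (index component order as the kernels consume it; memory-layout assumptions mine):

    #include <stdint.h>

    /* gather_nd batch 2D reference:
     * output[b][x] = input0[b][iy][ix * block_size + x]. */
    static void gather_nd_batch_2d_ref(const uint8_t *input0, const int32_t *idx,
                                       uint8_t *output, int batch,
                                       int block_size, int in_w, int in_h)
    {
        for (int b = 0; b < batch; ++b) {
            int ix = idx[b * 2 + 0];
            int iy = idx[b * 2 + 1];
            for (int x = 0; x < block_size; ++x)
                output[b * block_size + x] =
                    input0[(b * in_h + iy) * in_w + ix * block_size + x];
        }
    }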
input, coord, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, v1, 16); \ + coord.x = coord.x + 16; \ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \ + VXC_DP4x4(src1_f, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \ + VXC_DP4x4(src2_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \ + VXC_DP4x4(src3_f, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \ + src0_f = fabs(src0_f - inputZp); \ + src1_f = fabs(src1_f - inputZp); \ + src2_f = fabs(src2_f - inputZp); \ + src3_f = fabs(src3_f - inputZp); \ + sum = src0_f + src1_f + src2_f + src3_f; \ + total = total + dot(sum, one4); \ + } while (coord.x < axis_size); \ + \ + total = total > epsilon ? total : epsilon; \ + rcp_total = 1 / total * outputscale; \ + coord.x = 0; \ + do \ + { \ + VXC_ReadImage(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, v0, 16); \ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \ + VXC_DP4x4(src1_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \ + src0_f = (src0_f - inputZp) * rcp_total.x + outputtail; \ + src1_f = (src1_f - inputZp) * rcp_total.x + outputtail; \ + _viv_asm(CONV_RTE, dst0, src0_f); \ + _viv_asm(CONV_RTE, dst1, src1_f); \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8); \ + _viv_asm(COPY, out, dst, 16); \ + VXC_WriteImage(output, coord, out, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x = coord.x + 8; \ + } while (coord.x < axis_size); \ +} +L1_NORM_AXIS0_SH_2D(U8, F16,vxc_uchar8,vxc_uchar8,half4, vxc_half8, vxc_short8) +L1_NORM_AXIS0_SH_2D(U8, U8, vxc_uchar8,vxc_uchar8,short4,vxc_uchar8,vxc_uchar8) +L1_NORM_AXIS0_SH_2D(I8, F16,vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +L1_NORM_AXIS0_SH_2D(I8, I8, vxc_char8, vxc_char8, short4,vxc_char8, vxc_char8) +L1_NORM_AXIS0_SH_2D(I16,F16,vxc_short8,vxc_short8,half4, vxc_half8, vxc_short8) +L1_NORM_AXIS0_SH_2D(I16,I16,vxc_short8,vxc_short8,short4,vxc_short8,vxc_short8) +L1_NORM_AXIS0_SH_2D(F16,U8, vxc_short8,vxc_half8, short4,vxc_uchar8,vxc_uchar8) +L1_NORM_AXIS0_SH_2D(F16,I8, vxc_short8,vxc_half8, short4,vxc_char8, vxc_char8) +L1_NORM_AXIS0_SH_2D(F16,I16,vxc_short8,vxc_half8, short4,vxc_short8,vxc_short8) +L1_NORM_AXIS0_SH_2D(F16,F16,vxc_short8,vxc_half8, half4, vxc_half8, vxc_short8) + + +#define L1_NORM_AXIS0_SH(name0, name1, src_type, conv_type0, conv_type1, dst_type, save_type) \ +__kernel void l1norm_##name0##to##name1##_axis0 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float inputZp, \ + float outputscale, \ + float outputtail, \ + int axis, \ + int axis_size) \ +{ \ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); \ + src_type v0, v1; \ + conv_type0 src0, src1; \ + conv_type1 dst0, dst1; \ + dst_type dst; \ + save_type out; \ + float4 src0_f, src1_f, src2_f, src3_f; \ + \ + float4 sum = 0; \ + float4 total = 0; \ + float4 rcp_total = 0; \ + half4 rcp_total_half = 0; \ + float4 one4 = (float4)(1.0f, 1.0f, 1.0f, 1.0f); \ + do \ + { \ + VXC_ReadImage2DArray(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, v0, 16); \ + VXC_ReadImage2DArray(v1, input, coord, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, 
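Note: the l1norm axis-0 kernels here are two-pass: the first do/while accumulates sum |x - zp| across the row sixteen elements at a time (the dot against one4 collapses the partial float4 sums to a scalar), and the second pass rescales every element by outputscale / max(total, epsilon) before requantizing. The axis-1 and axis-2 files below are the same algorithm marched along a different axis, keeping eight per-lane totals instead of one scalar. Per-row C reference:

    #include <math.h>

    /* Two-pass L1 normalization of one row:
     * out[i] = (x[i] - zp) * outputscale / sum_j |x[j] - zp| + outputtail. */
    static void l1norm_row_ref(const float *x, float *out, int n,
                               float inputZp, float outputscale, float outputtail)
    {
        const float epsilon = 1e-12f;
        float total = 0.0f;
        for (int i = 0; i < n; ++i)
            total += fabsf(x[i] - inputZp);
        total = total > epsilon ? total : epsilon;   /* guard the division */
        float rcp_total = outputscale / total;
        for (int i = 0; i < n; ++i)
            out[i] = (x[i] - inputZp) * rcp_total + outputtail;
    }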
VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, v1, 16); \ + coord.x = coord.x + 16; \ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \ + VXC_DP4x4(src1_f, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \ + VXC_DP4x4(src2_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \ + VXC_DP4x4(src3_f, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \ + src0_f = fabs(src0_f - inputZp); \ + src1_f = fabs(src1_f - inputZp); \ + src2_f = fabs(src2_f - inputZp); \ + src3_f = fabs(src3_f - inputZp); \ + sum = src0_f + src1_f + src2_f + src3_f; \ + total = total + dot(sum, one4); \ + } while (coord.x < axis_size); \ + \ + total = total > epsilon ? total : epsilon; \ + rcp_total = 1 / total * outputscale; \ + coord.x = 0; \ + do \ + { \ + VXC_ReadImage2DArray(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, v0, 16); \ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \ + VXC_DP4x4(src1_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \ + src0_f = (src0_f - inputZp) * rcp_total.x + outputtail; \ + src1_f = (src1_f - inputZp) * rcp_total.x + outputtail; \ + _viv_asm(CONV_RTE, dst0, src0_f); \ + _viv_asm(CONV_RTE, dst1, src1_f); \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8); \ + _viv_asm(COPY, out, dst, 16); \ + VXC_WriteImage2DArray(output, coord, out, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x = coord.x + 8; \ + } while (coord.x < axis_size); \ +} +L1_NORM_AXIS0_SH(U8, F16,vxc_uchar8,vxc_uchar8,half4, vxc_half8, vxc_short8) +L1_NORM_AXIS0_SH(U8, U8, vxc_uchar8,vxc_uchar8,short4,vxc_uchar8,vxc_uchar8) +L1_NORM_AXIS0_SH(I8, F16,vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +L1_NORM_AXIS0_SH(I8, I8, vxc_char8, vxc_char8, short4,vxc_char8, vxc_char8) +L1_NORM_AXIS0_SH(I16,F16,vxc_short8,vxc_short8,half4, vxc_half8, vxc_short8) +L1_NORM_AXIS0_SH(I16,I16,vxc_short8,vxc_short8,short4,vxc_short8,vxc_short8) +L1_NORM_AXIS0_SH(F16,U8, vxc_short8,vxc_half8, short4,vxc_uchar8,vxc_uchar8) +L1_NORM_AXIS0_SH(F16,I8, vxc_short8,vxc_half8, short4,vxc_char8, vxc_char8) +L1_NORM_AXIS0_SH(F16,I16,vxc_short8,vxc_half8, short4,vxc_short8,vxc_short8) +L1_NORM_AXIS0_SH(F16,F16,vxc_short8,vxc_half8, half4, vxc_half8, vxc_short8) + + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/l1norm_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/l1norm_axis1.vx new file mode 100644 index 0000000..6cca631 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/l1norm_axis1.vx @@ -0,0 +1,147 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable + +#include "cl_viv_vx_ext.h" + +#define epsilon 1e-12 + +_viv_uniform VXC_512Bits uniExtract8Bin_2x8; +_viv_uniform VXC_512Bits ExtractBin_part0_4x4; +_viv_uniform VXC_512Bits ExtractBin_part1_4x4; + + +#define L1_NORM_AXIS1_SH_2D(name0, name1, src_type, conv_type0, conv_type1, dst_type, save_type) \ +__kernel void l1norm_##name0##to##name1##_2D_axis1 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float inputZp, \ + float outputscale, \ + float outputtail, \ + int axis, \ + int axis_size) \ +{ \ + int2 coord = (int2)(get_global_id(0),0); \ + src_type v0; \ + conv_type0 src0; \ + dst_type dst; \ + conv_type1 dst0, dst1; \ + save_type out; \ + float4 src0_f, src1_f; \ + \ + 
float4 total0 = 0, total1 = 0; \ + float4 rcp_total0 = 0, rcp_total1 = 0; \ + do \ + { \ + VXC_ReadImage(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, v0, 16); \ + coord.y = coord.y + 1; \ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \ + VXC_DP4x4(src1_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \ + src0_f = fabs(src0_f - inputZp); \ + src1_f = fabs(src1_f - inputZp); \ + total0 = total0 + src0_f; \ + total1 = total1 + src1_f; \ + } while (coord.y < axis_size); \ + \ + total0 = total0 > epsilon ? total0 : epsilon; \ + total1 = total1 > epsilon ? total1 : epsilon; \ + rcp_total0 = 1 / total0 * outputscale; \ + rcp_total1 = 1 / total1 * outputscale; \ + coord.y = 0; \ + do \ + { \ + VXC_ReadImage(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, v0, 16); \ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \ + VXC_DP4x4(src1_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \ + src0_f = (src0_f - inputZp) * rcp_total0 + outputtail; \ + src1_f = (src1_f - inputZp) * rcp_total1 + outputtail; \ + _viv_asm(CONV_RTE, dst0, src0_f); \ + _viv_asm(CONV_RTE, dst1, src1_f); \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8); \ + _viv_asm(COPY, out, dst, 16); \ + VXC_WriteImage(output, coord, out, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y = coord.y + 1; \ + } while (coord.y < axis_size); \ +} +L1_NORM_AXIS1_SH_2D(U8, F16,vxc_uchar8,vxc_uchar8,half4, vxc_half8, vxc_short8) +L1_NORM_AXIS1_SH_2D(U8, U8, vxc_uchar8,vxc_uchar8,short4,vxc_uchar8,vxc_uchar8) +L1_NORM_AXIS1_SH_2D(I8, F16,vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +L1_NORM_AXIS1_SH_2D(I8, I8, vxc_char8, vxc_char8, short4,vxc_char8, vxc_char8) +L1_NORM_AXIS1_SH_2D(I16,F16,vxc_short8,vxc_short8,half4, vxc_half8, vxc_short8) +L1_NORM_AXIS1_SH_2D(I16,I16,vxc_short8,vxc_short8,short4,vxc_short8,vxc_short8) +L1_NORM_AXIS1_SH_2D(F16,U8, vxc_short8,vxc_half8, short4,vxc_uchar8,vxc_uchar8) +L1_NORM_AXIS1_SH_2D(F16,I8, vxc_short8,vxc_half8, short4,vxc_char8, vxc_char8) +L1_NORM_AXIS1_SH_2D(F16,I16,vxc_short8,vxc_half8, short4,vxc_short8,vxc_short8) +L1_NORM_AXIS1_SH_2D(F16,F16,vxc_short8,vxc_half8, half4, vxc_half8, vxc_short8) + + +#define L1_NORM_AXIS1_SH(name0, name1, src_type, conv_type0, conv_type1, dst_type, save_type) \ +__kernel void l1norm_##name0##to##name1##_axis1 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float inputZp, \ + float outputscale, \ + float outputtail, \ + int axis, \ + int axis_size) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); \ + src_type v0; \ + conv_type0 src0; \ + dst_type dst; \ + conv_type1 dst0, dst1; \ + save_type out; \ + float4 src0_f, src1_f; \ + \ + float4 total0 = 0, total1 = 0; \ + float4 rcp_total0 = 0, rcp_total1 = 0; \ + do \ + { \ + VXC_ReadImage2DArray(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, v0, 16); \ + coord.y = coord.y + 1; \ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \ + VXC_DP4x4(src1_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \ + src0_f = fabs(src0_f - inputZp); \ + src1_f = 
fabs(src1_f - inputZp); \ + total0 = total0 + src0_f; \ + total1 = total1 + src1_f; \ + } while (coord.y < axis_size); \ + \ + total0 = total0 > epsilon ? total0 : epsilon; \ + total1 = total1 > epsilon ? total1 : epsilon; \ + rcp_total0 = 1 / total0 * outputscale; \ + rcp_total1 = 1 / total1 * outputscale; \ + coord.y = 0; \ + do \ + { \ + VXC_ReadImage2DArray(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, v0, 16); \ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \ + VXC_DP4x4(src1_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \ + src0_f = (src0_f - inputZp) * rcp_total0 + outputtail; \ + src1_f = (src1_f - inputZp) * rcp_total1 + outputtail; \ + _viv_asm(CONV_RTE, dst0, src0_f); \ + _viv_asm(CONV_RTE, dst1, src1_f); \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8); \ + _viv_asm(COPY, out, dst, 16); \ + VXC_WriteImage2DArray(output, coord, out, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y = coord.y + 1; \ + } while (coord.y < axis_size); \ +} +L1_NORM_AXIS1_SH(U8, F16,vxc_uchar8,vxc_uchar8,half4, vxc_half8, vxc_short8) +L1_NORM_AXIS1_SH(U8, U8, vxc_uchar8,vxc_uchar8,short4,vxc_uchar8,vxc_uchar8) +L1_NORM_AXIS1_SH(I8, F16,vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +L1_NORM_AXIS1_SH(I8, I8, vxc_char8, vxc_char8, short4,vxc_char8, vxc_char8) +L1_NORM_AXIS1_SH(I16,F16,vxc_short8,vxc_short8,half4, vxc_half8, vxc_short8) +L1_NORM_AXIS1_SH(I16,I16,vxc_short8,vxc_short8,short4,vxc_short8,vxc_short8) +L1_NORM_AXIS1_SH(F16,U8, vxc_short8,vxc_half8, short4,vxc_uchar8,vxc_uchar8) +L1_NORM_AXIS1_SH(F16,I8, vxc_short8,vxc_half8, short4,vxc_char8, vxc_char8) +L1_NORM_AXIS1_SH(F16,I16,vxc_short8,vxc_half8, short4,vxc_short8,vxc_short8) +L1_NORM_AXIS1_SH(F16,F16,vxc_short8,vxc_half8, half4, vxc_half8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/l1norm_axis2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/l1norm_axis2.vx new file mode 100644 index 0000000..6e7e067 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/l1norm_axis2.vx @@ -0,0 +1,78 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable + +#include "cl_viv_vx_ext.h" + +#define epsilon 1e-12 + +_viv_uniform VXC_512Bits uniExtract8Bin_2x8; +_viv_uniform VXC_512Bits ExtractBin_part0_4x4; +_viv_uniform VXC_512Bits ExtractBin_part1_4x4; + + +#define L1_NORM_AXIS2_SH(name0, name1, src_type, conv_type0, conv_type1, dst_type, save_type) \ +__kernel void l1norm_##name0##to##name1##_axis2 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float inputZp, \ + float outputscale, \ + float outputtail, \ + int axis, \ + int axis_size) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + src_type v0; \ + conv_type0 src0; \ + dst_type dst; \ + conv_type1 dst0, dst1; \ + save_type out; \ + float4 src0_f, src1_f; \ + \ + float4 total0 = 0, total1 = 0; \ + float4 rcp_total0 = 0, rcp_total1 = 0; \ + do \ + { \ + VXC_ReadImage2DArray(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, v0, 16); \ + coord.z = coord.z + 1; \ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \ + VXC_DP4x4(src1_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \ + src0_f = fabs(src0_f - inputZp); \ + src1_f = fabs(src1_f - 
inputZp); \ + total0 = total0 + src0_f; \ + total1 = total1 + src1_f; \ + } while (coord.z < axis_size); \ + \ + total0 = total0 > epsilon ? total0 : epsilon; \ + total1 = total1 > epsilon ? total1 : epsilon; \ + rcp_total0 = 1 / total0 * outputscale; \ + rcp_total1 = 1 / total1 * outputscale; \ + coord.z = 0; \ + do \ + { \ + VXC_ReadImage2DArray(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, v0, 16); \ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \ + VXC_DP4x4(src1_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \ + src0_f = (src0_f - inputZp) * rcp_total0 + outputtail; \ + src1_f = (src1_f - inputZp) * rcp_total1 + outputtail; \ + _viv_asm(CONV_RTE, dst0, src0_f); \ + _viv_asm(CONV_RTE, dst1, src1_f); \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8); \ + _viv_asm(COPY, out, dst, 16); \ + VXC_WriteImage2DArray(output, coord, out, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z = coord.z + 1; \ + } while (coord.z < axis_size); \ +} +L1_NORM_AXIS2_SH(U8, F16,vxc_uchar8,vxc_uchar8,half4, vxc_half8, vxc_short8) +L1_NORM_AXIS2_SH(U8, U8, vxc_uchar8,vxc_uchar8,short4,vxc_uchar8,vxc_uchar8) +L1_NORM_AXIS2_SH(I8, F16,vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +L1_NORM_AXIS2_SH(I8, I8, vxc_char8, vxc_char8, short4,vxc_char8, vxc_char8) +L1_NORM_AXIS2_SH(I16,F16,vxc_short8,vxc_short8,half4, vxc_half8, vxc_short8) +L1_NORM_AXIS2_SH(I16,I16,vxc_short8,vxc_short8,short4,vxc_short8,vxc_short8) +L1_NORM_AXIS2_SH(F16,U8, vxc_short8,vxc_half8, short4,vxc_uchar8,vxc_uchar8) +L1_NORM_AXIS2_SH(F16,I8, vxc_short8,vxc_half8, short4,vxc_char8, vxc_char8) +L1_NORM_AXIS2_SH(F16,I16,vxc_short8,vxc_half8, short4,vxc_short8,vxc_short8) +L1_NORM_AXIS2_SH(F16,F16,vxc_short8,vxc_half8, half4, vxc_half8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_3.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_3.vx index 8c05b02..f8dbd7c 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_3.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_3.vx @@ -2,7 +2,7 @@ /**************************layernorm float16***********************************/ _viv_uniform int width; -_viv_uniform float dimRatio; +_viv_uniform float inv_multiplier; _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; _viv_uniform VXC_512Bits uniExtractOddData_2x8; @@ -46,9 +46,9 @@ __kernel void layer_norm_BF16F32toBF16( sqr += dot(srcA * srcA, ones) + dot(srcB * srcB, ones); } vxc_float mean; - mean = sum * dimRatio; + mean = sum * inv_multiplier; vxc_float vari; - vari = sqr*dimRatio - mean*mean; + vari = sqr*inv_multiplier - mean*mean; vari += eps; vari = rsqrt(vari); vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; @@ -119,9 +119,9 @@ __kernel void layer_norm_BF16F32toBF16_2D( sqr += dot(srcA * srcA, ones) + dot(srcB * srcB, ones); } vxc_float mean; - mean = sum * dimRatio; + mean = sum * inv_multiplier; vxc_float vari; - vari = sqr*dimRatio - mean*mean; + vari = sqr*inv_multiplier - mean*mean; vari += eps; vari = rsqrt(vari); vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transA.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transA.vx index d55fa59..f95de4e 100644 --- 
a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transA.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transA.vx @@ -93,6 +93,7 @@ __kernel void gemm_transa_##src0_type_name##src1_type_name##to##dst_type_name( \ GEMM_TRANSA_QINT(U8, U8, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16) GEMM_TRANSA_QINT(I8, I8, I8, vxc_char16, vxc_char16, vxc_char16) GEMM_TRANSA_QINT(I16, I16, I16, vxc_short8, vxc_short8, vxc_short8) +GEMM_TRANSA_QINT(U8, I16, I16, vxc_uchar16, vxc_short8, vxc_short8) #define GEMM_TRANSA_INPUTB_F16(src0_type_name, read0_type) \ __kernel void gemm_transa_##src0_type_name##F16to##src0_type_name( \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8i16_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8i16_i16.vx new file mode 100644 index 0000000..1a6fd20 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8i16_i16.vx @@ -0,0 +1,203 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int input0_ZP; +_viv_uniform int input1_ZP; +_viv_uniform float output_ZP; +_viv_uniform float outputScale; +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32B_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform int ac2zero; +_viv_uniform int bc2zero; + +#define GEMM_QINT_TO_QINT(src0_type_name, src1_type_name, dst_type_name, read0_type, read1_type, write_type) \ +__kernel void gemm_##src0_type_name##src1_type_name##to##dst_type_name( \ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \ +{ \ + uint gidy = get_global_id(1); \ + read0_type srcA; \ + read1_type srcB; \ + write_type outC; \ + \ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
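+ /* ac2zero/bc2zero implement batch broadcast: when set, that input is 2D and \
+    every batch index (get_global_id(2)) reads its slice 0. */ \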
0 : get_global_id(2)), 0); \ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \ + short in0_zp, in1_zp; \ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \ + _viv_asm(COPY, in1_zp, input1_ZP, 4); \ + \ + int8 inputA_desc, inputB_desc, output_desc; \ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \ + _viv_asm(MOV, coord_a.w, baseAddr_a); \ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr_b); \ + \ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \ + { \ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_a.x += 4; \ + coord_b.y += 4; \ + VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvertUint8SubZpToFp32B_4x4); \ + sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \ + sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \ + sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \ + sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \ + } \ + vxc_int4 tmpOut0, tmpOut1; \ + coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_b.w, baseAddr); \ + tmpOut0 = convert_int4_rte(sum0 * outputScale + 
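+ /* requantize the float accumulators: q = round_rte(acc * outputScale + output_ZP), \
+    then the DP2x8 packs the int32 results into the output type with saturation. */ \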
output_ZP); \ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + coord_b.y++; \ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ +} +GEMM_QINT_TO_QINT(U8, I16, I16, vxc_uchar8, vxc_short8, vxc_short8) + +__kernel void gemm_transb_U8I16toI16(image2d_array_t inputA, + image2d_array_t inputB, image2d_array_t output, + int transposeA, int transposeB, int adjointA, int adjointB, + uint M, uint K, uint N) +{ + uint gidy = get_global_id(1); + vxc_uchar8 srcA; + vxc_short8 srcB; + vxc_short8 outC; + + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(0, get_global_id(0), (bc2zero ? 0 : get_global_id(2)), 0); + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); + short in0_zp, in1_zp; + _viv_asm(COPY, in0_zp, input0_ZP, 4); + _viv_asm(COPY, in1_zp, input1_ZP, 4); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;) + { + vxc_float4 tempA0, tempA1, tempA2, tempA3; + vxc_float4 tempB0, tempB1, tempB2, tempB3; + vxc_float4 sum_dot0, sum_dot1, sum_dot2, sum_dot3; + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniConvertUint8SubZpToFp32_4x4); + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniConvertUint8SubZpToFp32B_4x4); + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniConvertUint8SubZpToFp32_4x4); + VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniConvertUint8SubZpToFp32B_4x4); + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 3, 
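        /* with B transposed, both operands are read along K, so each output
           element is a plain dot product (see the dot() accumulation below)
           rather than the scaled-column accumulation of the macro kernel above. */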
0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniConvertUint8SubZpToFp32_4x4); + VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniConvertUint8SubZpToFp32B_4x4); + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 4; + coord_b.x += 4; + VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniConvertUint8SubZpToFp32_4x4); + VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), + uniConvertUint8SubZpToFp32B_4x4); + + sum0 = sum0 + (vxc_float4)(dot(tempA0,tempB0),dot(tempA0,tempB1),dot(tempA0,tempB2),dot(tempA0,tempB3)); + sum1 = sum1 + (vxc_float4)(dot(tempA1,tempB0),dot(tempA1,tempB1),dot(tempA1,tempB2),dot(tempA1,tempB3)); + sum2 = sum2 + (vxc_float4)(dot(tempA2,tempB0),dot(tempA2,tempB1),dot(tempA2,tempB2),dot(tempA2,tempB3)); + sum3 = sum3 + (vxc_float4)(dot(tempA3,tempB0),dot(tempA3,tempB1),dot(tempA3,tempB2),dot(tempA3,tempB3)); + } + vxc_int4 tmpOut0, tmpOut1; + coord_b.y = gidy; + coord_b.z = get_global_id(2); + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, outC, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, outC, + VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, outC, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, outC, + VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx index 41f1c08..fcc8d9c 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx @@ -30,7 +30,8 @@ __kernel void pre_process_nv12_copy_##name \ float bMean, \ float var, \ int reverse_channel, \ - int trans \ + int trans, \ + int nv_type \ ) \ { \ int gidx = get_global_id(0); \ @@ -45,6 +46,11 @@ __kernel void pre_process_nv12_copy_##name \ \ VXC_ReadImage(Y, y_img, (int2)(sx,sy), 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_ReadImage(UV, uv_img,(int2)(uvX,uvY), 0,VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + if (nv_type == 1) \ + { \ + UV.s0123 = UV.s1032; \ + } \ \ vxc_char16 tmpUV; \ short tmpVal = 128; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx index ac6ba3d..f4ac83b 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx @@ -38,7 +38,8 @@ __kernel void pre_process_nv12_scale_##name##_gq \ float bMean, \ float var, \ int reverse_channel, \ - int trans \ + int trans, \ + int nv_type \ ) \ { \ uint4 gidx = get_global_id(0); \ @@ -57,6 +58,11 @@ __kernel void pre_process_nv12_scale_##name##_gq \ int2 coord_uv = (int2)(uvX.x, uvY); \ VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + if (nv_type == 1) \ + { \ + UV.s0123456789abcdef = UV.s1032547698badcfe; \ + } \ \ vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \ vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \ @@ -128,7 +134,8 @@ __kernel void pre_process_nv12_scale_##name \ float bMean, \ float var, \ int reverse_channel, \ - int trans \ + int trans, \ + int nv_type \ ) \ { \ uint4 gidx = get_global_id(0); \ @@ -161,6 +168,11 @@ __kernel void pre_process_nv12_scale_##name \ VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ coord_uv.x = uvX.w; \ VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + if (nv_type == 1) \ + { \ + UV.s01234567 = UV.s10325476; \ + } \ \ vxc_char16 tmpUV; \ short tmpVal = 128; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx index f63e65c..eed0715 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx @@ -13,7 +13,7 @@ _viv_uniform VXC_512Bits uniConvertYUV422toG_4x4; _viv_uniform VXC_512Bits uniConvertYUV422toR_4x4; _viv_uniform VXC_512Bits uniExtract8Data_2x8; -_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8; +_viv_uniform VXC_512Bits uniExtractYUVtoShortSub_2x8; #define YUV422_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ __kernel void pre_process_yuv422_copy_##name \ @@ -37,7 +37,7 @@ __kernel void pre_process_yuv422_copy_##name \ int gidy = get_global_id(1); \ \ int sy = gidy + (*yOffset); \ - int sx = gidx + (*xOffset * 2); \ + int sx = gidx * 2 + (*xOffset * 2); \ \ vxc_uchar8 YUV; \ vxc_short8 tmpYUV; \ @@ -48,11 +48,10 @@ __kernel void pre_process_yuv422_copy_##name \ { \ YUV.s01234567 = YUV.s10325476; \ } \ -\ - short tmpVal = 128; \ - VXC_DP2x8(tmpYUV, YUV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \ \ float4 tmpDstB, tmpDstG, tmpDstR; \ + vxc_short2 value = (vxc_short2)(128,16); \ + VXC_DP2x8(tmpYUV, YUV, value, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractYUVtoShortSub_2x8); \ VXC_DP4x4(tmpDstB, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \ VXC_DP4x4(tmpDstG, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \ VXC_DP4x4(tmpDstR, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx index ff85f8e..78546d9 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx @@ -17,6 +17,7 @@ _viv_uniform VXC_512Bits uniConvertYUV422toR_4x4; _viv_uniform VXC_512Bits uniExtract8Data_2x8; 
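+/* limited-range YUV offsets: the scale path now subtracts 16 from Y
+   (uniExtractYtoShortSub16_4x4) and keeps the 128 subtraction for U/V before
+   the fixed-point RGB conversion, matching the copy kernel's (128, 16) pair above. */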
_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8; +_viv_uniform VXC_512Bits uniExtractYtoShortSub16_4x4; #define uyvy422 1 @@ -45,12 +46,13 @@ __kernel void pre_process_yuv422_scale_##name \ uint dy = (convert_uint(gidy) * yrIntFloat_16) >> 16; \ uint4 dx = (convert_uint4(gidx) * xrIntFloat_16) >> 16; \ int sy = convert_int(dy) + (*yOffset); \ - int4 sx = convert_int4(dx)+ (*xOffset * 2); \ + int4 sx = (convert_int4(dx)+ *xOffset) * 2; \ \ vxc_uchar4 Y; \ vxc_uchar8 UV; \ + vxc_short4 tmpY; \ vxc_char8 tmpUV; \ - short tmpVal = 128; \ + short tmpVal = 16; \ int y_offset = 0; \ int u_offset = 1; \ int v_offset = 3; \ @@ -62,19 +64,19 @@ __kernel void pre_process_yuv422_scale_##name \ v_offset = 2; \ } \ \ - int4 coord_Y = (int4)(sx.x * 2 + y_offset, sy, 0, 0); \ - int4 coord_U = (int4)((sx.x >> 1) * 4 + u_offset, sy, 0, 0); \ - int4 coord_V = (int4)((sx.x >> 1) * 4 + v_offset, sy, 0, 0); \ + int4 coord_Y = (int4)(sx.x + y_offset, sy, 0, 0); \ + int4 coord_U = (int4)((sx.x >> 1) * 2 + u_offset, sy, 0, 0); \ + int4 coord_V = (int4)((sx.x >> 1) * 2 + v_offset, sy, 0, 0); \ \ VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ - coord_Y.x = sx.y * 2 + y_offset; \ + coord_Y.x = sx.y + y_offset; \ VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ - coord_Y.x = sx.z * 2 + y_offset; \ + coord_Y.x = sx.z + y_offset; \ VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ - coord_Y.x = sx.w * 2 + y_offset; \ + coord_Y.x = sx.w + y_offset; \ VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ \ - sx = (sx >> 1) * 4 + u_offset; \ + sx = (sx >> 1) * 2 + u_offset; \ VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ coord_U.x = sx.y; \ VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ @@ -91,14 +93,16 @@ __kernel void pre_process_yuv422_scale_##name \ VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \ coord_V.x = sx.w; \ VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tmpY, Y, tmpVal, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractYtoShortSub16_4x4); \ + tmpVal = 128; \ VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \ vxc_uchar4 dst_test; \ VXC_DP2x8(dst_test, dx, dx, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ \ float4 tmpDstB, tmpDstG, tmpDstR; \ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \ + VXC_DP4x4(tmpDstB, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \ + VXC_DP4x4(tmpDstG, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \ + VXC_DP4x4(tmpDstR, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \ \ conv_type result; \ dst_type dst0; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_BF16.vx index a9a79d0..5207caf 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_BF16.vx +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_BF16.vx @@ -51,6 +51,7 @@ __kernel void resize_1d_bilinear_BF16toBF16_DOWN coord_in.x = left_x_idx.w; VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.x; vxc_ushort8 src; float4 left4; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx index e74328b..8084064 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_F16.vx @@ -59,6 +59,7 @@ __kernel void resize_1d_bilinear_F16toF16_DOWN coord_in.x = left_x_idx.w; VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.x; _viv_asm(COPY, src_half, src, 16); @@ -124,6 +125,7 @@ __kernel void resize_1d_bilinear_F16toU8_DOWN coord_in.x = left_x_idx.w; VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.x; _viv_asm(COPY, src_half, src, 16); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I16.vx index 61ff2e9..dc42a78 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I16.vx @@ -128,6 +128,7 @@ __kernel void resize_1d_bilinear_I16toI16_DOWN coord_in.x = left_x_idx.w; VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.x; VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_left_4x4); VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_right_4x4); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I8.vx index 95a8d5b..956c5e2 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_I8.vx @@ -129,6 +129,7 @@ __kernel void resize_1d_bilinear_I8toI8_DOWN coord_in.x = left_x_idx.w; VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.x; VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_left_4x4); VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_right_4x4); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_U8.vx index f7243a6..5392c84 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_U8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_1d_bilinear_U8.vx @@ -27,12 +27,16 @@ __kernel void resize_1d_bilinear_U8toF16_DOWN float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value; float4 left_x_f = floor(in_x); float4 x_lerp = in_x - left_x_f; + float4 one_minus_lerp = 1 - x_lerp; int4 left_x_idx = convert_int4(left_x_f); vxc_uchar16 src; int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0); float4 left4; float4 right4; + x_lerp = x_lerp * uint8Scale; + one_minus_lerp = one_minus_lerp * uint8Scale; 
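+    /* fold the U8 dequant scale into both lerp weights up front so the loop can
+       compute dst = right * w + left * (1 - w) directly, replacing the previous
+       (right - left) * w + left followed by a separate multiply by uint8Scale. */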
+ int8 input_desc; _viv_asm(COPY, input_desc, input, sizeof(input_desc)); int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; @@ -45,7 +49,6 @@ __kernel void resize_1d_bilinear_U8toF16_DOWN baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - do { VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), @@ -59,12 +62,11 @@ __kernel void resize_1d_bilinear_U8toF16_DOWN coord_in.x = left_x_idx.w; VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.x; VXC_DP4x4(left4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_right_4x4); - right4 -= left4; - float4 dst4 = right4 * x_lerp + left4; - dst4 *= uint8Scale; + float4 dst4 = right4 * x_lerp + left4 * one_minus_lerp; half4 dst; _viv_asm(CONV, dst, dst4); vxc_short8 dst_short; @@ -95,7 +97,6 @@ __kernel void resize_1d_bilinear_U8toU8_UP float4 right_x_f = ceil(in_x); int4 right_x_idx = convert_int4(right_x_f); - vxc_uchar16 src0, src1; vxc_uchar16 top; @@ -161,6 +162,7 @@ __kernel void resize_1d_bilinear_U8toU8_DOWN float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value; float4 left_x_f = floor(in_x); float4 x_lerp = in_x - left_x_f; + float4 one_minus_lerp = 1 - x_lerp; int4 left_x_idx = convert_int4(left_x_f); vxc_uchar16 src; int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0); @@ -179,6 +181,8 @@ __kernel void resize_1d_bilinear_U8toU8_DOWN baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); + x_lerp = x_lerp * uint8Scale; + one_minus_lerp = one_minus_lerp * uint8Scale; do { @@ -193,12 +197,11 @@ __kernel void resize_1d_bilinear_U8toU8_DOWN coord_in.x = left_x_idx.w; VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.x; VXC_DP4x4(left4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_right_4x4); - right4 -= left4; - float4 dst4 = right4 * x_lerp + left4; - dst4 = dst4 * uint8Scale + output_ZP; + float4 dst4 = right4 * x_lerp + left4 * one_minus_lerp + output_ZP; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_align_corners.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_align_corners.vx new file mode 100644 index 0000000..ec6d0c2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_align_corners.vx @@ -0,0 +1,94 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniBilinear_8x_l10_4x8; +_viv_uniform VXC_512Bits uniBilinear_8x_l11_4x8; +_viv_uniform VXC_512Bits uniBilinear_8x_l20_4x8; +_viv_uniform VXC_512Bits uniBilinear_8x_l21_4x8; +_viv_uniform VXC_512Bits uniBilinear_8x_l30_4x8; +_viv_uniform VXC_512Bits uniBilinear_8x_l31_4x8; +_viv_uniform VXC_512Bits uniBilinear_8x_l40_4x8; +_viv_uniform VXC_512Bits uniBilinear_8x_l41_4x8; +__kernel void resize_bilinear_U8toU8_SAME_8x_upsample_align_corners + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 
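+    /* fixed 8x upsample with align_corners: each work-item loads two adjacent
+       input rows (in0, in1) of 16 pixels and writes 8 interpolated rows using
+       the uniBilinear_8x_l* dp4x8 weight tables, mirrored for the lower half. */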
coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + + + vxc_uchar16 in0, in1, dst; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + VXC_OP4(img_load_3d, in0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.xy = coord.xy << 3; + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + + + VXC_DP4x8(dst, in0, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l10_4x8); + VXC_DP4x8(dst, in0, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l11_4x8); + + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord.y ++; + + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l10_4x8); + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l11_4x8); + + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord.y ++; + + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l20_4x8); + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l21_4x8); + + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord.y ++; + + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l30_4x8); + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l31_4x8); + + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord.y ++; + + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l40_4x8); + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l41_4x8); + + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord.y ++; + + VXC_DP4x8(dst, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l30_4x8); + VXC_DP4x8(dst, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l31_4x8); + + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord.y ++; + + VXC_DP4x8(dst, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l20_4x8); + VXC_DP4x8(dst, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l21_4x8); + + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord.y ++; + + VXC_DP4x8(dst, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l10_4x8); + VXC_DP4x8(dst, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l11_4x8); + + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_special.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_special.vx new file mode 100644 index 0000000..1118356 --- /dev/null +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_special.vx @@ -0,0 +1,95 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Hi_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Hi_2x8; +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp + +_viv_uniform int update_width; +_viv_uniform int output_width; + +_viv_uniform int offsetX; +_viv_uniform int offsetY; +_viv_uniform int offsetZ; +_viv_uniform int offsetW; +_viv_uniform int offset_idx; + +#define SCATTER_ND_UPDATE_REF2OUT_8BITS(src0_type, data_type) \ +__kernel void scatter_nd_update_ref2out_##src0_type##to##src0_type( \ + __read_only image2d_t input_ref, \ + image2d_t temp_ref, \ + image2d_t output0 \ + ) \ +{ \ + int gidx = get_global_id(0); \ + Image img0 = create_image_from_image2d(input_ref, 1); \ + Image img1 = create_image_from_image2d(temp_ref, 1); \ + __global data_type* in_ptr = (__global data_type*)img0.ptr; \ + __global data_type* out_ptr = (__global data_type*)img1.ptr; \ + data_type src, dst; \ + src = in_ptr[gidx]; \ + vxc_ushort8 mp0; \ + _viv_asm(COPY, mp0, multAndoutZP0, 16); \ + VXC_DP2x8(dst, src, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift0_Lo_2x8); \ + VXC_DP2x8(dst, src, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift0_Hi_2x8); \ + out_ptr[gidx] = dst; \ +} +SCATTER_ND_UPDATE_REF2OUT_8BITS(U8, vxc_uchar16) +SCATTER_ND_UPDATE_REF2OUT_8BITS(I8, vxc_char16) + +#define SCATTER_ND_UPDATE_UPDATE2REF_8BITS(src0_type, data_type) \ +__kernel void scatter_nd_update_update2ref_##src0_type##to##src0_type##_16x( \ + __read_only image2d_t input_index, \ + __read_only image2d_t input_update, \ + image2d_t temp_ref, \ + image2d_t input0, \ + image2d_t output1, \ + int width, int area, int vol, int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + Image img1 = create_image_from_image2d(input_index, 4); \ + Image img2 = create_image_from_image2d(input_update, 1); \ + Image img3 = create_image_from_image2d(temp_ref, 1); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global data_type* update_ptr = (__global data_type*)img2.ptr; \ + __global data_type* output_ptr = (__global data_type*)img3.ptr; \ + data_type dst; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx); \ + data_type src = update_ptr[gidy * update_width + gidx]; \ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \ + int loc = idx * output_width + gidx; \ + vxc_ushort8 mp1; \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + VXC_DP2x8(dst, src, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift1_Lo_2x8); \ + VXC_DP2x8(dst, src, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift1_Hi_2x8); \ + output_ptr[loc] = dst; \ +} +SCATTER_ND_UPDATE_UPDATE2REF_8BITS(U8, vxc_uchar16) +SCATTER_ND_UPDATE_UPDATE2REF_8BITS(I8, vxc_char16) + +#define SCATTER_ND_UPDATE_COPY2OUT(src0_type, data_type, element_size) \ +__kernel void scatter_nd_update_cpy2out_##src0_type##to##src0_type( \ + __read_only image2d_t temp_ref, \ + image2d_t input1, \ + image2d_t output \ + ) \ +{ \ + int gidx = get_global_id(0); \ + Image img0 = create_image_from_image2d(temp_ref, element_size); \ + Image img1 = 
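+ /* stage 3 of scatter_nd_update: ref2out requantizes the reference into temp_ref, \
+    update2ref scatters requantized updates at idx = indice.x*offsetX + indice.y*offsetY \
+    + indice.z*offsetZ + indice.w*offsetW, and this kernel copies temp_ref out. */ \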
create_image_from_image2d(output, element_size); \ + __global data_type* in_ptr = (__global data_type*)img0.ptr; \ + __global data_type* out_ptr = (__global data_type*)img1.ptr; \ + out_ptr[gidx] = in_ptr[gidx]; \ +} +SCATTER_ND_UPDATE_COPY2OUT(U8, vxc_uchar16, 1) +SCATTER_ND_UPDATE_COPY2OUT(I8, vxc_char16, 1) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_extra_ending.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_extra_ending.vx deleted file mode 100644 index 52f51e5..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_extra_ending.vx +++ /dev/null @@ -1,65 +0,0 @@ -#include "cl_viv_vx_ext.h" - -__kernel void extra_ending_I16 - ( - __read_only image2d_array_t input0, - __read_only image2d_array_t input, - __write_only image2d_array_t output - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 data; - VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void extra_ending_F16 - ( - __read_only image2d_array_t input0, - __read_only image2d_array_t input, - __write_only image2d_array_t output - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 data; - VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void extra_ending_I8 - ( - __read_only image2d_array_t input0, - __read_only image2d_array_t input, - __write_only image2d_array_t output - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_char8 data; - VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void extra_ending_U8 - ( - __read_only image2d_array_t input0, - __read_only image2d_array_t input, - __write_only image2d_array_t output - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_uchar8 data; - VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index fe52f46..f528ccb 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -2139,6 +2139,1103 @@ BATCH_NORM_SH_IMPL_AXIS1_2D(I8, I8, vxc_char16, vxc_char16, int4, vxc_char1 BATCH_NORM_SH_IMPL_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ "; /* end of batchnorm_single_f32_vx*/ +static const char bilinear_grid_sample_BF16_to_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_even_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_odd_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniBF16toFp32_part0_2x8;\n\ +_viv_uniform VXC_512Bits uniBF16toFp32_part1_2x8;\n\ +\n\ +#define GRID_SAMPLE_BF16_PROCESS() \\\n\ + fxy0 = 
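+ /* grid coords fxy in [-1, 1] are mapped into input pixel space via \\\n\
+    fxy * half_input0_wh + add_float_value; floor() yields the four corner \\\n\
+    indices and the fractional parts become the bilinear lerp weights. */ \\\n\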
fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + float4 x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - x_f; \\\n\ + int4 x_idx = convert_int4(x_f); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + float4 y_f = floor(in_y); \\\n\ + float4 y_lerp = in_y - y_f; \\\n\ + int4 y_idx = convert_int4(y_f); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 top; \\\n\ + vxc_short8 bottom; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + float4 left4; \\\n\ + float4 right4; \\\n\ + float4 top4; \\\n\ + float4 bottom4; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + vxc_ushort8 tmp, dst; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \\\n\ + _viv_asm(COPY, right4, src, 16); \\\n\ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \\\n\ + _viv_asm(COPY, left4, src, 16); \\\n\ + right4 -= left4; \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \\\n\ + _viv_asm(COPY, right4, src, 16); \\\n\ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \\\n\ + _viv_asm(COPY, left4, src, 16); \\\n\ + right4 -= left4; \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + _viv_asm(COPY, tmp, dst4, 16); \\\n\ + dst.s0123 = tmp.s1357; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y 
= y_idx.x; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \\\n\ + _viv_asm(COPY, right4, src, 16); \\\n\ + VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \\\n\ + _viv_asm(COPY, left4, src, 16); \\\n\ + right4 -= left4; \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \\\n\ + _viv_asm(COPY, right4, src, 16); \\\n\ + VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \\\n\ + _viv_asm(COPY, left4, src, 16); \\\n\ + right4 -= left4; \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + _viv_asm(COPY, tmp, dst4, 16); \\\n\ + dst.s0123 = tmp.s1357; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_BF16_BF16toBF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + vxc_short8 src;\n\ + VXC_DP2x8(src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part0_2x8);\n\ + _viv_asm(COPY, fxy0, src, 16);\n\ + VXC_DP2x8(src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part1_2x8);\n\ + _viv_asm(COPY, fxy1, src, 16);\n\ +\n\ +\n\ +\n\ + GRID_SAMPLE_BF16_PROCESS();\n\ +\n\ +}\n\ +"; /* end of bilinear_grid_sample_BF16_to_BF16_vx*/ + +static const char bilinear_grid_sample_F16_to_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int 
depth;\n\ +_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4;\n\ +_viv_uniform VXC_512Bits uniExtactHalf8_2x8;\n\ +\n\ +#define GRID_SAMPLE_F16_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + float4 x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - x_f; \\\n\ + int4 x_idx = convert_int4(x_f); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + float4 y_f = floor(in_y); \\\n\ + float4 y_lerp = in_y - y_f; \\\n\ + int4 y_idx = convert_int4(y_f); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 t0; \\\n\ + vxc_short8 b0; \\\n\ + vxc_half8 top; \\\n\ + vxc_half8 bottom; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, top, t0, 16); \\\n\ + _viv_asm(COPY, bottom, b0, 16); \\\n\ + float4 left4; \\\n\ + float4 right4; \\\n\ + float4 top4; \\\n\ + float4 bottom4; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + half4 tmp; \\\n\ + _viv_asm(CONV, tmp, dst4); \\\n\ + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); \\\n\ + vxc_short4 result; \\\n\ + _viv_asm(COPY, result, top, 8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, 
result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, top, t0, 16); \\\n\ + _viv_asm(COPY, bottom, b0, 16); \\\n\ + } \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + half4 tmp; \\\n\ + _viv_asm(CONV, tmp, dst4); \\\n\ + VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); \\\n\ + vxc_short4 result; \\\n\ + _viv_asm(COPY, result, top, 8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_F16_F32toF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + coord_in1.z = coord_in1.z + 4;\n\ +\n\ + float4 fxy0 = read_imagef(input1, coord_in1.xy);\n\ + float4 fxy1 = read_imagef(input1, coord_in1.zw);\n\ +\n\ + GRID_SAMPLE_F16_PROCESS();\n\ +\n\ +}\n\ +\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float input1Scale;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\ +\n\ +__kernel void bilinear_grid_sample_F16_U8toF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = 
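+ /* the grid tensor is quantized here: dequantize the U8 coords as\n\
+ (q - input1_ZP) * input1Scale before running the shared F16 sample path. */\n\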
coord_out.xyxy;\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + vxc_uchar16 read_coord;\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ + unsigned char input1ZP;\n\ + _viv_asm(COPY, input1ZP, input1_ZP, 4);\n\ + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ + fxy0 = fxy0 * input1Scale;\n\ + fxy1 = fxy1 * input1Scale;\n\ +\n\ + GRID_SAMPLE_F16_PROCESS();\n\ +\n\ +}\n\ +\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\ +\n\ +__kernel void bilinear_grid_sample_F16_F16toF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_half8 read_coord;\n\ +\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, read_coord, read_val, 16);\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ +\n\ + GRID_SAMPLE_F16_PROCESS();\n\ +\n\ +}\n\ +"; /* end of bilinear_grid_sample_F16_to_F16_vx*/ + +static const char bilinear_grid_sample_F16_to_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4;\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform float uint8Scale;\n\ +_viv_uniform float output_ZP;\n\ +\n\ +#define GRID_SAMPLE_F16_to_U8_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + float4 x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - x_f; \\\n\ + int4 x_idx = convert_int4(x_f); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + float4 y_f = floor(in_y); \\\n\ + float4 y_lerp = in_y - y_f; \\\n\ + int4 y_idx = convert_int4(y_f); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 t0; \\\n\ + vxc_short8 b0; \\\n\ + vxc_uchar16 result; \\\n\ + vxc_half8 top; \\\n\ + vxc_half8 bottom; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, 
coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, top, t0, 16); \\\n\ + _viv_asm(COPY, bottom, b0, 16); \\\n\ + float4 left4; \\\n\ + float4 right4; \\\n\ + float4 top4; \\\n\ + float4 bottom4; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * uint8Scale + output_ZP; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + 
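/* reinterpret the reloaded top/bottom rows as f16 for the next iteration's lerp */ \\\n\ +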
_viv_asm(COPY, top, t0, 16); \\\n\ + _viv_asm(COPY, bottom, b0, 16); \\\n\ + } \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * uint8Scale + output_ZP; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_F16_F32toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + coord_in1.z = coord_in1.z + 4;\n\ +\n\ + float4 fxy0 = read_imagef(input1, coord_in1.xy);\n\ + float4 fxy1 = read_imagef(input1, coord_in1.zw);\n\ + GRID_SAMPLE_F16_to_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float input1Scale;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_F16_U8toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_uchar16 read_coord;\n\ +\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + unsigned char input1ZP;\n\ + _viv_asm(COPY, input1ZP, input1_ZP, 4);\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1Scale;\n\ + fxy1 = fxy1 * input1Scale;\n\ +\n\ + GRID_SAMPLE_F16_to_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\ +\n\ +__kernel void bilinear_grid_sample_F16_F16toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_half8 read_coord;\n\ +\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, read_coord, read_val, 16);\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ +\n\ + GRID_SAMPLE_F16_to_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +"; /* end of bilinear_grid_sample_F16_to_U8_vx*/ + +static const char bilinear_grid_sample_I16_to_I16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;\n\ +_viv_uniform float input1_scale;\n\ +_viv_uniform float dfpScale;\n\ +\n\ +#define GRID_SAMPLE_I16_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + float4 x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - x_f; \\\n\ + int4 x_idx = convert_int4(x_f); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + float4 y_f = floor(in_y); \\\n\ + float4 y_lerp = in_y - y_f; \\\n\ + int4 y_idx = convert_int4(y_f); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_short8 top; \\\n\ + vxc_short8 bottom; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + float4 left4; \\\n\ + float4 right4; \\\n\ + float4 top4; \\\n\ + float4 bottom4; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * dfpScale; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * dfpScale; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_I16_I16toI16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + vxc_short8 read_coord;\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, 
read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1_scale;\n\ + fxy1 = fxy1 * input1_scale;\n\ +\n\ + GRID_SAMPLE_I16_PROCESS();\n\ +\n\ +}\n\ +"; /* end of bilinear_grid_sample_I16_to_I16_vx*/ + +static const char bilinear_grid_sample_I8_to_I8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;\n\ +_viv_uniform float input1_scale;\n\ +_viv_uniform float dfpScale;\n\ +\n\ +#define GRID_SAMPLE_I8_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + float4 x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - x_f; \\\n\ + int4 x_idx = convert_int4(x_f); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + float4 y_f = floor(in_y); \\\n\ + float4 y_lerp = in_y - y_f; \\\n\ + int4 y_idx = convert_int4(y_f); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_char16 top; \\\n\ + vxc_char16 bottom; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + float4 left4; \\\n\ + float4 right4; \\\n\ + float4 top4; \\\n\ + float4 bottom4; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + 
top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * dfpScale; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * dfpScale; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_I8_I8toI8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + vxc_char16 read_coord;\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 
0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1_scale;\n\ + fxy1 = fxy1 * input1_scale;\n\ +\n\ + GRID_SAMPLE_I8_PROCESS();\n\ +\n\ +}\n\ +"; /* end of bilinear_grid_sample_I8_to_I8_vx*/ + +static const char bilinear_grid_sample_U8_to_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float2 half_input0_wh;\n\ +_viv_uniform float2 add_float_value;\n\ +_viv_uniform int depth;\n\ +\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform int input_ZP;\n\ +_viv_uniform float uint8Scale;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float input1Scale;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;\n\ +\n\ +\n\ +#define GRID_SAMPLE_U8_PROCESS() \\\n\ + fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \\\n\ + float4 in_x = (float4)(fxy0.xz, fxy1.xz); \\\n\ + float4 x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - x_f; \\\n\ + int4 x_idx = convert_int4(x_f); \\\n\ + float4 in_y = (float4)(fxy0.yw, fxy1.yw); \\\n\ + float4 y_f = floor(in_y); \\\n\ + float4 y_lerp = in_y - y_f; \\\n\ + int4 y_idx = convert_int4(y_f); \\\n\ + int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \\\n\ + int baseAddr = input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + vxc_uchar16 top; \\\n\ + vxc_uchar16 bottom; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + float4 left4; \\\n\ + float4 right4; \\\n\ + float4 top4; \\\n\ + float4 bottom4; \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + int loop = depth - 1; \\\n\ + while (coord_in.z < loop) \\\n\ + { \\\n\ + unsigned char 
inputZP; \\\n\ + _viv_asm(COPY, inputZP, input_ZP, 4); \\\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * uint8Scale + output_ZP; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \\\n\ + coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \\\n\ + coord_in.x = x_idx.x; \\\n\ + coord_in.y = y_idx.x; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.y; \\\n\ + coord_in.y = y_idx.y; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.z; \\\n\ + coord_in.y = y_idx.z; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = x_idx.w; \\\n\ + coord_in.y = y_idx.w; \\\n\ + VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \\\n\ + VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + unsigned char inputZP; \\\n\ + _viv_asm(COPY, inputZP, input_ZP, 4); \\\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * uint8Scale + output_ZP; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_U8_F32toU8(\n\ + __read_only image2d_array_t input0,\n\ + 
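/* input1: fp32 sampling grid, interleaved x/y pairs */\n\ +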
__read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ + coord_in1.z = coord_in1.z + 4;\n\ +\n\ + float4 fxy0 = read_imagef(input1, coord_in1.xy);\n\ + float4 fxy1 = read_imagef(input1, coord_in1.zw);\n\ + GRID_SAMPLE_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_U8_U8toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_uchar16 read_coord;\n\ +\n\ + VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + unsigned char input1ZP;\n\ + _viv_asm(COPY, input1ZP, input1_ZP, 4);\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);\n\ +\n\ + fxy0 = fxy0 * input1Scale;\n\ + fxy1 = fxy1 * input1Scale;\n\ +\n\ + GRID_SAMPLE_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;\n\ +_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;\n\ +\n\ +__kernel void bilinear_grid_sample_U8_F16toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int align_corners)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_in1 = coord_out.xyxy;\n\ +\n\ + coord_in1.xz = coord_in1.xz * 2;\n\ +\n\ + vxc_short8 read_val;\n\ + vxc_half8 read_coord;\n\ +\n\ + VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, read_coord, read_val, 16);\n\ +\n\ + float4 fxy0;\n\ + float4 fxy1;\n\ +\n\ + VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);\n\ + VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);\n\ +\n\ + GRID_SAMPLE_U8_PROCESS();\n\ +\n\ +}\n\ +\n\ +"; /* end of bilinear_grid_sample_U8_to_U8_vx*/ + static const char bucketize_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniDataConvert_0_4x4;\n\ @@ -6058,6 +7155,15 @@ float4 eltwise_unary_hard_gelu(float4 x)\n\ return x * cdf;\n\ }\n\ \n\ +float4 eltwise_unary_inverse_sigmoid(float4 x)\n\ +{\n\ + float4 x1, x2;\n\ + x = clamp(x, 0, 1);\n\ + x1 = clamp(x, alpha, 1);\n\ + x2 = clamp((1 - x), alpha, 1);\n\ + return log(x1 / x2);\n\ +}\n\ +\n\ _viv_uniform float inputScale;\n\ _viv_uniform float inputTail;\n\ _viv_uniform float outputScale;\n\ @@ -6157,6 +7263,17 @@ ELTSISE_UNARY_2D(hard_gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, ELTSISE_UNARY_2D(hard_gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ ELTSISE_UNARY_2D(hard_gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ ELTSISE_UNARY_2D(hard_gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//INVERSE_SIGMOID\n\ +ELTSISE_UNARY_2D(inverse_sigmoid, F16, F16, vxc_short8, 
vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(inverse_sigmoid, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(inverse_sigmoid, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(inverse_sigmoid, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(inverse_sigmoid, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(inverse_sigmoid, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(inverse_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(inverse_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(inverse_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(inverse_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ \n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ @@ -6201,7 +7318,8 @@ ELTSISE_UNARY_BF16_2D(round)\n\ ELTSISE_UNARY_BF16_2D(gelu)\n\ //HARD_GELU\n\ ELTSISE_UNARY_BF16_2D(hard_gelu)\n\ -"; /* end of eltwise_unary_2d_0_vx*/ +//INVERSE_SIGMOID\n\ +ELTSISE_UNARY_BF16_2D(inverse_sigmoid)"; /* end of eltwise_unary_2d_0_vx*/ static const char eltwise_unary_2d_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -6271,6 +7389,21 @@ float4 eltwise_unary_softsign(float4 val)\n\ return val * _rcp;\n\ }\n\ \n\ +float4 eltwise_unary_atan(float4 val)\n\ +{\n\ + return atan(val);\n\ +}\n\ +\n\ +float4 eltwise_unary_atanh(float4 val)\n\ +{\n\ + return atanh(val);\n\ +}\n\ +\n\ +float4 eltwise_unary_acosh(float4 val)\n\ +{\n\ + return acosh(val);\n\ +}\n\ +\n\ _viv_uniform float inputScale;\n\ _viv_uniform float inputTail;\n\ _viv_uniform float outputScale;\n\ @@ -6381,7 +7514,15 @@ ADD_ELTSISE_UNARY_2D(rcp)\n\ //SIGN\n\ ADD_ELTSISE_UNARY_2D(sign)\n\ //SOFTSIGN\n\ -ADD_ELTSISE_UNARY_2D(softsign)"; /* end of eltwise_unary_2d_1_vx*/ +ADD_ELTSISE_UNARY_2D(softsign)\n\ +//ATAN\n\ +ADD_ELTSISE_UNARY_2D(atan)\n\ +//ATANH\n\ +ADD_ELTSISE_UNARY_2D(atanh)\n\ +//ACOSH\n\ +ADD_ELTSISE_UNARY_2D(acosh)\n\ +\n\ +"; /* end of eltwise_unary_2d_1_vx*/ static const char eltwise_unary_3d_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -6486,6 +7627,15 @@ float4 eltwise_unary_hard_gelu(float4 x)\n\ return x * cdf;\n\ }\n\ \n\ +float4 eltwise_unary_inverse_sigmoid(float4 x)\n\ +{\n\ + float4 x1, x2;\n\ + x = clamp(x, 0, 1);\n\ + x1 = clamp(x, alpha, 1);\n\ + x2 = clamp((1 - x), alpha, 1);\n\ + return log(x1 / x2);\n\ +}\n\ +\n\ _viv_uniform float inputScale;\n\ _viv_uniform float inputTail;\n\ _viv_uniform float outputScale;\n\ @@ -6585,6 +7735,17 @@ ELTSISE_UNARY_3D(hard_gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, ELTSISE_UNARY_3D(hard_gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ ELTSISE_UNARY_3D(hard_gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ ELTSISE_UNARY_3D(hard_gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//INVERSE_SIGMOID\n\ +ELTSISE_UNARY_3D(inverse_sigmoid, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(inverse_sigmoid, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(inverse_sigmoid, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(inverse_sigmoid, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ 
+ELTSISE_UNARY_3D(inverse_sigmoid, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(inverse_sigmoid, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(inverse_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(inverse_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(inverse_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(inverse_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ \n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ @@ -6627,7 +7788,9 @@ ELTSISE_UNARY_BF16(round)\n\ //GELU\n\ ELTSISE_UNARY_BF16(gelu)\n\ //HARD_GELU\n\ -ELTSISE_UNARY_BF16(hard_gelu)"; /* end of eltwise_unary_3d_0_vx*/ +ELTSISE_UNARY_BF16(hard_gelu)\n\ +//INVERSE_SIGMOID\n\ +ELTSISE_UNARY_BF16(inverse_sigmoid)"; /* end of eltwise_unary_3d_0_vx*/ static const char eltwise_unary_3d_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -6697,6 +7860,21 @@ float4 eltwise_unary_softsign(float4 val)\n\ return val * _rcp;\n\ }\n\ \n\ +float4 eltwise_unary_atan(float4 val)\n\ +{\n\ + return atan(val);\n\ +}\n\ +\n\ +float4 eltwise_unary_atanh(float4 val)\n\ +{\n\ + return atanh(val);\n\ +}\n\ +\n\ +float4 eltwise_unary_acosh(float4 val)\n\ +{\n\ + return acosh(val);\n\ +}\n\ +\n\ _viv_uniform float inputScale;\n\ _viv_uniform float inputTail;\n\ _viv_uniform float outputScale;\n\ @@ -6807,6 +7985,12 @@ ADD_ELTSISE_UNARY_3D(rcp)\n\ ADD_ELTSISE_UNARY_3D(sign)\n\ //SOFTSIGN\n\ ADD_ELTSISE_UNARY_3D(softsign)\n\ +//ATAN\n\ +ADD_ELTSISE_UNARY_3D(atan)\n\ +//ATANH\n\ +ADD_ELTSISE_UNARY_3D(atanh)\n\ +//ACOSH\n\ +ADD_ELTSISE_UNARY_3D(acosh)\n\ "; /* end of eltwise_unary_3d_1_vx*/ static const char erf_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -8895,6 +10079,202 @@ GATHER_ND_F16_TO_QINT_3D(I16, vxc_short8)\n\ \n\ "; /* end of gather_nd_3d_mix_vx*/ +static const char gather_nd_batch_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +__kernel void gather_nd_batch_I8toI8_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // batch\n\ +\n\ + int4 coord = (int4)(gidx, gidy, 0, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + coord.z = indice.x * block_size + gidx;\n\ +\n\ + vxc_char16 src;\n\ + VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_nd_batch_U8toU8_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // batch num\n\ +\n\ + int4 coord = (int4)(gidx, gidy, 0, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + coord.z = indice.x * block_size + gidx;\n\ +\n\ + vxc_uchar16 src;\n\ + VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 
0));\n\
+ VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+}\n\
+\n\
+__kernel void gather_nd_batch_I16toI16_1D(\n\
+ __read_only image2d_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_t output,\n\
+ int block_size,\n\
+ int coord_dim\n\
+ )\n\
+{\n\
+ int gidx = get_global_id(0); // block_size\n\
+ int gidy = get_global_id(1); // batch num\n\
+\n\
+ int4 coord = (int4)(gidx, gidy, 0, 0);\n\
+ Image img = create_image_from_image2d(input1, 4);\n\
+ uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy);\n\
+ int4 indice = ((int4 *)indice_ptr)[0];\n\
+\n\
+ coord.z = indice.x * block_size + gidx;\n\
+\n\
+ vxc_short8 src;\n\
+ VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+}\n\
+\n\
+__kernel void gather_nd_batch_F16toF16_1D(\n\
+ __read_only image2d_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_t output,\n\
+ int block_size,\n\
+ int coord_dim\n\
+ )\n\
+{\n\
+ int gidx = get_global_id(0); // block_size\n\
+ int gidy = get_global_id(1); // batch num\n\
+\n\
+ int4 coord = (int4)(gidx, gidy, 0, 0);\n\
+ Image img = create_image_from_image2d(input1, 4);\n\
+ uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wy);\n\
+ int4 indice = ((int4 *)indice_ptr)[0];\n\
+\n\
+ coord.z = indice.x * block_size + gidx;\n\
+\n\
+ vxc_short8 src;\n\
+ VXC_ReadImage(src, input0, coord.zy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord.xy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+}\n\
+"; /* end of gather_nd_batch_vx*/
+
+static const char gather_nd_batch_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+__kernel void gather_nd_batch_I8toI8_2D(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_t output,\n\
+ int block_size,\n\
+ int coord_dim\n\
+ )\n\
+{\n\
+ int gidx = get_global_id(0); // block_size\n\
+ int gidy = get_global_id(1); // batch num\n\
+\n\
+ int4 coord = (int4)(gidx, 0, gidy, 0);\n\
+ Image img = create_image_from_image2d(input1, 4);\n\
+ uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);\n\
+ int4 indice = ((int4 *)indice_ptr)[0];\n\
+\n\
+ indice.x = indice.x * block_size + gidx;\n\
+ indice.zw = coord.zw;\n\
+\n\
+ vxc_char16 src;\n\
+ VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+\n\
+ VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+}\n\
+\n\
+__kernel void gather_nd_batch_U8toU8_2D(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_t output,\n\
+ int block_size,\n\
+ int coord_dim\n\
+ )\n\
+{\n\
+ int gidx = get_global_id(0); // block_size\n\
+ int gidy = get_global_id(1); // batch num\n\
+\n\
+ int4 coord = (int4)(gidx, 0, gidy, 0);\n\
+ Image img = create_image_from_image2d(input1, 4);\n\
+ uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);\n\
+ int4 indice = ((int4 *)indice_ptr)[0];\n\
+\n\
+ indice.x = indice.x * block_size + gidx;\n\
+ indice.zw = coord.zw;\n\
+\n\
+ vxc_uchar16 src;\n\
+ VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+}\n\
+\n\
+__kernel void gather_nd_batch_I16toI16_2D(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_t input1,\n\
+ 
__write_only image2d_t output,\n\
+ int block_size,\n\
+ int coord_dim\n\
+ )\n\
+{\n\
+ int gidx = get_global_id(0); // block_size\n\
+ int gidy = get_global_id(1); // batch num\n\
+\n\
+ int4 coord = (int4)(gidx, 0, gidy, 0);\n\
+ Image img = create_image_from_image2d(input1, 4);\n\
+ uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);\n\
+ int4 indice = ((int4 *)indice_ptr)[0];\n\
+\n\
+ indice.x = indice.x * block_size + gidx;\n\
+ indice.zw = coord.zw;\n\
+\n\
+ vxc_short8 src;\n\
+ VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+}\n\
+\n\
+__kernel void gather_nd_batch_F16toF16_2D(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_t output,\n\
+ int block_size,\n\
+ int coord_dim\n\
+ )\n\
+{\n\
+ int gidx = get_global_id(0); // block_size\n\
+ int gidy = get_global_id(1); // batch num\n\
+\n\
+ int4 coord = (int4)(gidx, 0, gidy, 0);\n\
+ Image img = create_image_from_image2d(input1, 4);\n\
+ uchar* indice_ptr = get_image_ptr_from_coord(img, coord.wz);\n\
+ int4 indice = ((int4 *)indice_ptr)[0];\n\
+\n\
+ indice.x = indice.x * block_size + gidx;\n\
+ indice.zw = coord.zw;\n\
+\n\
+ vxc_short8 src;\n\
+ VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+ VXC_WriteImage(output, coord.xz, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\
+}\n\
+"; /* end of gather_nd_batch_2d_vx*/
+
+static const char gather_nd_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\
+\n\
+_viv_uniform VXC_512Bits uniConvertInt8toFp16_2x8;\n\
@@ -12472,6 +13852,403 @@ __kernel void instance_norm_BF16_F32toBF16_2D(\n\
 }\n\
 }"; /* end of instance_normalization_3_vx*/
+
+static const char l1norm_axis0_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\
+\n\
+#include \"cl_viv_vx_ext.h\"\n\
+\n\
+#define epsilon 1e-12\n\
+\n\
+_viv_uniform VXC_512Bits uniExtract8Bin_2x8;\n\
+_viv_uniform VXC_512Bits ExtractBin_part0_4x4;\n\
+_viv_uniform VXC_512Bits ExtractBin_part1_4x4;\n\
+\n\
+\n\
+\n\
+#define L1_NORM_AXIS0_SH_2D(name0, name1, src_type, conv_type0, conv_type1, dst_type, save_type) \\\n\
+__kernel void l1norm_##name0##to##name1##_2D_axis0 \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ float inputZp, \\\n\
+ float outputscale, \\\n\
+ float outputtail, \\\n\
+ int axis, \\\n\
+ int axis_size) \\\n\
+{ \\\n\
+ int2 coord = (int2)(0, get_global_id(0)); \\\n\
+ src_type v0, v1; \\\n\
+ conv_type0 src0, src1; \\\n\
+ conv_type1 dst0, dst1; \\\n\
+ dst_type dst; \\\n\
+ save_type out; \\\n\
+ float4 src0_f, src1_f, src2_f, src3_f; \\\n\
+ \\\n\
+ float4 sum = 0; \\\n\
+ float4 total = 0; \\\n\
+ float4 rcp_total = 0; \\\n\
+ half4 rcp_total_half = 0; \\\n\
+ float4 one4 = (float4)(1.0f, 1.0f, 1.0f, 1.0f); \\\n\
+ do \\\n\
+ { \\\n\
+ VXC_ReadImage(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src0, v0, 16); \\\n\
+ VXC_ReadImage(v1, input, coord, VXC_5BITOFFSET_XY(8, 0), \\\n\
+ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, src1, v1, 16); \\\n\
+ coord.x = coord.x + 16; \\\n\
+ VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \\\n\
+ VXC_DP4x4(src1_f, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \\\n\
+ VXC_DP4x4(src2_f, src0, src0, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \\\n\ + VXC_DP4x4(src3_f, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \\\n\ + src0_f = fabs(src0_f - inputZp); \\\n\ + src1_f = fabs(src1_f - inputZp); \\\n\ + src2_f = fabs(src2_f - inputZp); \\\n\ + src3_f = fabs(src3_f - inputZp); \\\n\ + sum = src0_f + src1_f + src2_f + src3_f; \\\n\ + total = total + dot(sum, one4); \\\n\ + } while (coord.x < axis_size); \\\n\ + \\\n\ + total = total > epsilon ? total : epsilon; \\\n\ + rcp_total = 1 / total * outputscale; \\\n\ + coord.x = 0; \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, v0, 16); \\\n\ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \\\n\ + VXC_DP4x4(src1_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \\\n\ + src0_f = (src0_f - inputZp) * rcp_total.x + outputtail; \\\n\ + src1_f = (src1_f - inputZp) * rcp_total.x + outputtail; \\\n\ + _viv_asm(CONV_RTE, dst0, src0_f); \\\n\ + _viv_asm(CONV_RTE, dst1, src1_f); \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8); \\\n\ + _viv_asm(COPY, out, dst, 16); \\\n\ + VXC_WriteImage(output, coord, out, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x = coord.x + 8; \\\n\ + } while (coord.x < axis_size); \\\n\ +}\n\ +L1_NORM_AXIS0_SH_2D(U8, F16,vxc_uchar8,vxc_uchar8,half4, vxc_half8, vxc_short8)\n\ +L1_NORM_AXIS0_SH_2D(U8, U8, vxc_uchar8,vxc_uchar8,short4,vxc_uchar8,vxc_uchar8)\n\ +L1_NORM_AXIS0_SH_2D(I8, F16,vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +L1_NORM_AXIS0_SH_2D(I8, I8, vxc_char8, vxc_char8, short4,vxc_char8, vxc_char8)\n\ +L1_NORM_AXIS0_SH_2D(I16,F16,vxc_short8,vxc_short8,half4, vxc_half8, vxc_short8)\n\ +L1_NORM_AXIS0_SH_2D(I16,I16,vxc_short8,vxc_short8,short4,vxc_short8,vxc_short8)\n\ +L1_NORM_AXIS0_SH_2D(F16,U8, vxc_short8,vxc_half8, short4,vxc_uchar8,vxc_uchar8)\n\ +L1_NORM_AXIS0_SH_2D(F16,I8, vxc_short8,vxc_half8, short4,vxc_char8, vxc_char8)\n\ +L1_NORM_AXIS0_SH_2D(F16,I16,vxc_short8,vxc_half8, short4,vxc_short8,vxc_short8)\n\ +L1_NORM_AXIS0_SH_2D(F16,F16,vxc_short8,vxc_half8, half4, vxc_half8, vxc_short8)\n\ +\n\ +\n\ +#define L1_NORM_AXIS0_SH(name0, name1, src_type, conv_type0, conv_type1, dst_type, save_type) \\\n\ +__kernel void l1norm_##name0##to##name1##_axis0 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float inputZp, \\\n\ + float outputscale, \\\n\ + float outputtail, \\\n\ + int axis, \\\n\ + int axis_size) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); \\\n\ + src_type v0, v1; \\\n\ + conv_type0 src0, src1; \\\n\ + conv_type1 dst0, dst1; \\\n\ + dst_type dst; \\\n\ + save_type out; \\\n\ + float4 src0_f, src1_f, src2_f, src3_f; \\\n\ + \\\n\ + float4 sum = 0; \\\n\ + float4 total = 0; \\\n\ + float4 rcp_total = 0; \\\n\ + half4 rcp_total_half = 0; \\\n\ + float4 one4 = (float4)(1.0f, 1.0f, 1.0f, 1.0f); \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage2DArray(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, v0, 16); \\\n\ + VXC_ReadImage2DArray(v1, input, coord, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, v1, 16); \\\n\ + coord.x = coord.x + 16; \\\n\ + 
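/* widen the 16 loaded values to fp32, four lanes per VXC_DP4x4 */ \\\n\ +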
VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \\\n\ + VXC_DP4x4(src1_f, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \\\n\ + VXC_DP4x4(src2_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \\\n\ + VXC_DP4x4(src3_f, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \\\n\ + src0_f = fabs(src0_f - inputZp); \\\n\ + src1_f = fabs(src1_f - inputZp); \\\n\ + src2_f = fabs(src2_f - inputZp); \\\n\ + src3_f = fabs(src3_f - inputZp); \\\n\ + sum = src0_f + src1_f + src2_f + src3_f; \\\n\ + total = total + dot(sum, one4); \\\n\ + } while (coord.x < axis_size); \\\n\ + \\\n\ + total = total > epsilon ? total : epsilon; \\\n\ + rcp_total = 1 / total * outputscale; \\\n\ + coord.x = 0; \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage2DArray(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, v0, 16); \\\n\ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \\\n\ + VXC_DP4x4(src1_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \\\n\ + src0_f = (src0_f - inputZp) * rcp_total.x + outputtail; \\\n\ + src1_f = (src1_f - inputZp) * rcp_total.x + outputtail; \\\n\ + _viv_asm(CONV_RTE, dst0, src0_f); \\\n\ + _viv_asm(CONV_RTE, dst1, src1_f); \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8); \\\n\ + _viv_asm(COPY, out, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, out, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x = coord.x + 8; \\\n\ + } while (coord.x < axis_size); \\\n\ +}\n\ +L1_NORM_AXIS0_SH(U8, F16,vxc_uchar8,vxc_uchar8,half4, vxc_half8, vxc_short8)\n\ +L1_NORM_AXIS0_SH(U8, U8, vxc_uchar8,vxc_uchar8,short4,vxc_uchar8,vxc_uchar8)\n\ +L1_NORM_AXIS0_SH(I8, F16,vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +L1_NORM_AXIS0_SH(I8, I8, vxc_char8, vxc_char8, short4,vxc_char8, vxc_char8)\n\ +L1_NORM_AXIS0_SH(I16,F16,vxc_short8,vxc_short8,half4, vxc_half8, vxc_short8)\n\ +L1_NORM_AXIS0_SH(I16,I16,vxc_short8,vxc_short8,short4,vxc_short8,vxc_short8)\n\ +L1_NORM_AXIS0_SH(F16,U8, vxc_short8,vxc_half8, short4,vxc_uchar8,vxc_uchar8)\n\ +L1_NORM_AXIS0_SH(F16,I8, vxc_short8,vxc_half8, short4,vxc_char8, vxc_char8)\n\ +L1_NORM_AXIS0_SH(F16,I16,vxc_short8,vxc_half8, short4,vxc_short8,vxc_short8)\n\ +L1_NORM_AXIS0_SH(F16,F16,vxc_short8,vxc_half8, half4, vxc_half8, vxc_short8)\n\ +\n\ +\n\ +"; /* end of l1norm_axis0_vx*/ + +static const char l1norm_axis1_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define epsilon 1e-12\n\ +\n\ +_viv_uniform VXC_512Bits uniExtract8Bin_2x8;\n\ +_viv_uniform VXC_512Bits ExtractBin_part0_4x4;\n\ +_viv_uniform VXC_512Bits ExtractBin_part1_4x4;\n\ +\n\ +\n\ +#define L1_NORM_AXIS1_SH_2D(name0, name1, src_type, conv_type0, conv_type1, dst_type, save_type) \\\n\ +__kernel void l1norm_##name0##to##name1##_2D_axis1 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float inputZp, \\\n\ + float outputscale, \\\n\ + float outputtail, \\\n\ + int axis, \\\n\ + int axis_size) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0),0); \\\n\ + src_type v0; \\\n\ + conv_type0 src0; \\\n\ + dst_type dst; \\\n\ + conv_type1 dst0, dst1; \\\n\ + save_type out; \\\n\ + float4 src0_f, src1_f; \\\n\ + \\\n\ + float4 total0 
= 0, total1 = 0; \\\n\ + float4 rcp_total0 = 0, rcp_total1 = 0; \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, v0, 16); \\\n\ + coord.y = coord.y + 1; \\\n\ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \\\n\ + VXC_DP4x4(src1_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \\\n\ + src0_f = fabs(src0_f - inputZp); \\\n\ + src1_f = fabs(src1_f - inputZp); \\\n\ + total0 = total0 + src0_f; \\\n\ + total1 = total1 + src1_f; \\\n\ + } while (coord.y < axis_size); \\\n\ + \\\n\ + total0 = total0 > epsilon ? total0 : epsilon; \\\n\ + total1 = total1 > epsilon ? total1 : epsilon; \\\n\ + rcp_total0 = 1 / total0 * outputscale; \\\n\ + rcp_total1 = 1 / total1 * outputscale; \\\n\ + coord.y = 0; \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, v0, 16); \\\n\ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \\\n\ + VXC_DP4x4(src1_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \\\n\ + src0_f = (src0_f - inputZp) * rcp_total0 + outputtail; \\\n\ + src1_f = (src1_f - inputZp) * rcp_total1 + outputtail; \\\n\ + _viv_asm(CONV_RTE, dst0, src0_f); \\\n\ + _viv_asm(CONV_RTE, dst1, src1_f); \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8); \\\n\ + _viv_asm(COPY, out, dst, 16); \\\n\ + VXC_WriteImage(output, coord, out, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y = coord.y + 1; \\\n\ + } while (coord.y < axis_size); \\\n\ +}\n\ +L1_NORM_AXIS1_SH_2D(U8, F16,vxc_uchar8,vxc_uchar8,half4, vxc_half8, vxc_short8)\n\ +L1_NORM_AXIS1_SH_2D(U8, U8, vxc_uchar8,vxc_uchar8,short4,vxc_uchar8,vxc_uchar8)\n\ +L1_NORM_AXIS1_SH_2D(I8, F16,vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +L1_NORM_AXIS1_SH_2D(I8, I8, vxc_char8, vxc_char8, short4,vxc_char8, vxc_char8)\n\ +L1_NORM_AXIS1_SH_2D(I16,F16,vxc_short8,vxc_short8,half4, vxc_half8, vxc_short8)\n\ +L1_NORM_AXIS1_SH_2D(I16,I16,vxc_short8,vxc_short8,short4,vxc_short8,vxc_short8)\n\ +L1_NORM_AXIS1_SH_2D(F16,U8, vxc_short8,vxc_half8, short4,vxc_uchar8,vxc_uchar8)\n\ +L1_NORM_AXIS1_SH_2D(F16,I8, vxc_short8,vxc_half8, short4,vxc_char8, vxc_char8)\n\ +L1_NORM_AXIS1_SH_2D(F16,I16,vxc_short8,vxc_half8, short4,vxc_short8,vxc_short8)\n\ +L1_NORM_AXIS1_SH_2D(F16,F16,vxc_short8,vxc_half8, half4, vxc_half8, vxc_short8)\n\ +\n\ +\n\ +#define L1_NORM_AXIS1_SH(name0, name1, src_type, conv_type0, conv_type1, dst_type, save_type) \\\n\ +__kernel void l1norm_##name0##to##name1##_axis1 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float inputZp, \\\n\ + float outputscale, \\\n\ + float outputtail, \\\n\ + int axis, \\\n\ + int axis_size) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(1), 0); \\\n\ + src_type v0; \\\n\ + conv_type0 src0; \\\n\ + dst_type dst; \\\n\ + conv_type1 dst0, dst1; \\\n\ + save_type out; \\\n\ + float4 src0_f, src1_f; \\\n\ + \\\n\ + float4 total0 = 0, total1 = 0; \\\n\ + float4 rcp_total0 = 0, rcp_total1 = 0; \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage2DArray(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, v0, 16); \\\n\ + 
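/* step one row along axis 1; the loop accumulates |x - zp| per column */ \\\n\ +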
coord.y = coord.y + 1; \\\n\ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \\\n\ + VXC_DP4x4(src1_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \\\n\ + src0_f = fabs(src0_f - inputZp); \\\n\ + src1_f = fabs(src1_f - inputZp); \\\n\ + total0 = total0 + src0_f; \\\n\ + total1 = total1 + src1_f; \\\n\ + } while (coord.y < axis_size); \\\n\ + \\\n\ + total0 = total0 > epsilon ? total0 : epsilon; \\\n\ + total1 = total1 > epsilon ? total1 : epsilon; \\\n\ + rcp_total0 = 1 / total0 * outputscale; \\\n\ + rcp_total1 = 1 / total1 * outputscale; \\\n\ + coord.y = 0; \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage2DArray(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, v0, 16); \\\n\ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \\\n\ + VXC_DP4x4(src1_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \\\n\ + src0_f = (src0_f - inputZp) * rcp_total0 + outputtail; \\\n\ + src1_f = (src1_f - inputZp) * rcp_total1 + outputtail; \\\n\ + _viv_asm(CONV_RTE, dst0, src0_f); \\\n\ + _viv_asm(CONV_RTE, dst1, src1_f); \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8); \\\n\ + _viv_asm(COPY, out, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, out, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y = coord.y + 1; \\\n\ + } while (coord.y < axis_size); \\\n\ +}\n\ +L1_NORM_AXIS1_SH(U8, F16,vxc_uchar8,vxc_uchar8,half4, vxc_half8, vxc_short8)\n\ +L1_NORM_AXIS1_SH(U8, U8, vxc_uchar8,vxc_uchar8,short4,vxc_uchar8,vxc_uchar8)\n\ +L1_NORM_AXIS1_SH(I8, F16,vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +L1_NORM_AXIS1_SH(I8, I8, vxc_char8, vxc_char8, short4,vxc_char8, vxc_char8)\n\ +L1_NORM_AXIS1_SH(I16,F16,vxc_short8,vxc_short8,half4, vxc_half8, vxc_short8)\n\ +L1_NORM_AXIS1_SH(I16,I16,vxc_short8,vxc_short8,short4,vxc_short8,vxc_short8)\n\ +L1_NORM_AXIS1_SH(F16,U8, vxc_short8,vxc_half8, short4,vxc_uchar8,vxc_uchar8)\n\ +L1_NORM_AXIS1_SH(F16,I8, vxc_short8,vxc_half8, short4,vxc_char8, vxc_char8)\n\ +L1_NORM_AXIS1_SH(F16,I16,vxc_short8,vxc_half8, short4,vxc_short8,vxc_short8)\n\ +L1_NORM_AXIS1_SH(F16,F16,vxc_short8,vxc_half8, half4, vxc_half8, vxc_short8)\n\ +"; /* end of l1norm_axis1_vx*/ + +static const char l1norm_axis2_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define epsilon 1e-12\n\ +\n\ +_viv_uniform VXC_512Bits uniExtract8Bin_2x8;\n\ +_viv_uniform VXC_512Bits ExtractBin_part0_4x4;\n\ +_viv_uniform VXC_512Bits ExtractBin_part1_4x4;\n\ +\n\ +\n\ +#define L1_NORM_AXIS2_SH(name0, name1, src_type, conv_type0, conv_type1, dst_type, save_type) \\\n\ +__kernel void l1norm_##name0##to##name1##_axis2 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float inputZp, \\\n\ + float outputscale, \\\n\ + float outputtail, \\\n\ + int axis, \\\n\ + int axis_size) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + src_type v0; \\\n\ + conv_type0 src0; \\\n\ + dst_type dst; \\\n\ + conv_type1 dst0, dst1; \\\n\ + save_type out; \\\n\ + float4 src0_f, src1_f; \\\n\ + \\\n\ + float4 total0 = 0, total1 = 0; \\\n\ + float4 rcp_total0 = 0, rcp_total1 = 0; \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage2DArray(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, v0, 16); \\\n\ + coord.z = coord.z + 1; \\\n\ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \\\n\ + VXC_DP4x4(src1_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \\\n\ + src0_f = fabs(src0_f - inputZp); \\\n\ + src1_f = fabs(src1_f - inputZp); \\\n\ + total0 = total0 + src0_f; \\\n\ + total1 = total1 + src1_f; \\\n\ + } while (coord.z < axis_size); \\\n\ + \\\n\ + total0 = total0 > epsilon ? total0 : epsilon; \\\n\ + total1 = total1 > epsilon ? total1 : epsilon; \\\n\ + rcp_total0 = 1 / total0 * outputscale; \\\n\ + rcp_total1 = 1 / total1 * outputscale; \\\n\ + coord.z = 0; \\\n\ + do \\\n\ + { \\\n\ + VXC_ReadImage2DArray(v0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, v0, 16); \\\n\ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part0_4x4); \\\n\ + VXC_DP4x4(src1_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), ExtractBin_part1_4x4); \\\n\ + src0_f = (src0_f - inputZp) * rcp_total0 + outputtail; \\\n\ + src1_f = (src1_f - inputZp) * rcp_total1 + outputtail; \\\n\ + _viv_asm(CONV_RTE, dst0, src0_f); \\\n\ + _viv_asm(CONV_RTE, dst1, src1_f); \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Bin_2x8); \\\n\ + _viv_asm(COPY, out, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, out, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z = coord.z + 1; \\\n\ + } while (coord.z < axis_size); \\\n\ +}\n\ +L1_NORM_AXIS2_SH(U8, F16,vxc_uchar8,vxc_uchar8,half4, vxc_half8, vxc_short8)\n\ +L1_NORM_AXIS2_SH(U8, U8, vxc_uchar8,vxc_uchar8,short4,vxc_uchar8,vxc_uchar8)\n\ +L1_NORM_AXIS2_SH(I8, F16,vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +L1_NORM_AXIS2_SH(I8, I8, vxc_char8, vxc_char8, short4,vxc_char8, vxc_char8)\n\ +L1_NORM_AXIS2_SH(I16,F16,vxc_short8,vxc_short8,half4, vxc_half8, vxc_short8)\n\ +L1_NORM_AXIS2_SH(I16,I16,vxc_short8,vxc_short8,short4,vxc_short8,vxc_short8)\n\ +L1_NORM_AXIS2_SH(F16,U8, vxc_short8,vxc_half8, short4,vxc_uchar8,vxc_uchar8)\n\ +L1_NORM_AXIS2_SH(F16,I8, vxc_short8,vxc_half8, short4,vxc_char8, vxc_char8)\n\ +L1_NORM_AXIS2_SH(F16,I16,vxc_short8,vxc_half8, short4,vxc_short8,vxc_short8)\n\ +L1_NORM_AXIS2_SH(F16,F16,vxc_short8,vxc_half8, half4, vxc_half8, vxc_short8)\n\ +"; /* end of l1norm_axis2_vx*/ + static const char l2normalizescale_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ #define VXC_Vstore3(Pointer, Offset, Data) \\\n\ @@ -14284,7 +16061,7 @@ static const char layer_normalization_3_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ /**************************layernorm float16***********************************/\n\ _viv_uniform int width;\n\ -_viv_uniform float dimRatio;\n\ +_viv_uniform float inv_multiplier;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ @@ -14328,9 +16105,9 @@ __kernel void layer_norm_BF16F32toBF16(\n\ sqr += dot(srcA * srcA, ones) + dot(srcB * srcB, ones);\n\ }\n\ vxc_float mean;\n\ - mean = sum * dimRatio;\n\ + mean = sum * inv_multiplier;\n\ vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ + vari = sqr*inv_multiplier - mean*mean;\n\ vari += eps;\n\ vari = rsqrt(vari);\n\ vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ @@ -14401,9 +16178,9 @@ __kernel void 
layer_norm_BF16F32toBF16_2D(\n\ sqr += dot(srcA * srcA, ones) + dot(srcB * srcB, ones);\n\ }\n\ vxc_float mean;\n\ - mean = sum * dimRatio;\n\ + mean = sum * inv_multiplier;\n\ vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ + vari = sqr*inv_multiplier - mean*mean;\n\ vari += eps;\n\ vari = rsqrt(vari);\n\ vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ @@ -21319,6 +23096,7 @@ __kernel void gemm_transa_##src0_type_name##src1_type_name##to##dst_type_name( \ GEMM_TRANSA_QINT(U8, U8, U8, vxc_uchar16, vxc_uchar16, vxc_uchar16)\n\ GEMM_TRANSA_QINT(I8, I8, I8, vxc_char16, vxc_char16, vxc_char16)\n\ GEMM_TRANSA_QINT(I16, I16, I16, vxc_short8, vxc_short8, vxc_short8)\n\ +GEMM_TRANSA_QINT(U8, I16, I16, vxc_uchar16, vxc_short8, vxc_short8)\n\ \n\ #define GEMM_TRANSA_INPUTB_F16(src0_type_name, read0_type) \\\n\ __kernel void gemm_transa_##src0_type_name##F16to##src0_type_name( \\\n\ @@ -22687,6 +24465,211 @@ GEMM_QINT_F16_TO_QINT(I16, vxc_short8)\n\ \n\ "; /* end of matrixmul_u8f16_u8_vx*/ +static const char matrixmul_u8i16_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int input0_ZP;\n\ +_viv_uniform int input1_ZP;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32B_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform int ac2zero;\n\ +_viv_uniform int bc2zero;\n\ +\n\ +#define GEMM_QINT_TO_QINT(src0_type_name, src1_type_name, dst_type_name, read0_type, read1_type, write_type) \\\n\ +__kernel void gemm_##src0_type_name##src1_type_name##to##dst_type_name( \\\n\ + image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ + int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N) \\\n\ +{ \\\n\ + uint gidy = get_global_id(1); \\\n\ + read0_type srcA; \\\n\ + read1_type srcB; \\\n\ + write_type outC; \\\n\ + \\\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0); \\\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0); \\\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0); \\\n\ + short in0_zp, in1_zp; \\\n\ + _viv_asm(COPY, in0_zp, input0_ZP, 4); \\\n\ + _viv_asm(COPY, in1_zp, input1_ZP, 4); \\\n\ + \\\n\ + int8 inputA_desc, inputB_desc, output_desc; \\\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; \\\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a); \\\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); \\\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b); \\\n\ + \\\n\ + for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;) \\\n\ + { \\\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3; \\\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3; \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_a.x += 4; \\\n\ + coord_b.y += 4; \\\n\ + VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvertUint8SubZpToFp32B_4x4); \\\n\ + sum0 = (sum0 + tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3); \\\n\ + sum1 = (sum1 + tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3); \\\n\ + sum2 = (sum2 + tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); \\\n\ + sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \\\n\ + } \\\n\ + vxc_int4 tmpOut0, tmpOut1; \\\n\ + coord_b.y = gidy; \\\n\ + coord_b.z = 
get_global_id(2); \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP); \\\n\ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP); \\\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_b.y++; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GEMM_QINT_TO_QINT(U8, I16, I16, vxc_uchar8, vxc_short8, vxc_short8)\n\ +\n\ +__kernel void gemm_transb_U8I16toI16(image2d_array_t inputA,\n\ + image2d_array_t inputB, image2d_array_t output,\n\ + int transposeA, int transposeB, int adjointA, int adjointB,\n\ + uint M, uint K, uint N)\n\ +{\n\ + uint gidy = get_global_id(1);\n\ + vxc_uchar8 srcA;\n\ + vxc_short8 srcB;\n\ + vxc_short8 outC;\n\ +\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(0, get_global_id(0), (bc2zero ? 
0 : get_global_id(2)), 0);\n\ + vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);\n\ + short in0_zp, in1_zp;\n\ + _viv_asm(COPY, in0_zp, input0_ZP, 4);\n\ + _viv_asm(COPY, in1_zp, input1_ZP, 4);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ +\n\ + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;)\n\ + {\n\ + vxc_float4 tempA0, tempA1, tempA2, tempA3;\n\ + vxc_float4 tempB0, tempB1, tempB2, tempB3;\n\ + vxc_float4 sum_dot0, sum_dot1, sum_dot2, sum_dot3;\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tempA0, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniConvertUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tempB0, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniConvertUint8SubZpToFp32B_4x4);\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tempA1, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniConvertUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tempB1, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniConvertUint8SubZpToFp32B_4x4);\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tempA2, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniConvertUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tempB2, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniConvertUint8SubZpToFp32B_4x4);\n\ + VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 4;\n\ + coord_b.x += 4;\n\ + VXC_DP4x4(tempA3, srcA, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniConvertUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tempB3, srcB, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniConvertUint8SubZpToFp32B_4x4);\n\ +\n\ + sum0 = sum0 + (vxc_float4)(dot(tempA0,tempB0),dot(tempA0,tempB1),dot(tempA0,tempB2),dot(tempA0,tempB3));\n\ + sum1 = sum1 + (vxc_float4)(dot(tempA1,tempB0),dot(tempA1,tempB1),dot(tempA1,tempB2),dot(tempA1,tempB3));\n\ + sum2 = sum2 + (vxc_float4)(dot(tempA2,tempB0),dot(tempA2,tempB1),dot(tempA2,tempB2),dot(tempA2,tempB3));\n\ + sum3 = sum3 + (vxc_float4)(dot(tempA3,tempB0),dot(tempA3,tempB1),dot(tempA3,tempB2),dot(tempA3,tempB3));\n\ + }\n\ + vxc_int4 tmpOut0, tmpOut1;\n\ + coord_b.y = gidy;\n\ + coord_b.z = get_global_id(2);\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + 
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + tmpOut0 = convert_int4_rte(sum0 * outputScale + output_ZP);\n\ + tmpOut1 = convert_int4_rte(sum1 * outputScale + output_ZP);\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, outC,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, outC,\n\ + VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + tmpOut0 = convert_int4_rte(sum2 * outputScale + output_ZP);\n\ + tmpOut1 = convert_int4_rte(sum3 * outputScale + output_ZP);\n\ + VXC_DP2x8(outC, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, outC,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, outC,\n\ + VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of matrixmul_u8i16_i16_vx*/ + static const char matrixmul_u8u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int input0_ZP;\n\ @@ -28060,7 +30043,8 @@ __kernel void pre_process_nv12_copy_##name \\\n\ float bMean, \\\n\ float var, \\\n\ int reverse_channel, \\\n\ - int trans \\\n\ + int trans, \\\n\ + int nv_type \\\n\ ) \\\n\ { \\\n\ int gidx = get_global_id(0); \\\n\ @@ -28075,6 +30059,11 @@ __kernel void pre_process_nv12_copy_##name \\\n\ \\\n\ VXC_ReadImage(Y, y_img, (int2)(sx,sy), 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(UV, uv_img,(int2)(uvX,uvY), 0,VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + if (nv_type == 1) \\\n\ + { \\\n\ + UV.s0123 = UV.s1032; \\\n\ + } \\\n\ \\\n\ vxc_char16 tmpUV; \\\n\ short tmpVal = 128; \\\n\ @@ -28156,7 +30145,8 @@ __kernel void pre_process_nv12_scale_##name##_gq \\\n\ float bMean, \\\n\ float var, \\\n\ int reverse_channel, \\\n\ - int trans \\\n\ + int trans, \\\n\ + int nv_type \\\n\ ) \\\n\ { \\\n\ uint4 gidx = get_global_id(0); \\\n\ @@ -28175,6 +30165,11 @@ __kernel void pre_process_nv12_scale_##name##_gq \\\n\ int2 coord_uv = (int2)(uvX.x, uvY); \\\n\ VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + if (nv_type == 1) \\\n\ + { \\\n\ + UV.s0123456789abcdef = UV.s1032547698badcfe; \\\n\ + } \\\n\ \\\n\ vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \\\n\ vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \\\n\ @@ -28246,7 +30241,8 @@ __kernel void pre_process_nv12_scale_##name \\\n\ float bMean, \\\n\ float var, \\\n\ int reverse_channel, \\\n\ - int trans \\\n\ + int trans, \\\n\ + int nv_type \\\n\ ) \\\n\ { \\\n\ uint4 gidx = get_global_id(0); \\\n\ @@ -28279,6 +30275,11 @@ __kernel void pre_process_nv12_scale_##name \\\n\ VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_uv.x = uvX.w; \\\n\ VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + if (nv_type == 1) \\\n\ + { \\\n\ + UV.s01234567 = UV.s10325476; \\\n\ + } \\\n\ \\\n\ vxc_char16 tmpUV; \\\n\ short tmpVal = 128; \\\n\ @@ -30678,7 +32679,7 @@ _viv_uniform VXC_512Bits uniConvertYUV422toG_4x4;\n\ _viv_uniform VXC_512Bits 
uniConvertYUV422toR_4x4;\n\ \n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ -_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractYUVtoShortSub_2x8;\n\ \n\ #define YUV422_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ __kernel void pre_process_yuv422_copy_##name \\\n\ @@ -30702,7 +32703,7 @@ __kernel void pre_process_yuv422_copy_##name \\\n\ int gidy = get_global_id(1); \\\n\ \\\n\ int sy = gidy + (*yOffset); \\\n\ - int sx = gidx + (*xOffset * 2); \\\n\ + int sx = gidx * 2 + (*xOffset * 2); \\\n\ \\\n\ vxc_uchar8 YUV; \\\n\ vxc_short8 tmpYUV; \\\n\ @@ -30713,11 +32714,10 @@ __kernel void pre_process_yuv422_copy_##name \\\n\ { \\\n\ YUV.s01234567 = YUV.s10325476; \\\n\ } \\\n\ -\\\n\ - short tmpVal = 128; \\\n\ - VXC_DP2x8(tmpYUV, YUV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \\\n\ \\\n\ float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + vxc_short2 value = (vxc_short2)(128,16); \\\n\ + VXC_DP2x8(tmpYUV, YUV, value, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractYUVtoShortSub_2x8); \\\n\ VXC_DP4x4(tmpDstB, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \\\n\ VXC_DP4x4(tmpDstG, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \\\n\ VXC_DP4x4(tmpDstR, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \\\n\ @@ -30772,6 +32772,7 @@ _viv_uniform VXC_512Bits uniConvertYUV422toR_4x4;\n\ \n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractYtoShortSub16_4x4;\n\ \n\ #define uyvy422 1\n\ \n\ @@ -30800,12 +32801,13 @@ __kernel void pre_process_yuv422_scale_##name \\\n\ uint dy = (convert_uint(gidy) * yrIntFloat_16) >> 16; \\\n\ uint4 dx = (convert_uint4(gidx) * xrIntFloat_16) >> 16; \\\n\ int sy = convert_int(dy) + (*yOffset); \\\n\ - int4 sx = convert_int4(dx)+ (*xOffset * 2); \\\n\ + int4 sx = (convert_int4(dx)+ *xOffset) * 2; \\\n\ \\\n\ vxc_uchar4 Y; \\\n\ vxc_uchar8 UV; \\\n\ + vxc_short4 tmpY; \\\n\ vxc_char8 tmpUV; \\\n\ - short tmpVal = 128; \\\n\ + short tmpVal = 16; \\\n\ int y_offset = 0; \\\n\ int u_offset = 1; \\\n\ int v_offset = 3; \\\n\ @@ -30817,19 +32819,19 @@ __kernel void pre_process_yuv422_scale_##name \\\n\ v_offset = 2; \\\n\ } \\\n\ \\\n\ - int4 coord_Y = (int4)(sx.x * 2 + y_offset, sy, 0, 0); \\\n\ - int4 coord_U = (int4)((sx.x >> 1) * 4 + u_offset, sy, 0, 0); \\\n\ - int4 coord_V = (int4)((sx.x >> 1) * 4 + v_offset, sy, 0, 0); \\\n\ + int4 coord_Y = (int4)(sx.x + y_offset, sy, 0, 0); \\\n\ + int4 coord_U = (int4)((sx.x >> 1) * 2 + u_offset, sy, 0, 0); \\\n\ + int4 coord_V = (int4)((sx.x >> 1) * 2 + v_offset, sy, 0, 0); \\\n\ \\\n\ VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ - coord_Y.x = sx.y * 2 + y_offset; \\\n\ + coord_Y.x = sx.y + y_offset; \\\n\ VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ - coord_Y.x = sx.z * 2 + y_offset; \\\n\ + coord_Y.x = sx.z + y_offset; \\\n\ VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ - coord_Y.x = sx.w * 2 + y_offset; \\\n\ + coord_Y.x = sx.w + y_offset; \\\n\ VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ - sx = (sx >> 1) * 4 + u_offset; \\\n\ + sx = (sx >> 1) * 2 + u_offset; \\\n\ VXC_ReadImage2DArray(UV, input, coord_U, 0, 
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_U.x = sx.y; \\\n\ VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ @@ -30846,14 +32848,16 @@ __kernel void pre_process_yuv422_scale_##name \\\n\ VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_V.x = sx.w; \\\n\ VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tmpY, Y, tmpVal, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractYtoShortSub16_4x4); \\\n\ + tmpVal = 128; \\\n\ VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \\\n\ vxc_uchar4 dst_test; \\\n\ VXC_DP2x8(dst_test, dx, dx, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ \\\n\ float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \\\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \\\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \\\n\ + VXC_DP4x4(tmpDstB, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \\\n\ + VXC_DP4x4(tmpDstG, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \\\n\ + VXC_DP4x4(tmpDstR, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \\\n\ \\\n\ conv_type result; \\\n\ dst_type dst0; \\\n\ @@ -34910,6 +36914,7 @@ __kernel void resize_1d_bilinear_BF16toBF16_DOWN\n\ coord_in.x = left_x_idx.w;\n\ VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.x;\n\ \n\ vxc_ushort8 src;\n\ float4 left4;\n\ @@ -35206,6 +37211,7 @@ __kernel void resize_1d_bilinear_F16toF16_DOWN\n\ coord_in.x = left_x_idx.w;\n\ VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.x;\n\ \n\ _viv_asm(COPY, src_half, src, 16);\n\ \n\ @@ -35271,6 +37277,7 @@ __kernel void resize_1d_bilinear_F16toU8_DOWN\n\ coord_in.x = left_x_idx.w;\n\ VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.x;\n\ \n\ _viv_asm(COPY, src_half, src, 16);\n\ \n\ @@ -35493,6 +37500,7 @@ __kernel void resize_1d_bilinear_I16toI16_DOWN\n\ coord_in.x = left_x_idx.w;\n\ VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.x;\n\ \n\ VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_left_4x4);\n\ VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_right_4x4);\n\ @@ -35643,6 +37651,7 @@ __kernel void resize_1d_bilinear_I8toI8_DOWN\n\ coord_in.x = left_x_idx.w;\n\ VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.x;\n\ \n\ VXC_DP4x4(left4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_left_4x4);\n\ VXC_DP4x4(right4, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDFP2FP32_right_4x4);\n\ @@ -35691,11 +37700,15 @@ __kernel void resize_1d_bilinear_U8toF16_DOWN\n\ float4 in_x = 
(convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;\n\ float4 left_x_f = floor(in_x);\n\ float4 x_lerp = in_x - left_x_f;\n\ + float4 one_minus_lerp = 1 - x_lerp;\n\ int4 left_x_idx = convert_int4(left_x_f);\n\ vxc_uchar16 src;\n\ int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);\n\ float4 left4;\n\ float4 right4;\n\ +\n\ + x_lerp = x_lerp * uint8Scale;\n\ + one_minus_lerp = one_minus_lerp * uint8Scale;\n\ \n\ int8 input_desc;\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ @@ -35708,7 +37721,6 @@ __kernel void resize_1d_bilinear_U8toF16_DOWN\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ -\n\ \n\ do\n\ {\n\ @@ -35723,12 +37735,11 @@ __kernel void resize_1d_bilinear_U8toF16_DOWN\n\ coord_in.x = left_x_idx.w;\n\ VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.x;\n\ \n\ VXC_DP4x4(left4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_right_4x4);\n\ - right4 -= left4;\n\ - float4 dst4 = right4 * x_lerp + left4;\n\ - dst4 *= uint8Scale;\n\ + float4 dst4 = right4 * x_lerp + left4 * one_minus_lerp;\n\ half4 dst;\n\ _viv_asm(CONV, dst, dst4);\n\ vxc_short8 dst_short;\n\ @@ -35758,7 +37769,6 @@ __kernel void resize_1d_bilinear_U8toU8_UP\n\ int4 left_x_idx = convert_int4(left_x_f);\n\ float4 right_x_f = ceil(in_x);\n\ int4 right_x_idx = convert_int4(right_x_f);\n\ -\n\ \n\ vxc_uchar16 src0, src1;\n\ \n\ @@ -35825,6 +37835,7 @@ __kernel void resize_1d_bilinear_U8toU8_DOWN\n\ float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_x - half_pixel_value;\n\ float4 left_x_f = floor(in_x);\n\ float4 x_lerp = in_x - left_x_f;\n\ + float4 one_minus_lerp = 1 - x_lerp;\n\ int4 left_x_idx = convert_int4(left_x_f);\n\ vxc_uchar16 src;\n\ int4 coord_in = (int4)(left_x_idx.x, coord_out.y, coord_out.z, 0);\n\ @@ -35843,6 +37854,8 @@ __kernel void resize_1d_bilinear_U8toU8_DOWN\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ \n\ + x_lerp = x_lerp * uint8Scale;\n\ + one_minus_lerp = one_minus_lerp * uint8Scale;\n\ \n\ do\n\ {\n\ @@ -35857,12 +37870,11 @@ __kernel void resize_1d_bilinear_U8toU8_DOWN\n\ coord_in.x = left_x_idx.w;\n\ VXC_OP4(img_load_3d, src, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.x;\n\ \n\ VXC_DP4x4(left4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, src, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_right_4x4);\n\ - right4 -= left4;\n\ - float4 dst4 = right4 * x_lerp + left4;\n\ - dst4 = dst4 * uint8Scale + output_ZP;\n\ + float4 dst4 = right4 * x_lerp + left4 * one_minus_lerp + output_ZP;\n\ int4 dst = convert_int4_rte(dst4);\n\ \n\ VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ @@ -39770,6 +41782,103 @@ __kernel void scatter_nd_update_F16F16toU8_big(\n\ }\n\ "; /* end of scatter_nd_update_big_vx*/ +static const char scatter_nd_update_special_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\ +_viv_uniform VXC_512Bits 
uniU8MulAndPostShift0_Hi_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Hi_2x8;\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ +\n\ +_viv_uniform int update_width;\n\ +_viv_uniform int output_width;\n\ +\n\ +_viv_uniform int offsetX;\n\ +_viv_uniform int offsetY;\n\ +_viv_uniform int offsetZ;\n\ +_viv_uniform int offsetW;\n\ +_viv_uniform int offset_idx;\n\ +\n\ +#define SCATTER_ND_UPDATE_REF2OUT_8BITS(src0_type, data_type) \\\n\ +__kernel void scatter_nd_update_ref2out_##src0_type##to##src0_type( \\\n\ + __read_only image2d_t input_ref, \\\n\ + image2d_t temp_ref, \\\n\ + image2d_t output0 \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + Image img0 = create_image_from_image2d(input_ref, 1); \\\n\ + Image img1 = create_image_from_image2d(temp_ref, 1); \\\n\ + __global data_type* in_ptr = (__global data_type*)img0.ptr; \\\n\ + __global data_type* out_ptr = (__global data_type*)img1.ptr; \\\n\ + data_type src, dst; \\\n\ + src = in_ptr[gidx]; \\\n\ + vxc_ushort8 mp0; \\\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(dst, src, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift0_Lo_2x8); \\\n\ + VXC_DP2x8(dst, src, mp0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift0_Hi_2x8); \\\n\ + out_ptr[gidx] = dst; \\\n\ +}\n\ +SCATTER_ND_UPDATE_REF2OUT_8BITS(U8, vxc_uchar16)\n\ +SCATTER_ND_UPDATE_REF2OUT_8BITS(I8, vxc_char16)\n\ +\n\ +#define SCATTER_ND_UPDATE_UPDATE2REF_8BITS(src0_type, data_type) \\\n\ +__kernel void scatter_nd_update_update2ref_##src0_type##to##src0_type##_16x( \\\n\ + __read_only image2d_t input_index, \\\n\ + __read_only image2d_t input_update, \\\n\ + image2d_t temp_ref, \\\n\ + image2d_t input0, \\\n\ + image2d_t output1, \\\n\ + int width, int area, int vol, int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(input_index, 4); \\\n\ + Image img2 = create_image_from_image2d(input_update, 1); \\\n\ + Image img3 = create_image_from_image2d(temp_ref, 1); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global data_type* update_ptr = (__global data_type*)img2.ptr; \\\n\ + __global data_type* output_ptr = (__global data_type*)img3.ptr; \\\n\ + data_type dst; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim + offset_idx); \\\n\ + data_type src = update_ptr[gidy * update_width + gidx]; \\\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \\\n\ + int loc = idx * output_width + gidx; \\\n\ + vxc_ushort8 mp1; \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + VXC_DP2x8(dst, src, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift1_Lo_2x8); \\\n\ + VXC_DP2x8(dst, src, mp1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift1_Hi_2x8); \\\n\ + output_ptr[loc] = dst; \\\n\ +}\n\ +SCATTER_ND_UPDATE_UPDATE2REF_8BITS(U8, vxc_uchar16)\n\ +SCATTER_ND_UPDATE_UPDATE2REF_8BITS(I8, vxc_char16)\n\ +\n\ +#define SCATTER_ND_UPDATE_COPY2OUT(src0_type, data_type, element_size) \\\n\ +__kernel void scatter_nd_update_cpy2out_##src0_type##to##src0_type( \\\n\ + __read_only image2d_t temp_ref, \\\n\ + image2d_t input1, \\\n\ + image2d_t output \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = 
get_global_id(0); \\\n\ + Image img0 = create_image_from_image2d(temp_ref, element_size); \\\n\ + Image img1 = create_image_from_image2d(output, element_size); \\\n\ + __global data_type* in_ptr = (__global data_type*)img0.ptr; \\\n\ + __global data_type* out_ptr = (__global data_type*)img1.ptr; \\\n\ + out_ptr[gidx] = in_ptr[gidx]; \\\n\ +}\n\ +SCATTER_ND_UPDATE_COPY2OUT(U8, vxc_uchar16, 1)\n\ +SCATTER_ND_UPDATE_COPY2OUT(I8, vxc_char16, 1)\n\ +"; /* end of scatter_nd_update_special_vx*/ + static const char select_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniConvConditiontoDst_2x8;\n\ @@ -43677,6 +45786,167 @@ __kernel void argmin_axis2_I32toI32_2D\n\ \n\ "; /* end of argmin_axis2_cl*/ +static const char avg_pool3d_cl[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define TENSOR_AVG_POOL3D(src_name, dst_name, src_type, dst_type,\\\n\ + readimage_type, conv_mode, writeimage_type) \\\n\ +__kernel void avg_pool3d_##src_name##to##dst_name ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int ksize_x, \\\n\ + int ksize_y, \\\n\ + int ksize_z, \\\n\ + int stride_x, \\\n\ + int stride_y, \\\n\ + int stride_z, \\\n\ + int pad_left, \\\n\ + int pad_top, \\\n\ + int pad_front, \\\n\ + int width, \\\n\ + int height, \\\n\ + int depth_in, \\\n\ + int depth_out, \\\n\ + float inputScale, \\\n\ + float inputTail, \\\n\ + float outputScale, \\\n\ + float outputTail, \\\n\ + int count_include_pad) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int offsetz = get_global_id(2); \\\n\ + int offsetz2 = offsetz / depth_out * depth_in; \\\n\ + int d, d2, h, w, count; \\\n\ + float sum = 0; \\\n\ + dst_type out_data = (dst_type)(0); \\\n\ + src_type in_data; \\\n\ + float in_f32, out_f32; \\\n\ + int wstart = gidx * stride_x - pad_left; \\\n\ + int hstart = gidy * stride_y - pad_top; \\\n\ + int wend = min(wstart + ksize_x, width); \\\n\ + int hend = min(hstart + ksize_y, height); \\\n\ + int dstart, dend; \\\n\ + int4 coord_in, coord_out; \\\n\ + wstart = max(wstart, 0); \\\n\ + hstart = max(hstart, 0); \\\n\ + for (d2 = 0; d2 < depth_out; d2++) \\\n\ + { \\\n\ + dstart = d2 * stride_z - pad_front; \\\n\ + dend = min(dstart + ksize_z, depth_in); \\\n\ + dstart = max(dstart, 0); \\\n\ + coord_out = (int4)(gidx, gidy, offsetz + d2, 0); \\\n\ + sum = 0; \\\n\ + count = 0; \\\n\ + for (d = dstart; d < dend; d++) \\\n\ + { \\\n\ + for (h = hstart; h < hend; h++) \\\n\ + { \\\n\ + for (w = wstart; w < wend; w++) \\\n\ + { \\\n\ + coord_in = (int4)(w, h, d + offsetz2, 0); \\\n\ + in_data = readimage_type(input, coord_in).x; \\\n\ + in_f32 = convert_float(in_data) * inputScale + inputTail; \\\n\ + sum += in_f32; \\\n\ + count++; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + if (count_include_pad == 1) \\\n\ + { \\\n\ + count = ksize_x * ksize_y * ksize_z; \\\n\ + } \\\n\ + out_f32 = (sum / count) * outputScale + outputTail; \\\n\ + out_data.x = conv_mode(out_f32); \\\n\ + writeimage_type(output, coord_out, out_data); \\\n\ + } \\\n\ +}\n\ +\n\ +TENSOR_AVG_POOL3D(F32, F32, float, float4, read_imagef, convert_float, write_imagef)\n\ +TENSOR_AVG_POOL3D(F32, U32, float, uint4, read_imagef, convert_uint, write_imageui)\n\ +TENSOR_AVG_POOL3D(F32, I32, float, int4, read_imagef, convert_int, write_imagei)\n\ +\n\ +TENSOR_AVG_POOL3D(U32, U32, uint, uint4, read_imageui, convert_uint, write_imageui)\n\ +TENSOR_AVG_POOL3D(U32, F32, uint, float4, read_imageui, 
convert_float, write_imagef)\n\ +TENSOR_AVG_POOL3D(U32, I32, uint, int4, read_imageui, convert_int, write_imagei)\n\ +\n\ +TENSOR_AVG_POOL3D(I32, I32, int, int4, read_imagei, convert_int, write_imagei)\n\ +TENSOR_AVG_POOL3D(I32, F32, int, float4, read_imagei, convert_float, write_imagef)\n\ +TENSOR_AVG_POOL3D(I32, U32, int, uint4, read_imagei, convert_uint, write_imageui)\n\ +\n\ +__kernel void avg_pool3d_BF16toBF16 (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int ksize_x,\n\ + int ksize_y,\n\ + int ksize_z,\n\ + int stride_x,\n\ + int stride_y,\n\ + int stride_z,\n\ + int pad_left,\n\ + int pad_top,\n\ + int pad_front,\n\ + int width,\n\ + int height,\n\ + int depth_in,\n\ + int depth_out,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputTail,\n\ + int count_include_pad)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int offsetz = get_global_id(2);\n\ + int offsetz2 = offsetz / depth_out * depth_in;\n\ + int d, d2, h, w, count;\n\ + float sum = 0;\n\ + uint4 out_data = (uint4)(0);\n\ + uint4 in_data;\n\ + float in_f32, out_f32;\n\ + int wstart = gidx * stride_x - pad_left;\n\ + int hstart = gidy * stride_y - pad_top;\n\ + int wend = min(wstart + ksize_x, width);\n\ + int hend = min(hstart + ksize_y, height);\n\ + int dstart, dend;\n\ + int4 coord_in, coord_out;\n\ + wstart = max(wstart, 0);\n\ + hstart = max(hstart, 0);\n\ + for (d2 = 0; d2 < depth_out; d2++)\n\ + {\n\ + dstart = d2 * stride_z - pad_front;\n\ + dend = min(dstart + ksize_z, depth_in);\n\ + dstart = max(dstart, 0);\n\ + coord_out = (int4)(gidx, gidy, offsetz + d2, 0);\n\ + sum = 0;\n\ + count = 0;\n\ + for (d = dstart; d < dend; d++)\n\ + {\n\ + for (h = hstart; h < hend; h++)\n\ + {\n\ + for (w = wstart; w < wend; w++)\n\ + {\n\ + coord_in = (int4)(w, h, d + offsetz2, 0);\n\ + in_data = read_imageui(input, coord_in).x;\n\ + in_data = in_data << 16;\n\ + _viv_asm(COPY, in_f32, in_data, 16);\n\ + sum += in_f32;\n\ + count++;\n\ + }\n\ + }\n\ + }\n\ + if (count_include_pad == 1)\n\ + {\n\ + count = ksize_x * ksize_y * ksize_z;\n\ + }\n\ + out_f32 = sum / count;\n\ + _viv_asm(COPY, out_data, out_f32, 4);\n\ + out_data.x = out_data.x >> 16;\n\ + write_imageui(output, coord_out, out_data);\n\ + }\n\ +}"; /* end of avg_pool3d_cl*/ + static const char batchnorm_single_cl[] = "#define BN_U8_SAVE \\\n\ uint4 dst = convert_uint4(src * output_scale + output_zp); \\\n\ write_imageui(output, coord, dst);\n\ @@ -43897,6 +46167,118 @@ __kernel void batch_norm_I32to##TYPE##_2D \\\n\ BATCH_NORM_I32_SH_IMPL_2D(I32)\n\ BATCH_NORM_I32_SH_IMPL_2D(F32)"; /* end of batchnorm_single_cl*/ +static const char bilinear_grid_sample_cl[] = "__kernel void bilinear_grid_sample_F32_F32toF32(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float half_input0_w,\n\ + float half_input0_h,\n\ + float add_float_value_w,\n\ + float add_float_value_h,\n\ + int depth\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));\n\ + int2 coord_add = (int2)(-1, 1);\n\ +\n\ + float fx = read_imagef(input1, coord_in1).x;\n\ + coord_in1.x = coord_in1.x + 1;\n\ + float fy = read_imagef(input1, coord_in1).x;\n\ +\n\ + fx = fx * half_input0_w + add_float_value_w;\n\ + fy = fy * half_input0_h + add_float_value_h;\n\ + float x_f = floor(fx);\n\ + float y_f = floor(fy);\n\ + float x_lerp = fx - 
x_f;\n\ + float y_lerp = fy - y_f;\n\ + int x_index = convert_int(x_f);\n\ + int y_index = convert_int(y_f);\n\ + int4 coord_in = (int4)(x_index, y_index, 0, 0);\n\ +\n\ + float4 top_l, top_r, bottom_l, bottom_r, top, bottom, dst;\n\ +\n\ + while (coord_in.z < depth){\n\ + top_l = read_imagef(input0, coord_in);\n\ + coord_in.y++;\n\ + bottom_l = read_imagef(input0, coord_in);\n\ + coord_in.x++;\n\ + bottom_r = read_imagef(input0, coord_in);\n\ + coord_in.y--;\n\ + top_r = read_imagef(input0, coord_in);\n\ + top_r = top_r - top_l;\n\ + top = top_l + x_lerp * top_r;\n\ + bottom_r = bottom_r - bottom_l;\n\ + bottom = bottom_l + x_lerp * bottom_r;\n\ + bottom = bottom - top;\n\ + dst = top + y_lerp * bottom;\n\ + write_imagef(output, coord_out, dst);\n\ + coord_in.xz = coord_in.xz + coord_add;\n\ + coord_out.z++;\n\ + }\n\ +}\n\ +\n\ +\n\ +__kernel void bilinear_grid_sample_U8_U8toU8(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float half_input0_w,\n\ + float half_input0_h,\n\ + float add_float_value_w,\n\ + float add_float_value_h,\n\ + int depth,\n\ + float in0_scale,\n\ + float in0_tail,\n\ + float in1_scale,\n\ + float in1_tail,\n\ + float out_scale,\n\ + float out_tail\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));\n\ + int2 coord_add = (int2)(-1, 1);\n\ +\n\ + float fx = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;\n\ + coord_in1.x = coord_in1.x + 1;\n\ + float fy = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;\n\ +\n\ + fx = fx * half_input0_w + add_float_value_w;\n\ + fy = fy * half_input0_h + add_float_value_h;\n\ + float x_f = floor(fx);\n\ + float y_f = floor(fy);\n\ + float x_lerp = fx - x_f;\n\ + float y_lerp = fy - y_f;\n\ + int x_index = convert_int(x_f);\n\ + int y_index = convert_int(y_f);\n\ + int4 coord_in = (int4)(x_index, y_index, 0, 0);\n\ +\n\ + float4 top_l, top_r, bottom_l, bottom_r, top, bottom;\n\ + uint4 dst;\n\ +\n\ + while (coord_in.z < depth){\n\ + top_l = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;\n\ + coord_in.y++;\n\ + bottom_l = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;\n\ + coord_in.x++;\n\ + bottom_r = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;\n\ + coord_in.y--;\n\ + top_r = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;\n\ + top_r = top_r - top_l;\n\ + top = top_l + x_lerp * top_r;\n\ + bottom_r = bottom_r - bottom_l;\n\ + bottom = bottom_l + x_lerp * bottom_r;\n\ + bottom = bottom - top;\n\ + top = top + y_lerp * bottom;\n\ + dst = convert_uint4_rte(top * out_scale + out_tail);\n\ + write_imageui(output, coord_out, dst);\n\ + coord_in.xz = coord_in.xz + coord_add;\n\ + coord_out.z++;\n\ + }\n\ +\n\ +}"; /* end of bilinear_grid_sample_cl*/ + static const char bucketize_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm : enable\n\ \n\ #define BUCKETIZE_F32_2D_SH_IMPL(name, comp_op) \\\n\ @@ -44643,96 +47025,101 @@ static const char cumsum_cl[] = "__kernel void cumsum_F32toF32_axis2(\n\ }\n\ }\n\ \n\ -__kernel void cumsum_U8toU8_axis2(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ - int axis,\n\ - int exclusive,\n\ - int rev,\n\ - int width,\n\ - int height,\n\ - int channel,\n\ - int input_zp,\n\ - float in_out_scale,\n\ - float in_out_zp_scale,\n\ - float output_zp\n\ - )\n\ 
-{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - int4 coord_out = coord;\n\ -\n\ - uint4 sum = (uint4)(0);\n\ - uint4 dst = (uint4)(0);\n\ -\n\ - float cnt = 0.0f;\n\ -\n\ - if(exclusive && rev)\n\ - {\n\ - coord_out.z = channel - 1;\n\ - write_imageui(output, coord_out, dst);\n\ - for(coord.z = channel - 1; coord.z > 0; coord.z--)\n\ - {\n\ - uint4 data = read_imageui(input, coord);\n\ - coord_out.z--;\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord_out, dst);\n\ - }\n\ - }\n\ - else if(exclusive)\n\ - {\n\ - coord_out.z = 0;\n\ - write_imageui(output, coord_out, dst);\n\ - for(coord.z = 0; coord.z < channel - 1; coord.z++)\n\ - {\n\ - uint4 data = read_imageui(input, coord);\n\ - coord_out.z++;\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord_out, dst);\n\ - }\n\ - }\n\ - else if(rev)\n\ - {\n\ - for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ - {\n\ - uint4 data = read_imageui(input, coord);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord, dst);\n\ - }\n\ - }\n\ - else\n\ - {\n\ - for(coord.z = 0; coord.z < channel; coord.z++)\n\ - {\n\ - uint4 data = read_imageui(input, coord);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord, dst);\n\ - }\n\ - }\n\ +#define CUMSUM_toU8_AXIS2_SH(name, src_type, read_image_type) \\\n\ +__kernel void cumsum_##name##toU8_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int exclusive, \\\n\ + int rev, \\\n\ + int width, \\\n\ + int height, \\\n\ + int channel, \\\n\ + int input_zp, \\\n\ + float in_out_scale, \\\n\ + float in_out_zp_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type sum = (src_type)(0); \\\n\ + uint4 dst = (uint4)(0); \\\n\ + \\\n\ + float cnt = 0.0f; \\\n\ + \\\n\ + if(exclusive && rev) \\\n\ + { \\\n\ + coord_out.z = channel - 1; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + for(coord.z = channel - 1; coord.z > 0; coord.z--) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + coord_out.z--; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive) \\\n\ + { \\\n\ + coord_out.z = 0; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + for(coord.z = 0; coord.z < channel - 1; coord.z++) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + coord_out.z++; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + 
output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(rev) \\\n\ + { \\\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord, dst); \\\n\ + } \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord, dst); \\\n\ + } \\\n\ + } \\\n\ }\n\ +CUMSUM_toU8_AXIS2_SH(U8,uint4,read_imageui)\n\ +CUMSUM_toU8_AXIS2_SH(F32,float4,read_imagef)\n\ +\n\ +\n\ \n\ __kernel void cumsum_F32toF32_axis1(\n\ __read_only image2d_array_t input,\n\ @@ -44802,97 +47189,101 @@ __kernel void cumsum_F32toF32_axis1(\n\ }\n\ }\n\ \n\ -__kernel void cumsum_U8toU8_axis1(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ - int axis,\n\ - int exclusive,\n\ - int rev,\n\ - int width,\n\ - int height,\n\ - int channel,\n\ - int input_zp,\n\ - float in_out_scale,\n\ - float in_out_zp_scale,\n\ - float output_zp\n\ - )\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - int4 coord_out = coord;\n\ -\n\ - uint4 sum = (uint4)(0);\n\ - uint4 dst = (uint4)(0);\n\ -\n\ - float cnt = 0;\n\ -\n\ - if(exclusive && rev)\n\ - {\n\ - coord_out.y = height - 1;\n\ - write_imageui(output, coord_out, dst);\n\ -\n\ - for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ - {\n\ - uint4 data = read_imageui(input, coord);\n\ - cnt += 1.0f;\n\ - coord_out.y--;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord_out, dst);\n\ - }\n\ - }\n\ - else if(exclusive)\n\ - {\n\ - coord_out.y = 0;\n\ - write_imageui(output, coord_out, dst);\n\ - for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ - {\n\ - uint4 data = read_imageui(input, coord);\n\ - cnt += 1.0f;\n\ - coord_out.y++;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord_out, dst);\n\ - }\n\ - }\n\ - else if(rev)\n\ - {\n\ - for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ - {\n\ - uint4 data = read_imageui(input, coord);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord, dst);\n\ - }\n\ - }\n\ - else\n\ - {\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - uint4 data = read_imageui(input, coord);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = 
(uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord, dst);\n\ - }\n\ - }\n\ +#define CUMSUM_toU8_AXIS1_SH(name, src_type, read_image_type) \\\n\ +__kernel void cumsum_##name##toU8_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int exclusive, \\\n\ + int rev, \\\n\ + int width, \\\n\ + int height, \\\n\ + int channel, \\\n\ + int input_zp, \\\n\ + float in_out_scale, \\\n\ + float in_out_zp_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type sum = (src_type)(0); \\\n\ + uint4 dst = (uint4)(0); \\\n\ + \\\n\ + float cnt = 0; \\\n\ + \\\n\ + if(exclusive && rev) \\\n\ + { \\\n\ + coord_out.y = height - 1; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + \\\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; \\\n\ + coord_out.y--; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive) \\\n\ + { \\\n\ + coord_out.y = 0; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; \\\n\ + coord_out.y++; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(rev) \\\n\ + { \\\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord, dst); \\\n\ + } \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord, dst); \\\n\ + } \\\n\ + } \\\n\ }\n\ +CUMSUM_toU8_AXIS1_SH(U8,uint4,read_imageui)\n\ +CUMSUM_toU8_AXIS1_SH(F32,float4,read_imagef)\n\ +\n\ \n\ __kernel void cumsum_F32toF32_axis0(\n\ __read_only image2d_array_t input,\n\ @@ -44962,97 +47353,99 @@ __kernel void cumsum_F32toF32_axis0(\n\ }\n\ }\n\ \n\ -__kernel void cumsum_U8toU8_axis0(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ - int axis,\n\ - int exclusive,\n\ - int rev,\n\ - int width,\n\ - int height,\n\ - int channel,\n\ - int input_zp,\n\ - float in_out_scale,\n\ - float in_out_zp_scale,\n\ - float output_zp\n\ - )\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - int4 coord_out = coord;\n\ -\n\ - uint4 sum = (uint4)(0);\n\ - uint4 dst = (uint4)(0);\n\ -\n\ - float cnt = 0;\n\ -\n\ - 
if(exclusive && rev)\n\ - {\n\ - coord_out.x = width - 1;\n\ - write_imageui(output, coord_out, dst);\n\ - for(coord.x = width - 1; coord.x > 0; coord.x--)\n\ - {\n\ - uint4 data = read_imageui(input, coord);\n\ - coord_out.x--;\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord_out, dst);\n\ - }\n\ - }\n\ - else if(exclusive)\n\ - {\n\ - coord_out.x = 0;\n\ - write_imageui(output, coord_out, dst);\n\ - for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ - {\n\ - uint4 data = read_imageui(input, coord);\n\ - coord_out.x++;\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord_out, dst);\n\ - }\n\ - }\n\ - else if(rev)\n\ - {\n\ - for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ - {\n\ - uint4 data = read_imageui(input, coord);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord, dst);\n\ - }\n\ - }\n\ - else\n\ - {\n\ - for(coord.x = 0; coord.x < width; coord.x++)\n\ - {\n\ - uint4 data = read_imageui(input, coord);\n\ - cnt += 1.0f;\n\ - sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord, dst);\n\ - }\n\ - }\n\ +#define CUMSUM_toU8_AXIS0_SH(name, src_type, read_image_type) \\\n\ +__kernel void cumsum_##name##toU8_axis0( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int exclusive, \\\n\ + int rev, \\\n\ + int width, \\\n\ + int height, \\\n\ + int channel, \\\n\ + int input_zp, \\\n\ + float in_out_scale, \\\n\ + float in_out_zp_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type sum = (src_type)(0); \\\n\ + uint4 dst = (uint4)(0); \\\n\ + \\\n\ + float cnt = 0; \\\n\ + \\\n\ + if(exclusive && rev) \\\n\ + { \\\n\ + coord_out.x = width - 1; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + for(coord.x = width - 1; coord.x > 0; coord.x--) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + coord_out.x--; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive) \\\n\ + { \\\n\ + coord_out.x = 0; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + coord_out.x++; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(rev) \\\n\ + { \\\n\ + for(coord.x 
= width - 1; coord.x >= 0; coord.x--) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord, dst); \\\n\ + } \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + for(coord.x = 0; coord.x < width; coord.x++) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord, dst); \\\n\ + } \\\n\ + } \\\n\ }\n\ -"; /* end of cumsum_cl*/ +CUMSUM_toU8_AXIS0_SH(U8,uint4,read_imageui)\n\ +CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef)"; /* end of cumsum_cl*/ static const char cumsum_2d_cl[] = "\n\ __kernel void cumsum_F32toF32_axis1_2D(\n\ @@ -45210,6 +47603,95 @@ __kernel void cumsum_U8toU8_axis1_2D(\n\ }\n\ }\n\ \n\ +__kernel void cumsum_F32toU8_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + float4 sum = (float4)(0);\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + float cnt = 0;\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.w = height - 1;\n\ + write_imageui(output, coord.zw, dst);\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.w--;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + write_imageui(output, coord.zw, dst);\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.w++;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ __kernel void cumsum_F32toF32_axis0_2D(\n\ __read_only image2d_t input,\n\ __write_only image2d_t output,\n\ @@ -45368,7 +47850,97 @@ __kernel void cumsum_U8toU8_axis0_2D(\n\ }\n\ }\n\ 
}\n\ -"; /* end of cumsum_2d_cl*/ +\n\ +__kernel void cumsum_F32toU8_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + float4 sum = (float4)(0);\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + float cnt = 0.0f;\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.x = width - 1;\n\ + coord.z = coord.x;\n\ + write_imageui(output, coord.zw, dst);\n\ + for(; coord.x > 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + coord.z--;\n\ + cnt += 1.0;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord.z = 0;\n\ + write_imageui(output, coord.zw, dst);\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.z++;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ +}"; /* end of cumsum_2d_cl*/ static const char depth2space_crd_cl[] = "\n\ __kernel void depth2space_crd_F32toF32(\n\ @@ -45748,6 +48320,32 @@ float eltwise_unary_softsign(float val, float alpha, float rcp_alpha)\n\ return val / (1.0f + fabs(val));\n\ }\n\ \n\ +float eltwise_unary_atan(float x, float alpha, float beta)\n\ +{\n\ + return atan(x);\n\ +}\n\ +\n\ +float eltwise_unary_atanh(float x, float alpha, float beta)\n\ +{\n\ + return atanh(x);\n\ +}\n\ +\n\ +float eltwise_unary_acosh(float x, float alpha, float beta)\n\ +{\n\ + return acosh(x);\n\ +}\n\ +\n\ +float eltwise_unary_inverse_sigmoid(float x, float alpha, float beta)\n\ +{\n\ + float x1, x2;\n\ + x = clamp(x, 0, 1);\n\ + x1 = x > alpha ? x : alpha;\n\ + x2 = 1 - x;\n\ + x2 = x2 > alpha ? 
x2 : alpha;\n\ + return log(x1 / x2);\n\ +}\n\ +\n\ +\n\ #define ELTWISE_UNARY_F32_2D(func_name) \\\n\ __kernel void func_name##_F32toF32_2D \\\n\ ( \\\n\ @@ -45785,6 +48383,10 @@ ELTWISE_UNARY_F32_2D(celu)\n\ ELTWISE_UNARY_F32_2D(rcp)\n\ ELTWISE_UNARY_F32_2D(sign)\n\ ELTWISE_UNARY_F32_2D(softsign)\n\ +ELTWISE_UNARY_F32_2D(atan)\n\ +ELTWISE_UNARY_F32_2D(atanh)\n\ +ELTWISE_UNARY_F32_2D(acosh)\n\ +ELTWISE_UNARY_F32_2D(inverse_sigmoid)\n\ \n\ #define ELTWISE_UNARY_U8_2D(func_name) \\\n\ __kernel void func_name##_U8toU8_2D \\\n\ @@ -45824,6 +48426,52 @@ ELTWISE_UNARY_U8_2D(celu)\n\ ELTWISE_UNARY_U8_2D(rcp)\n\ ELTWISE_UNARY_U8_2D(sign)\n\ ELTWISE_UNARY_U8_2D(softsign)\n\ +ELTWISE_UNARY_U8_2D(atan)\n\ +ELTWISE_UNARY_U8_2D(atanh)\n\ +ELTWISE_UNARY_U8_2D(acosh)\n\ +ELTWISE_UNARY_U8_2D(inverse_sigmoid)\n\ +\n\ +#define ELTWISE_UNARY_U8toF32_2D(func_name) \\\n\ +__kernel void func_name##_U8toF32_2D \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + float inputScale, \\\n\ + float inputTail, \\\n\ + float outputScale, \\\n\ + float outputZP, \\\n\ + float alpha, \\\n\ + float beta \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + uint4 src = read_imageui(input, coord); \\\n\ + float4 dst = convert_float4(src) * inputScale - inputTail; \\\n\ + \\\n\ + dst.x = eltwise_unary_##func_name(dst.x, alpha, beta); \\\n\ + \\\n\ + write_imagef(output, coord, dst); \\\n\ +}\n\ +ELTWISE_UNARY_U8toF32_2D(sin)\n\ +ELTWISE_UNARY_U8toF32_2D(cos)\n\ +ELTWISE_UNARY_U8toF32_2D(exp)\n\ +ELTWISE_UNARY_U8toF32_2D(log)\n\ +ELTWISE_UNARY_U8toF32_2D(neg)\n\ +ELTWISE_UNARY_U8toF32_2D(mish)\n\ +ELTWISE_UNARY_U8toF32_2D(hard_sigmoid)\n\ +ELTWISE_UNARY_U8toF32_2D(round)\n\ +ELTWISE_UNARY_U8toF32_2D(gelu)\n\ +ELTWISE_UNARY_U8toF32_2D(hard_gelu)\n\ +ELTWISE_UNARY_U8toF32_2D(selu)\n\ +ELTWISE_UNARY_U8toF32_2D(celu)\n\ +ELTWISE_UNARY_U8toF32_2D(rcp)\n\ +ELTWISE_UNARY_U8toF32_2D(sign)\n\ +ELTWISE_UNARY_U8toF32_2D(softsign)\n\ +ELTWISE_UNARY_U8toF32_2D(atan)\n\ +ELTWISE_UNARY_U8toF32_2D(atanh)\n\ +ELTWISE_UNARY_U8toF32_2D(acosh)\n\ +ELTWISE_UNARY_U8toF32_2D(inverse_sigmoid)\n\ \n\ __kernel void neg_I32toI32_2D\n\ (\n\ @@ -45999,6 +48647,30 @@ float eltwise_unary_softsign(float val, float alpha, float rcp_alpha)\n\ return val / (1.0f + fabs(val));\n\ }\n\ \n\ +float eltwise_unary_atan(float x, float alpha, float beta)\n\ +{\n\ + return atan(x);\n\ +}\n\ +\n\ +float eltwise_unary_atanh(float x, float alpha, float beta)\n\ +{\n\ + return atanh(x);\n\ +}\n\ +float eltwise_unary_acosh(float x, float alpha, float beta)\n\ +{\n\ + return acosh(x);\n\ +}\n\ +\n\ +float eltwise_unary_inverse_sigmoid(float x, float alpha, float beta)\n\ +{\n\ + float x1, x2;\n\ + x = clamp(x, 0, 1);\n\ + x1 = x > alpha ? x : alpha;\n\ + x2 = 1 - x;\n\ + x2 = x2 > alpha ? 
x2 : alpha;\n\ + return log(x1 / x2);\n\ +}\n\ +\n\ #define ELTWISE_UNARY_F32(func_name) \\\n\ __kernel void func_name##_F32toF32 \\\n\ ( \\\n\ @@ -46036,6 +48708,10 @@ ELTWISE_UNARY_F32(celu)\n\ ELTWISE_UNARY_F32(rcp)\n\ ELTWISE_UNARY_F32(sign)\n\ ELTWISE_UNARY_F32(softsign)\n\ +ELTWISE_UNARY_F32(atan)\n\ +ELTWISE_UNARY_F32(atanh)\n\ +ELTWISE_UNARY_F32(acosh)\n\ +ELTWISE_UNARY_F32(inverse_sigmoid)\n\ \n\ #define ELTWISE_UNARY_U8(func_name) \\\n\ __kernel void func_name##_U8toU8 \\\n\ @@ -46075,6 +48751,52 @@ ELTWISE_UNARY_U8(celu)\n\ ELTWISE_UNARY_U8(rcp)\n\ ELTWISE_UNARY_U8(sign)\n\ ELTWISE_UNARY_U8(softsign)\n\ +ELTWISE_UNARY_U8(atan)\n\ +ELTWISE_UNARY_U8(atanh)\n\ +ELTWISE_UNARY_U8(acosh)\n\ +ELTWISE_UNARY_U8(inverse_sigmoid)\n\ +\n\ +#define ELTWISE_UNARY_U8toF32(func_name) \\\n\ +__kernel void func_name##_U8toF32 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float inputScale, \\\n\ + float inputTail, \\\n\ + float outputScale, \\\n\ + float outputZP, \\\n\ + float alpha, \\\n\ + float beta \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + uint4 src = read_imageui(input, coord); \\\n\ + float4 dst = convert_float4(src) * inputScale - inputTail; \\\n\ + \\\n\ + dst.x = eltwise_unary_##func_name(dst.x, alpha, beta); \\\n\ + \\\n\ + write_imagef(output, coord, dst); \\\n\ +}\n\ +ELTWISE_UNARY_U8toF32(sin)\n\ +ELTWISE_UNARY_U8toF32(cos)\n\ +ELTWISE_UNARY_U8toF32(exp)\n\ +ELTWISE_UNARY_U8toF32(log)\n\ +ELTWISE_UNARY_U8toF32(neg)\n\ +ELTWISE_UNARY_U8toF32(mish)\n\ +ELTWISE_UNARY_U8toF32(hard_sigmoid)\n\ +ELTWISE_UNARY_U8toF32(round)\n\ +ELTWISE_UNARY_U8toF32(gelu)\n\ +ELTWISE_UNARY_U8toF32(hard_gelu)\n\ +ELTWISE_UNARY_U8toF32(selu)\n\ +ELTWISE_UNARY_U8toF32(celu)\n\ +ELTWISE_UNARY_U8toF32(rcp)\n\ +ELTWISE_UNARY_U8toF32(sign)\n\ +ELTWISE_UNARY_U8toF32(softsign)\n\ +ELTWISE_UNARY_U8toF32(atan)\n\ +ELTWISE_UNARY_U8toF32(atanh)\n\ +ELTWISE_UNARY_U8toF32(acosh)\n\ +ELTWISE_UNARY_U8toF32(inverse_sigmoid)\n\ \n\ __kernel void neg_I32toI32\n\ (\n\ @@ -46566,6 +49288,119 @@ __kernel void gather_F32toF32(\n\ }\n\ "; /* end of gather_cl*/ +static const char gather_array_cl[] = "__kernel void gather_array_U8toU8(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num,\n\ + int indices_num,\n\ + int batch\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 1);\n\ + Image img2 = create_image_from_image2d(output, 1);\n\ + __global uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ + uchar data = input_ptr[0];\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ + __global uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + output_ptr[0] = data;\n\ +}\n\ +\n\ +__kernel void gather_array_F16toF16(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num,\n\ + int indices_num,\n\ + int batch\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = 
get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + __global short* input_ptr = (__global short*)get_image_ptr_from_coord(img1, coord_in.zw);\n\ + short data = input_ptr[0];\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ + __global short* output_ptr = (__global short*)get_image_ptr_from_coord(img2, coord);\n\ + output_ptr[0] = data;\n\ +}\n\ +\n\ +__kernel void gather_array_I32toI32(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num,\n\ + int indices_num,\n\ + int batch\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 4);\n\ + Image img2 = create_image_from_image2d(output, 4);\n\ + __global int* input_ptr = (__global int*)get_image_ptr_from_coord(img1, coord_in.zw);\n\ + int data = input_ptr[0];\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ + __global int* output_ptr = (__global int*)get_image_ptr_from_coord(img2, coord);\n\ + output_ptr[0] = data;\n\ +}\n\ +\n\ +__kernel void gather_array_F32toF32(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num,\n\ + int indices_num,\n\ + int batch\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 4);\n\ + Image img2 = create_image_from_image2d(output, 4);\n\ + __global float* input_ptr = (__global float*)get_image_ptr_from_coord(img1, coord_in.zw);\n\ + float data = input_ptr[0];\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ + __global float* output_ptr = (__global float*)get_image_ptr_from_coord(img2, coord);\n\ + output_ptr[0] = data;\n\ +}\n\ +"; /* end of gather_array_cl*/ + static const char gather_batch_cl[] = "__kernel void gather_batch_U8toU8(\n\ __read_only image2d_array_t input0,\n\ __read_only image2d_t input1,\n\ @@ -47082,6 +49917,223 @@ __kernel void gather_nd_F32toF32_3D(\n\ }\n\ "; /* end of gather_nd_3d_cl*/ +static const char gather_nd_batch_cl[] = "__kernel void gather_nd_batch_U8toU8_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // batch_num\n\ +\n\ + int4 coord = (int4)(gidx, gidy, 0, 0);\n\ + int4 indice = read_imagei(input1, coord.wy);\n\ + coord.z = indice.x * block_size + gidx;\n\ +\n\ + uint4 data = read_imageui(input0, coord.zy);\n\ + write_imageui(output, coord.xy, data);\n\ +}\n\ +\n\ +__kernel void gather_nd_batch_F16toF16_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t 
input1,\n\
+ __write_only image2d_t output,\n\
+ int block_size,\n\
+ int coord_dim\n\
+ )\n\
+{\n\
+ int gidx = get_global_id(0); // block_size\n\
+ int gidy = get_global_id(1); // batch_num\n\
+\n\
+ int4 coord = (int4)(gidx, gidy, 0, 0);\n\
+ int4 indice = read_imagei(input1, coord.wy);\n\
+ coord.z = indice.x * block_size + gidx;\n\
+\n\
+ float4 data = read_imagef(input0, coord.zy);\n\
+ write_imagef(output, coord.xy, data);\n\
+}\n\
+\n\
+__kernel void gather_nd_batch_I8toI8_1D(\n\
+ __read_only image2d_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_t output,\n\
+ int block_size,\n\
+ int coord_dim\n\
+ )\n\
+{\n\
+ int gidx = get_global_id(0); // block_size\n\
+ int gidy = get_global_id(1); // batch_num\n\
+\n\
+ int4 coord = (int4)(gidx, gidy, 0, 0);\n\
+ int4 indice = read_imagei(input1, coord.wy);\n\
+ coord.z = indice.x * block_size + gidx;\n\
+\n\
+ int4 data = read_imagei(input0, coord.zy);\n\
+ write_imagei(output, coord.xy, data);\n\
+}\n\
+\n\
+//2D\n\
+__kernel void gather_nd_batch_U8toU8_2D(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_t output,\n\
+ int block_size,\n\
+ int coord_dim\n\
+ )\n\
+{\n\
+ int gidx = get_global_id(0); // block_size\n\
+ int gidy = get_global_id(1); // batch_num\n\
+\n\
+ int4 coord = (int4)(0, gidy, gidx, 1);\n\
+ int4 indice = read_imagei(input1, coord.xy);\n\
+ int4 indice1 = read_imagei(input1, coord.wy);\n\
+ indice.x = indice.x * block_size + gidx;\n\
+ indice.y = indice1.x;\n\
+ indice.zw = coord.yx;\n\
+\n\
+ uint4 data = read_imageui(input0, indice);\n\
+ write_imageui(output, coord.zy, data);\n\
+}\n\
+\n\
+__kernel void gather_nd_batch_F16toF16_2D(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_t output,\n\
+ int block_size,\n\
+ int coord_dim\n\
+ )\n\
+{\n\
+ int gidx = get_global_id(0); // block_size\n\
+ int gidy = get_global_id(1); // batch_num\n\
+\n\
+ int4 coord = (int4)(0, gidy, gidx, 1);\n\
+ int4 indice = read_imagei(input1, coord.xy);\n\
+ int4 indice1 = read_imagei(input1, coord.wy);\n\
+ indice.x = indice.x * block_size + gidx;\n\
+ indice.y = indice1.x;\n\
+ indice.zw = coord.yx;\n\
+\n\
+ float4 data = read_imagef(input0, indice);\n\
+ write_imagef(output, coord.zy, data);\n\
+}\n\
+\n\
+__kernel void gather_nd_batch_I8toI8_2D(\n\
+ __read_only image2d_array_t input0,\n\
+ __read_only image2d_t input1,\n\
+ __write_only image2d_t output,\n\
+ int block_size,\n\
+ int coord_dim\n\
+ )\n\
+{\n\
+ int gidx = get_global_id(0); // block_size\n\
+ int gidy = get_global_id(1); // batch_num\n\
+\n\
+ int4 coord = (int4)(0, gidy, gidx, 1);\n\
+ int4 indice = read_imagei(input1, coord.xy);\n\
+ int4 indice1 = read_imagei(input1, coord.wy);\n\
+ indice.x = indice.x * block_size + gidx;\n\
+ indice.y = indice1.x;\n\
+ indice.zw = coord.yx;\n\
+\n\
+ int4 data = read_imagei(input0, indice);\n\
+ write_imagei(output, coord.zy, data);\n\
+}\n\
+"; /* end of gather_nd_batch_cl*/
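+
+/*
+ * Editorial note, a hedged sketch rather than driver code: the *_2D gather_nd_batch
+ * kernels above read an index pair per batch row from input1 (component 0 at x == 0,
+ * component 1 at x == 1) and assemble an image2d_array coordinate
+ * (indice.x * block_size + gidx, indice1.x, batch, 0). Assuming row-major host arrays
+ * shaped in[batch][rows][cols] and idx[batch][2], the equivalent scalar reference is:
+ *
+ *     int ix = idx[b][0] * block_size + gidx;   // indice.x * block_size + gidx
+ *     int iy = idx[b][1];                       // indice1.x
+ *     out[b][gidx] = in[b][iy][ix];             // written back at (gidx, b)
+ *
+ * The names in, idx, out and the array shapes are illustrative assumptions only.
+ */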
+
+static const char globallppool_cl[] = "\n\
+#define GLOBALLPPOOL_PROCESS(src_type, dst_type, readimage_type, conv_mode, writeimage_type) \\\n\
+ int gidx = get_global_id(0); \\\n\
+ int4 coord_out = (int4)(0, 0, gidx, 0); \\\n\
+ int4 coord_in = coord_out; \\\n\
+ int h, w; \\\n\
+ float sum_of_pow = 0; \\\n\
+ dst_type out_data = (dst_type)(0); \\\n\
+ src_type in_data; \\\n\
+ float in_f32, out_f32; \\\n\
+ for (h = 0; h < height; h++) \\\n\
+ { \\\n\
+ for (w = 0; w < width; w++) \\\n\
+ { \\\n\
+ coord_in.xy = (int2)(w, h); \\\n\
+ in_data = readimage_type(input, coord_in).x; \\\n\
+ in_f32 = convert_float(in_data) * inputScale + inputTail; \\\n\
+ sum_of_pow += pow(fabs(in_f32), p); \\\n\
+ } \\\n\
+ } \\\n\
+ out_f32 = pow(sum_of_pow, 1.0f / p) * outputScale + outputTail; \\\n\
+ out_data.x = conv_mode(out_f32); \\\n\
+ writeimage_type(output, coord_out, out_data); \\\n\
+\n\
+#define TENSOR_GLOBALLPPOOL(src_name, dst_name, src_type, dst_type, readimage_type, conv_mode, writeimage_type) \\\n\
+__kernel void globallppool_##src_name##to##dst_name ( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ int p, \\\n\
+ int width, \\\n\
+ int height, \\\n\
+ float inputScale, \\\n\
+ float inputTail, \\\n\
+ float outputScale, \\\n\
+ float outputTail) \\\n\
+{ \\\n\
+ GLOBALLPPOOL_PROCESS(src_type, dst_type, readimage_type, conv_mode, writeimage_type); \\\n\
+}\n\
+\n\
+TENSOR_GLOBALLPPOOL(F32, F32, float, float4, read_imagef, convert_float, write_imagef)\n\
+TENSOR_GLOBALLPPOOL(F32, U32, float, uint4, read_imagef, convert_uint, write_imageui)\n\
+TENSOR_GLOBALLPPOOL(F32, I32, float, int4, read_imagef, convert_int, write_imagei)\n\
+\n\
+TENSOR_GLOBALLPPOOL(U32, U32, uint, uint4, read_imageui, convert_uint, write_imageui)\n\
+TENSOR_GLOBALLPPOOL(U32, F32, uint, float4, read_imageui, convert_float, write_imagef)\n\
+TENSOR_GLOBALLPPOOL(U32, I32, uint, int4, read_imageui, convert_int, write_imagei)\n\
+\n\
+TENSOR_GLOBALLPPOOL(I32, I32, int, int4, read_imagei, convert_int, write_imagei)\n\
+TENSOR_GLOBALLPPOOL(I32, F32, int, float4, read_imagei, convert_float, write_imagef)\n\
+TENSOR_GLOBALLPPOOL(I32, U32, int, uint4, read_imagei, convert_uint, write_imageui)\n\
+\n\
+__kernel void globallppool_BF16toBF16(\n\
+ __read_only image2d_array_t input,\n\
+ __write_only image2d_array_t output,\n\
+ int p,\n\
+ int width,\n\
+ int height,\n\
+ float inputScale,\n\
+ float inputTail,\n\
+ float outputScale,\n\
+ float outputTail)\n\
+{\n\
+ int gidx = get_global_id(0);\n\
+ int4 coord_out = (int4)(0, 0, gidx, 0);\n\
+ int4 coord_in = coord_out;\n\
+ int h, w;\n\
+ float sum_of_pow = 0;\n\
+ float out_data_f32 = 0;\n\
+ uint4 dst = (uint4)(0);\n\
+ float4 data_f32 = (float4)(0);\n\
+ uint4 data;\n\
+\n\
+ for (h = 0; h < height; h++)\n\
+ {\n\
+ for (w = 0; w < width; w++)\n\
+ {\n\
+ coord_in.xy = (int2)(w, h);\n\
+ data = read_imageui(input, coord_in);\n\
+ data = data << 16;\n\
+ _viv_asm(COPY, data_f32, data, 16);\n\
+ sum_of_pow += pow(fabs(data_f32.x), p);\n\
+ }\n\
+ }\n\
+ out_data_f32 = pow(sum_of_pow, 1.0f / p);\n\
+ _viv_asm(COPY, dst, out_data_f32, 4);\n\
+ dst.x = dst.x >> 16;\n\
+ write_imageui(output, coord_out, dst);\n\
+}\n\
+\n\
+"; /* end of globallppool_cl*/
+
 static const char group_normalization_f32_cl[] = "__kernel void group_norm_sumsqr_F32(\n\
 __read_only image2d_array_t input,\n\
 __write_only image2d_t output,\n\
@@ -48611,7 +51663,6 @@ static const char instance_normalization_i32_cl[] = "__kernel void instance_norm
 int4 coord = (int4)(gidx, 0, gidz, 0);\n\
 int4 data;\n\
 float2 sum_x_x2 = 0;\n\
- int2 _sum_x_x2 = 0;\n\
\n\
 __local float lcl_sum[16];\n\
 __local float lcl_sqr[16];\n\
@@ -48622,10 +51673,10 @@
 {\n\
 data = read_imagei(input, coord);\n\
 coord.y++;\n\
- _sum_x_x2.x = _sum_x_x2.x + data.x;\n\
- _sum_x_x2.y = _sum_x_x2.y + data.x * data.x;\n\
+ float in = convert_float(data.x);\n\
+ sum_x_x2.x = sum_x_x2.x + in;\n\
+ sum_x_x2.y = 
sum_x_x2.y + in * in;\n\ }\n\ - sum_x_x2 = convert_float2(_sum_x_x2);\n\ }\n\ lcl_sum[lidx] = sum_x_x2.x;\n\ lcl_sqr[lidx] = sum_x_x2.y;\n\ @@ -48671,7 +51722,6 @@ __kernel void instance_norm_sums_I32_2D(\n\ int2 coord = (int2)(gidx, gidy);\n\ int4 data;\n\ float2 sum_x_x2 = 0;\n\ - int2 _sum_x_x2 = 0;\n\ \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ @@ -48683,10 +51733,10 @@ __kernel void instance_norm_sums_I32_2D(\n\ {\n\ data = read_imagei(input, coord);\n\ coord.y++;\n\ - _sum_x_x2.x = _sum_x_x2.x + data.x;\n\ - _sum_x_x2.y = _sum_x_x2.y + data.x * data.x;\n\ + float in = convert_float(data.x);\n\ + sum_x_x2.x = sum_x_x2.x + in;\n\ + sum_x_x2.y = sum_x_x2.y + in * in;\n\ }\n\ - sum_x_x2 = convert_float2(_sum_x_x2);\n\ }\n\ lcl_sum[lidx] = sum_x_x2.x;\n\ lcl_sqr[lidx] = sum_x_x2.y;\n\ @@ -49257,6 +52307,272 @@ __kernel void instance_norm_U8toF16_2D(\n\ }\n\ "; /* end of instance_normalization_u8_cl*/ +static const char l1norm_cl[] = "#define eps 1e-12\n\ +\n\ +#define TENSOR_L1NORM_axis0(src_name, dst_name, src_type, dst_type, \\\n\ + readimage_type, conv_mode, writeimage_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l1norm_##src_name##to##dst_name##_axis0( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float inputZp, \\\n\ + float outputscale, \\\n\ + float outputtail, \\\n\ + int axis, \\\n\ + int axis_size) \\\n\ +{ \\\n\ + int lidx = get_local_id(0); \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + float4 src_f, dst_f; \\\n\ + float sum = 0; \\\n\ + float rcp_sum = 0; \\\n\ + int4 coord= (int4)(gidx, gidy, gidz, 0); \\\n\ + __local float lcl_sum[16]; \\\n\ + for (; coord.x < axis_size; coord.x += 16) \\\n\ + { \\\n\ + src = readimage_type(input, coord); \\\n\ + src_f = convert_float4(src) - inputZp; \\\n\ + sum += fabs(src_f.x); \\\n\ + } \\\n\ + lcl_sum[lidx] = sum; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + float4 data0; \\\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \\\n\ + rcp_sum = 1 / (dot(data0, one) + eps); \\\n\ + for (coord.x = gidx; coord.x < axis_size; coord.x += 16) \\\n\ + { \\\n\ + src = readimage_type(input, coord); \\\n\ + src_f = convert_float4(src) - inputZp; \\\n\ + dst_f = src_f * rcp_sum; \\\n\ + dst = conv_mode(dst_f * outputscale + outputtail); \\\n\ + writeimage_type(output, coord, dst); \\\n\ + } \\\n\ +}\n\ +\n\ +TENSOR_L1NORM_axis0(F32,F32,float4,float4,read_imagef, convert_float4,write_imagef)\n\ +TENSOR_L1NORM_axis0(U32,U32,uint4, uint4, read_imageui,convert_uint4, write_imageui)\n\ +TENSOR_L1NORM_axis0(I32,I32,int4, int4, read_imagei, convert_int4, write_imagei)\n\ +TENSOR_L1NORM_axis0(F32,U32,float4,uint4, read_imagef, convert_uint4, write_imageui)\n\ +TENSOR_L1NORM_axis0(F32,I32,float4,int4, read_imagef, convert_int4, write_imagei)\n\ +TENSOR_L1NORM_axis0(U32,F32,uint4, float4,read_imageui,convert_float4,write_imagef)\n\ +TENSOR_L1NORM_axis0(I32,F32,int4, float4,read_imagei, convert_float4,write_imagef)\n\ +\n\ +#define TENSOR_L1NORM_axis1(src_name, dst_name, src_type, dst_type, \\\n\ + readimage_type, conv_mode, writeimage_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l1norm_##src_name##to##dst_name##_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t 
output, \\\n\ + float inputZp, \\\n\ + float outputscale, \\\n\ + float outputtail, \\\n\ + int axis, \\\n\ + int axis_size) \\\n\ +{ \\\n\ + int lidy = get_local_id(1); \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + float4 src_f, dst_f; \\\n\ + float sum = 0; \\\n\ + float rcp_sum = 0; \\\n\ + int4 coord= (int4)(gidx, gidy, gidz, 0); \\\n\ + __local float lcl_sum[16]; \\\n\ + for (; coord.y < axis_size; coord.y += 16) \\\n\ + { \\\n\ + src = readimage_type(input, coord); \\\n\ + src_f = convert_float4(src) - inputZp; \\\n\ + sum += fabs(src_f.x); \\\n\ + } \\\n\ + lcl_sum[lidy] = sum; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + float4 data0; \\\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \\\n\ + rcp_sum = 1 / (dot(data0, one) + eps); \\\n\ + for (coord.y = gidy; coord.y < axis_size; coord.y += 16) \\\n\ + { \\\n\ + src = readimage_type(input, coord); \\\n\ + src_f = convert_float4(src) - inputZp; \\\n\ + dst_f = src_f * rcp_sum; \\\n\ + dst = conv_mode(dst_f * outputscale + outputtail); \\\n\ + writeimage_type(output, coord, dst); \\\n\ + } \\\n\ +}\n\ +\n\ +TENSOR_L1NORM_axis1(F32,F32,float4,float4,read_imagef, convert_float4,write_imagef)\n\ +TENSOR_L1NORM_axis1(U32,U32,uint4, uint4, read_imageui,convert_uint4, write_imageui)\n\ +TENSOR_L1NORM_axis1(I32,I32,int4, int4, read_imagei, convert_int4, write_imagei)\n\ +TENSOR_L1NORM_axis1(F32,U32,float4,uint4, read_imagef, convert_uint4, write_imageui)\n\ +TENSOR_L1NORM_axis1(F32,I32,float4,int4, read_imagef, convert_int4, write_imagei)\n\ +TENSOR_L1NORM_axis1(U32,F32,uint4, float4,read_imageui,convert_float4,write_imagef)\n\ +TENSOR_L1NORM_axis1(I32,F32,int4, float4,read_imagei, convert_float4,write_imagef)\n\ +\n\ +#define TENSOR_L1NORM_axis2(src_name, dst_name, src_type, dst_type, \\\n\ + readimage_type, conv_mode, writeimage_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(1, 1, 16))) void l1norm_##src_name##to##dst_name##_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float inputZp, \\\n\ + float outputscale, \\\n\ + float outputtail, \\\n\ + int axis, \\\n\ + int axis_size) \\\n\ +{ \\\n\ + int lidz = get_local_id(2); \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + float4 src_f, dst_f; \\\n\ + float sum = 0; \\\n\ + float rcp_sum = 0; \\\n\ + int4 coord= (int4)(gidx, gidy, gidz, 0); \\\n\ + __local float lcl_sum[16]; \\\n\ + for (; coord.z < axis_size; coord.z += 16) \\\n\ + { \\\n\ + src = readimage_type(input, coord); \\\n\ + src_f = convert_float4(src) - inputZp; \\\n\ + sum += fabs(src_f.x); \\\n\ + } \\\n\ + lcl_sum[lidz] = sum; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + float4 data0; \\\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \\\n\ + rcp_sum = 1 / (dot(data0, one) + eps); \\\n\ + for (coord.z = gidz; coord.z < axis_size; coord.z += 16) \\\n\ + { \\\n\ + src = readimage_type(input, coord); \\\n\ + src_f = convert_float4(src) - inputZp; \\\n\ + dst_f = src_f * rcp_sum; \\\n\ + dst = conv_mode(dst_f * outputscale + outputtail); \\\n\ + writeimage_type(output, coord, dst); \\\n\ + } \\\n\ +}\n\ 
+\n\ +TENSOR_L1NORM_axis2(F32,F32,float4,float4,read_imagef, convert_float4,write_imagef)\n\ +TENSOR_L1NORM_axis2(U32,U32,uint4, uint4, read_imageui,convert_uint4, write_imageui)\n\ +TENSOR_L1NORM_axis2(I32,I32,int4, int4, read_imagei, convert_int4, write_imagei)\n\ +TENSOR_L1NORM_axis2(F32,U32,float4,uint4, read_imagef, convert_uint4, write_imageui)\n\ +TENSOR_L1NORM_axis2(F32,I32,float4,int4, read_imagef, convert_int4, write_imagei)\n\ +TENSOR_L1NORM_axis2(U32,F32,uint4, float4,read_imageui,convert_float4,write_imagef)\n\ +TENSOR_L1NORM_axis2(I32,F32,int4, float4,read_imagei, convert_float4,write_imagef)\n\ +\n\ +#define TENSOR_L1NORM_2D_axis0(src_name, dst_name, src_type, dst_type,\\\n\ + readimage_type, conv_mode, writeimage_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l1norm_##src_name##to##dst_name##_2D_axis0( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + float inputZp, \\\n\ + float outputscale, \\\n\ + float outputtail, \\\n\ + int axis, \\\n\ + int axis_size) \\\n\ +{ \\\n\ + int lidx = get_local_id(0); \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + float4 src_f, dst_f; \\\n\ + float sum = 0; \\\n\ + float rcp_sum = 0; \\\n\ + int2 coord = (int2)(gidx, gidy); \\\n\ + __local float lcl_sum[16]; \\\n\ + for (; coord.x < axis_size; coord.x += 16) \\\n\ + { \\\n\ + src = readimage_type(input, coord); \\\n\ + src_f = convert_float4(src) - inputZp; \\\n\ + sum += fabs(src_f.x); \\\n\ + } \\\n\ + lcl_sum[lidx] = sum; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + float4 data0; \\\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \\\n\ + rcp_sum = 1 / (dot(data0, one) + eps); \\\n\ + for (coord.x = gidx; coord.x < axis_size; coord.x += 16) \\\n\ + { \\\n\ + src = readimage_type(input, coord); \\\n\ + src_f = convert_float4(src) - inputZp; \\\n\ + dst_f = src_f * rcp_sum; \\\n\ + dst = conv_mode(dst_f * outputscale + outputtail); \\\n\ + writeimage_type(output, coord, dst); \\\n\ + } \\\n\ +}\n\ +\n\ +TENSOR_L1NORM_2D_axis0(F32,F32,float4,float4,read_imagef, convert_float4,write_imagef)\n\ +TENSOR_L1NORM_2D_axis0(U32,U32,uint4, uint4, read_imageui,convert_uint4, write_imageui)\n\ +TENSOR_L1NORM_2D_axis0(I32,I32,int4, int4, read_imagei, convert_int4, write_imagei)\n\ +TENSOR_L1NORM_2D_axis0(F32,U32,float4,uint4, read_imagef, convert_uint4, write_imageui)\n\ +TENSOR_L1NORM_2D_axis0(F32,I32,float4,int4, read_imagef, convert_int4, write_imagei)\n\ +TENSOR_L1NORM_2D_axis0(U32,F32,uint4, float4,read_imageui,convert_float4,write_imagef)\n\ +TENSOR_L1NORM_2D_axis0(I32,F32,int4, float4,read_imagei, convert_float4,write_imagef)\n\ +\n\ +\n\ +#define TENSOR_L1NORM_2D_axis1(src_name, dst_name, src_type, dst_type,\\\n\ + readimage_type, conv_mode, writeimage_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l1norm_##src_name##to##dst_name##_2D_axis1( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + float inputZp, \\\n\ + float outputscale, \\\n\ + float outputtail, \\\n\ + int axis, \\\n\ + int axis_size) \\\n\ +{ \\\n\ + int lidy = get_local_id(1); \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + float4 src_f, dst_f; \\\n\ + float sum = 0; \\\n\ + float rcp_sum = 0; \\\n\ + int2 coord = (int2)(gidx, gidy); 
\\\n\ + __local float lcl_sum[16]; \\\n\ + for (; coord.y < axis_size; coord.y += 16) \\\n\ + { \\\n\ + src = readimage_type(input, coord); \\\n\ + src_f = convert_float4(src) - inputZp; \\\n\ + sum += fabs(src_f.x); \\\n\ + } \\\n\ + lcl_sum[lidy] = sum; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + float4 data0; \\\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \\\n\ + rcp_sum = 1 / (dot(data0, one) + eps); \\\n\ + for (coord.y = gidy; coord.y < axis_size; coord.y += 16) \\\n\ + { \\\n\ + src = readimage_type(input, coord); \\\n\ + src_f = convert_float4(src) - inputZp; \\\n\ + dst_f = src_f * rcp_sum; \\\n\ + dst = conv_mode(dst_f * outputscale + outputtail); \\\n\ + writeimage_type(output, coord, dst); \\\n\ + } \\\n\ +}\n\ +\n\ +TENSOR_L1NORM_2D_axis1(F32,F32,float4,float4,read_imagef, convert_float4,write_imagef)\n\ +TENSOR_L1NORM_2D_axis1(U32,U32,uint4, uint4, read_imageui,convert_uint4, write_imageui)\n\ +TENSOR_L1NORM_2D_axis1(I32,I32,int4, int4, read_imagei, convert_int4, write_imagei)\n\ +TENSOR_L1NORM_2D_axis1(F32,U32,float4,uint4, read_imagef, convert_uint4, write_imageui)\n\ +TENSOR_L1NORM_2D_axis1(F32,I32,float4,int4, read_imagef, convert_int4, write_imagei)\n\ +TENSOR_L1NORM_2D_axis1(U32,F32,uint4, float4,read_imageui,convert_float4,write_imagef)\n\ +TENSOR_L1NORM_2D_axis1(I32,F32,int4, float4,read_imagei, convert_float4,write_imagef)"; /* end of l1norm_cl*/ + static const char l2normalizescale_axis0_cl[] = "\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_axis0_F32_F32toF32_2D(\n\ __read_only image2d_t input,\n\ @@ -49293,6 +52609,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_a rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum));\n\ for (coord.x = gidx; coord.x < axis_size; coord.x += 16)\n\ {\n\ + coord_scale.x = coord.x;\n\ src = read_imagef(input, coord);\n\ scale_value = read_imagef(scale, coord_scale);\n\ result = src * rsqrt_sum * scale_value;\n\ @@ -49335,6 +52652,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_a rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum));\n\ for (coord.x = gidx; coord.x < axis_size; coord.x += 16)\n\ {\n\ + coord_scale.x = coord.x;\n\ src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail;\n\ scale_value = read_imagef(scale, coord_scale);\n\ result = src * rsqrt_sum * scale_value;\n\ @@ -49378,6 +52696,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void l2normalizescale_a rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum));\n\ for (coord.x = gidx; coord.x < axis_size; coord.x += 16)\n\ {\n\ + coord_scale.x = coord.x;\n\ src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail;\n\ scale_value = read_imagef(scale, coord_scale);\n\ result = src * rsqrt_sum * scale_value;\n\ @@ -49423,6 +52742,7 @@ __kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_a rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum));\n\ for (coord.y = gidy; coord.y < axis_size; coord.y += 16)\n\ {\n\ + coord_scale.x = coord.y;\n\ src = read_imagef(input, coord);\n\ scale_value = read_imagef(scale, coord_scale);\n\ result = src * rsqrt_sum * scale_value;\n\ @@ -49465,6 +52785,7 @@ __kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_a rsqrt_sum = (sum == 0 ? 
rsEps : rsqrt(sum));\n\ for (coord.y = gidy; coord.y < axis_size; coord.y += 16)\n\ {\n\ + coord_scale.x = coord.y;\n\ src = convert_float4(read_imageui(input, coord)) * inputScale + inputTail;\n\ scale_value = read_imagef(scale, coord_scale);\n\ result = src * rsqrt_sum * scale_value;\n\ @@ -49508,6 +52829,7 @@ __kernel __attribute__((reqd_work_group_size(1, 16, 1))) void l2normalizescale_a rsqrt_sum = (sum == 0 ? rsEps : rsqrt(sum));\n\ for (coord.y = gidy; coord.y < axis_size; coord.y += 16)\n\ {\n\ + coord_scale.x = coord.y;\n\ src = convert_float4(read_imagei(input, coord)) * inputScale + inputTail;\n\ scale_value = read_imagef(scale, coord_scale);\n\ result = src * rsqrt_sum * scale_value;\n\ @@ -53534,6 +56856,193 @@ __kernel void gemm_transb_F32I8toF32_3D(\n\ coord_a.z = get_global_id(2);\n\ write_imagef(output, coord_a, sum);\n\ }\n\ +\n\ +#define GEMM_2D(name, dst_type, read_image_type, convert_type, write_image_type) \\\n\ +__kernel void gemm_##name##_2D( \\\n\ + __read_only image2d_t inputA, \\\n\ + __read_only image2d_t inputB, \\\n\ + __write_only image2d_t output, \\\n\ + int M, \\\n\ + int K, \\\n\ + int N, \\\n\ + int ac2zero, \\\n\ + int bc2zero, \\\n\ + float scale_a, \\\n\ + float zp_a, \\\n\ + float scale_b, \\\n\ + float zp_b, \\\n\ + float scale_out, \\\n\ + float zp_out \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + float4 sum = (float4)(0); \\\n\ + dst_type dst; \\\n\ +\\\n\ + for(; coord.z < K;) \\\n\ + { \\\n\ + float4 tempA0; \\\n\ + float4 tempB0; \\\n\ + tempA0 = convert_float4(read_image_type(inputA, coord.zy)); \\\n\ + tempB0 = convert_float4(read_image_type(inputB, coord.xz)); \\\n\ + tempA0.x = (tempA0.x - zp_a) * scale_a; \\\n\ + tempB0.x = (tempB0.x - zp_b) * scale_b; \\\n\ + coord.z++; \\\n\ + sum = sum + tempA0 * tempB0; \\\n\ + } \\\n\ + sum.x = sum.x * scale_out + zp_out; \\\n\ + dst = convert_type(sum); \\\n\ + \\\n\ + write_image_type(output, coord.xy, dst); \\\n\ +}\n\ +GEMM_2D(I8I8toI8,int4,read_imagei,convert_int4,write_imagei);\n\ +GEMM_2D(U8U8toU8,uint4,read_imageui,convert_uint4,write_imageui);\n\ +GEMM_2D(U8U8toF32,float4,read_imageui,convert_float4,write_imagef);\n\ +\n\ +\n\ +#define GEMM_3D(name, dst_type, read_image_type, convert_type, write_image_type) \\\n\ +__kernel void gemm_##name##_3D( \\\n\ + __read_only image2d_array_t inputA, \\\n\ + __read_only image2d_array_t inputB, \\\n\ + __write_only image2d_array_t output, \\\n\ + int M, \\\n\ + int K, \\\n\ + int N, \\\n\ + int ac2zero, \\\n\ + int bc2zero, \\\n\ + float scale_a, \\\n\ + float zp_a, \\\n\ + float scale_b, \\\n\ + float zp_b, \\\n\ + float scale_out, \\\n\ + float zp_out \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0); \\\n\ + float4 sum = (float4)(0); \\\n\ + dst_type dst; \\\n\ + \\\n\ + for(; coord_a.x < K;) \\\n\ + { \\\n\ + float4 tempA0; \\\n\ + float4 tempB0; \\\n\ + \\\n\ + tempA0 = convert_float4(read_image_type(inputA, coord_a)); \\\n\ + tempB0 = convert_float4(read_image_type(inputB, coord_b)); \\\n\ + tempA0.x = (tempA0.x - zp_a) * scale_a; \\\n\ + tempB0.x = (tempB0.x - zp_b) * scale_b; \\\n\ + \\\n\ + coord_a.x++; \\\n\ + coord_b.y++; \\\n\ + \\\n\ + sum = sum + tempA0 * tempB0; \\\n\ + } \\\n\ + sum.x = sum.x * scale_out + zp_out; \\\n\ + dst = convert_type(sum); \\\n\ + \\\n\ + coord_b.y = get_global_id(1); \\\n\ + coord_b.z = get_global_id(2); \\\n\ + write_image_type(output, coord_b, dst); \\\n\ +}\n\ +GEMM_3D(I8I8toI8,int4,read_imagei,convert_int4,write_imagei);\n\ +GEMM_3D(U8U8toU8,uint4,read_imageui,convert_uint4,write_imageui);\n\ +GEMM_3D(U8U8toF32,float4,read_imageui,convert_float4,write_imagef);\n\ +\n\ +#define GEMM_TRANSB_2D(name, dst_type, read_image_type, convert_type, write_image_type) \\\n\ +__kernel void gemm_transb_##name##_2D( \\\n\ + __read_only image2d_t inputA, \\\n\ + __read_only image2d_t inputB, \\\n\ + __write_only image2d_t output, \\\n\ + int M, \\\n\ + int K, \\\n\ + int N, \\\n\ + int ac2zero, \\\n\ + int bc2zero, \\\n\ + float scale_a, \\\n\ + float zp_a, \\\n\ + float scale_b, \\\n\ + float zp_b, \\\n\ + float scale_out, \\\n\ + float zp_out \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + float4 sum = (float4)(0); \\\n\ + dst_type dst; \\\n\ + \\\n\ + for(; coord.z < K;) \\\n\ + { \\\n\ + float4 tempA0; \\\n\ + float4 tempB0; \\\n\ + \\\n\ + tempA0 = convert_float4(read_image_type(inputA, coord.zy)); \\\n\ + tempB0 = convert_float4(read_image_type(inputB, coord.zx)); \\\n\ + tempA0.x = (tempA0.x - zp_a) * scale_a; \\\n\ + tempB0.x = (tempB0.x - zp_b) * scale_b; \\\n\ + coord.z++; \\\n\ + \\\n\ + sum = sum + tempA0 * tempB0; \\\n\ + } \\\n\ + sum.x = sum.x * scale_out + zp_out; \\\n\ + dst = convert_type(sum); \\\n\ + \\\n\ + write_image_type(output, coord.xy, dst); \\\n\ +}\n\ +GEMM_TRANSB_2D(I8I8toI8,int4,read_imagei,convert_int4,write_imagei);\n\ +GEMM_TRANSB_2D(U8U8toU8,uint4,read_imageui,convert_uint4,write_imageui);\n\ +GEMM_TRANSB_2D(U8U8toF32,float4,read_imageui,convert_float4,write_imagef);\n\ +\n\ +\n\ +#define GEMM_TRANSB_3D(name, dst_type, read_image_type, convert_type, write_image_type) \\\n\ +__kernel void gemm_transb_##name##_3D( \\\n\ + __read_only image2d_array_t inputA, \\\n\ + __read_only image2d_array_t inputB, \\\n\ + __write_only image2d_array_t output, \\\n\ + int M, \\\n\ + int K, \\\n\ + int N, \\\n\ + int ac2zero, \\\n\ + int bc2zero, \\\n\ + float scale_a, \\\n\ + float zp_a, \\\n\ + float scale_b, \\\n\ + float zp_b, \\\n\ + float scale_out, \\\n\ + float zp_out \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_a = (int4)(0, get_global_id(1), (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(0, get_global_id(0), (bc2zero ? 
0 : get_global_id(2)), 0); \\\n\ + \\\n\ + float4 sum = (float4)(0); \\\n\ + dst_type dst; \\\n\ + \\\n\ + for(; coord_a.x < K;) \\\n\ + { \\\n\ + float4 tempA0; \\\n\ + float4 tempB0; \\\n\ + \\\n\ + tempA0 = convert_float4(read_image_type(inputA, coord_a)); \\\n\ + tempB0 = convert_float4(read_image_type(inputB, coord_b)); \\\n\ + tempA0.x = (tempA0.x - zp_a) * scale_a; \\\n\ + tempB0.x = (tempB0.x - zp_b) * scale_b; \\\n\ + coord_a.x++; \\\n\ + coord_b.x++; \\\n\ + \\\n\ + sum = sum + tempA0 * tempB0; \\\n\ + } \\\n\ + sum.x = sum.x * scale_out + zp_out; \\\n\ + dst = convert_type(sum); \\\n\ + \\\n\ + coord_a.x = get_global_id(0); \\\n\ + coord_a.z = get_global_id(2); \\\n\ + write_image_type(output, coord_a, dst); \\\n\ +}\n\ +GEMM_TRANSB_3D(I8I8toI8,int4,read_imagei,convert_int4,write_imagei);\n\ +GEMM_TRANSB_3D(U8U8toU8,uint4,read_imageui,convert_uint4,write_imageui);\n\ +GEMM_TRANSB_3D(U8U8toF32,float4,read_imageui,convert_float4,write_imagef);\n\ +\n\ "; /* end of matrixmul_cl*/ static const char matrixmul_transA_cl[] = "__kernel void gemm_transa_F32F32toF32_2D(\n\ @@ -53612,6 +57121,102 @@ __kernel void gemm_transa_F32F32toF32_3D(\n\ coord_b.z = get_global_id(2);\n\ write_imagef(output, coord_b, sum);\n\ }\n\ +\n\ +#define GEMM_TRANSA_2D(name, dst_type, read_image_type, convert_type, write_image_type) \\\n\ +__kernel void gemm_transa_##name##_2D( \\\n\ + __read_only image2d_t inputA, \\\n\ + __read_only image2d_t inputB, \\\n\ + __write_only image2d_t output, \\\n\ + int M, \\\n\ + int K, \\\n\ + int N, \\\n\ + int ac2zero, \\\n\ + int bc2zero, \\\n\ + float scale_a, \\\n\ + float zp_a, \\\n\ + float scale_b, \\\n\ + float zp_b, \\\n\ + float scale_out, \\\n\ + float zp_out \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + float4 sum = (float4)(0); \\\n\ + dst_type dst; \\\n\ + \\\n\ + for(; coord.z < K;) \\\n\ + { \\\n\ + float4 tempA0; \\\n\ + float4 tempB0; \\\n\ + \\\n\ + tempA0 = convert_float4(read_image_type(inputA, coord.yz)); \\\n\ + tempB0 = convert_float4(read_image_type(inputB, coord.xz)); \\\n\ + tempA0.x = (tempA0.x - zp_a) * scale_a; \\\n\ + tempB0.x = (tempB0.x - zp_b) * scale_b; \\\n\ + coord.z++; \\\n\ + \\\n\ + sum = sum + tempA0 * tempB0; \\\n\ + } \\\n\ + sum.x = sum.x * scale_out + zp_out; \\\n\ + dst = convert_type(sum); \\\n\ + \\\n\ + write_image_type(output, coord.xy, dst); \\\n\ +}\n\ +GEMM_TRANSA_2D(I8I8toI8,int4,read_imagei,convert_int4,write_imagei);\n\ +GEMM_TRANSA_2D(U8U8toU8,uint4,read_imageui,convert_uint4,write_imageui);\n\ +GEMM_TRANSA_2D(U8U8toF32,float4,read_imageui,convert_float4,write_imagef);\n\ +\n\ +#define GEMM_TRANSA_3D(name, dst_type, read_image_type, convert_type, write_image_type) \\\n\ +__kernel void gemm_transa_##name##_3D( \\\n\ + __read_only image2d_array_t inputA, \\\n\ + __read_only image2d_array_t inputB, \\\n\ + __write_only image2d_array_t output, \\\n\ + int M, \\\n\ + int K, \\\n\ + int N, \\\n\ + int ac2zero, \\\n\ + int bc2zero, \\\n\ + float scale_a, \\\n\ + float zp_a, \\\n\ + float scale_b, \\\n\ + float zp_b, \\\n\ + float scale_out, \\\n\ + float zp_out \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord_a = (int4)(gidy, 0, (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ + int4 coord_b = (int4)(gidx, 0, (bc2zero ? 
0 : get_global_id(2)), 0); \\\n\ + \\\n\ + float4 sum = (float4)(0); \\\n\ + dst_type dst; \\\n\ + \\\n\ + for(; coord_a.y < K;) \\\n\ + { \\\n\ + float4 tempA0; \\\n\ + float4 tempB0; \\\n\ + \\\n\ + tempA0 = convert_float4(read_image_type(inputA, coord_a)); \\\n\ + tempB0 = convert_float4(read_image_type(inputB, coord_b)); \\\n\ + tempA0.x = (tempA0.x - zp_a) * scale_a; \\\n\ + tempB0.x = (tempB0.x - zp_b) * scale_b; \\\n\ + coord_a.y++; \\\n\ + coord_b.y++; \\\n\ + \\\n\ + sum = sum + tempA0 * tempB0; \\\n\ + } \\\n\ + sum.x = sum.x * scale_out + zp_out; \\\n\ + dst = convert_type(sum); \\\n\ + \\\n\ + coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ + write_image_type(output, coord_b, dst); \\\n\ +}\n\ +GEMM_TRANSA_3D(I8I8toI8,int4,read_imagei,convert_int4,write_imagei);\n\ +GEMM_TRANSA_3D(U8U8toU8,uint4,read_imageui,convert_uint4,write_imageui);\n\ +GEMM_TRANSA_3D(U8U8toF32,float4,read_imageui,convert_float4,write_imagef);\n\ "; /* end of matrixmul_transA_cl*/ static const char maximum_cl[] = "__kernel void maximum_FP32FP32toFP32\n\ @@ -54157,6 +57762,139 @@ __kernel void maxpoolwithargmax_I32toI32_I32_2D(\n\ }\n\ "; /* end of maxpoolwithargmax_2d_cl*/ +static const char maxunpool_cl[] = "\n\ +#define MAXUNPOOL(name, read_type, read_image_type, write_type, convert_type, writeimage_type) \\\n\ +__kernel void maxunpool_##name( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int width_nopad, \\\n\ + int height_nopad, \\\n\ + int width_in, \\\n\ + int height_in, \\\n\ + int batch, \\\n\ + int pad_left, \\\n\ + int pad_top, \\\n\ + float inputScale, \\\n\ + float inputTail, \\\n\ + float outputScale, \\\n\ + float outputTail \\\n\ + ) \\\n\ +{ \\\n\ + uint gidx = get_global_id(0); \\\n\ + uint gidy = get_global_id(1); \\\n\ + uint gidz = get_global_id(2); \\\n\ + int gidx_in, gidy_in, gidz_in; \\\n\ + int4 coord_out = (int4)(gidx, gidy, gidz, 0); \\\n\ + write_type dst = (write_type)(0); \\\n\ + float4 dst_temp = (float4)(0); \\\n\ + int i,j,k; \\\n\ + if (gidx < pad_left || gidx >= width_nopad + pad_left || \\\n\ + gidy < pad_top || gidy >= height_nopad + pad_top) \\\n\ + { \\\n\ + dst_temp.x = outputTail; \\\n\ + dst = convert_type(dst_temp); \\\n\ + writeimage_type(output, coord_out, dst); \\\n\ + return; \\\n\ + } \\\n\ + gidx_in = gidx - pad_left; \\\n\ + gidy_in = gidy - pad_top; \\\n\ + gidz_in = gidz; \\\n\ + int index = gidz_in * height_nopad * width_nopad + gidy_in * width_nopad + gidx_in; \\\n\ + for (k = 0;k < batch;k++) \\\n\ + { \\\n\ + for (j = 0;j < height_in; j++) \\\n\ + { \\\n\ + for (i = 0;i < width_in; i++) \\\n\ + { \\\n\ + int index_useful = read_imagei(input1, (int4)(i,j,k,0)).x; \\\n\ + if (index_useful == index) \\\n\ + { \\\n\ + read_type src = read_image_type(input0, (int4)(i,j,k,0)); \\\n\ + dst_temp = convert_float4(src) * inputScale + inputTail; \\\n\ + dst = convert_type(dst_temp * outputScale + outputTail); \\\n\ + writeimage_type(output, coord_out, dst); \\\n\ + return; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + dst_temp.x = outputTail; \\\n\ + dst = convert_type(dst_temp); \\\n\ + writeimage_type(output, coord_out, dst); \\\n\ +}\n\ +MAXUNPOOL(F32toF32,float4,read_imagef,float4,convert_float4,write_imagef)\n\ +MAXUNPOOL(F32toU32,float4,read_imagef,uint4, convert_uint4, write_imageui)\n\ +MAXUNPOOL(F32toI32,float4,read_imagef,int4, convert_int4, write_imagei)\n\ +\n\ +MAXUNPOOL(U32toU32,uint4,read_imageui,uint4, convert_uint4, write_imageui)\n\ 
+MAXUNPOOL(U32toF32,uint4,read_imageui,float4,convert_float4,write_imagef)\n\ +MAXUNPOOL(U32toI32,uint4,read_imageui,int4, convert_int4, write_imagei)\n\ +\n\ +MAXUNPOOL(I32toU32,int4,read_imagei,uint4, convert_uint4, write_imageui)\n\ +MAXUNPOOL(I32toF32,int4,read_imagei,float4,convert_float4,write_imagef)\n\ +MAXUNPOOL(I32toI32,int4,read_imagei,int4, convert_int4, write_imagei)\n\ +\n\ +__kernel void maxunpool_BF16toBF16(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int width_nopad,\n\ + int height_nopad,\n\ + int width_in,\n\ + int height_in,\n\ + int batch,\n\ + int pad_left,\n\ + int pad_top,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + uint gidx = get_global_id(0);\n\ + uint gidy = get_global_id(1);\n\ + uint gidz = get_global_id(2);\n\ + int gidx_in, gidy_in, gidz_in;\n\ + int4 coord_out = (int4)(gidx, gidy, gidz, 0);\n\ + uint4 dst = (uint4)(0);\n\ + float4 dst_temp = (float4)(0);\n\ + int i,j,k;\n\ + if (gidx < pad_left || gidx >= width_nopad + pad_left ||\n\ + gidy < pad_top || gidy >= height_nopad + pad_top)\n\ + {\n\ + dst_temp.x = 0;\n\ + _viv_asm(COPY, dst, dst_temp, 16);\n\ + dst.x = dst.x >> 16;\n\ + write_imageui(output, coord_out, dst);\n\ + return;\n\ + }\n\ + gidx_in = gidx - pad_left;\n\ + gidy_in = gidy - pad_top;\n\ + gidz_in = gidz;\n\ + int index = gidz_in * height_nopad * width_nopad + gidy_in * width_nopad + gidx_in;\n\ + for (k = 0;k < batch;k++)\n\ + {\n\ + for (j = 0;j < height_in; j++)\n\ + {\n\ + for (i = 0;i < width_in; i++)\n\ + {\n\ + int index_useful = read_imagei(input1, (int4)(i,j,k,0)).x;\n\ + if (index_useful == index)\n\ + {\n\ + uint4 src = read_imageui(input0, (int4)(i,j,k,0));\n\ + write_imageui(output, coord_out, src);\n\ + return;\n\ + }\n\ + }\n\ + }\n\ + }\n\ + dst_temp.x = 0;\n\ + _viv_asm(COPY, dst, dst_temp, 16);\n\ + dst.x = dst.x >> 16;\n\ + write_imageui(output, coord_out, dst);\n\ +}"; /* end of maxunpool_cl*/ + static const char minimum_cl[] = "__kernel void minimum_FP32FP32toFP32\n\ (\n\ __read_only image2d_array_t input0,\n\ @@ -55979,7 +59717,11 @@ static const char pow_cl[] = "__kernel void pow_FP32FP32toFP32\n\ (\n\ __read_only image2d_array_t input0,\n\ __read_only image2d_array_t input1,\n\ - __write_only image2d_array_t output\n\ + __write_only image2d_array_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputTail\n\ )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ @@ -55992,7 +59734,8 @@ static const char pow_cl[] = "__kernel void pow_FP32FP32toFP32\n\ float4 s0 = sign(src0);\n\ int4 t0 = convert_int4(src1) & 1;\n\ s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - dst.x = (src0.x == 0 && src1.x == 0) ? 1.0f : (src0.x != 0 ? (s0.x * exp2(src1.x * log2(fabs(src0.x)))) : 0.0f);\n\ + dst.x = (src0.x == 0 && src1.x == 0) ? 
1.0f : (src0.x != 0 ?\n\ + (s0.x * exp2(src1.x * log2(fabs(src0.x)))) : 0.0f);\n\ \n\ write_imagef(output, coord, dst);\n\ }\n\ @@ -56001,7 +59744,11 @@ __kernel void pow_FP32FP32toFP32_2D\n\ (\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ - __write_only image2d_t output\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputTail\n\ )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ @@ -56015,10 +59762,72 @@ __kernel void pow_FP32FP32toFP32_2D\n\ int4 t0 = convert_int4(src1) & 1;\n\ s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ \n\ - dst.x = (src0.x == 0 && src1.x == 0) ? 1.0f : (src0.x != 0 ? (s0.x * exp2(src1.x * log2(fabs(src0.x)))) : 0.0f);\n\ + dst.x = (src0.x == 0 && src1.x == 0) ? 1.0f : (src0.x != 0 ?\n\ + (s0.x * exp2(src1.x * log2(fabs(src0.x)))) : 0.0f);\n\ \n\ write_imagef(output, coord, dst);\n\ }\n\ +\n\ +__kernel void pow_U32F32toU32(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + uint4 src0, dst;\n\ + float4 src0_f, src1, dst_f;\n\ + READ_IMAGEUI_2DARRAY(src0, input0, coord);\n\ + READ_IMAGEF_2DARRAY(src1, input1, coord);\n\ +\n\ + src0_f = convert_float4(src0) * inputScale + inputTail;\n\ + float4 s0 = sign(src0_f);\n\ + int4 t0 = convert_int4(src1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ + dst_f.x = (src0.x == 0 && src1.x == 0) ? 1.0f : (src0.x != 0 ?\n\ + (s0.x * exp2(src1.x * log2(fabs(src0.x)))) : 0.0f);\n\ + dst.x = convert_uint(dst_f.x * outputScale + outputTail);\n\ +\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void pow_U32F32toU32_2D\n\ + (\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + uint4 src0 = read_imageui(input0, coord);\n\ + float4 src1 = read_imagef(input1, coord);\n\ +\n\ + float4 src0_f = (float4)(0);\n\ + float4 dst_f = (float4)(0);\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + src0_f.x = convert_float(src0.x) * inputScale + inputTail;\n\ + float4 s0 = sign(src0_f);\n\ + int4 t0 = convert_int4(src1) & 1;\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ +\n\ + dst_f.x = (src0.x == 0 && src1.x == 0) ? 
1.0f : (src0.x != 0 ?\n\ + (s0.x * exp2(src1.x * log2(fabs(src0.x)))) : 0.0f);\n\ + dst.x = convert_uint(dst_f.x * outputScale + outputTail);\n\ +\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ "; /* end of pow_cl*/ static const char prelu_cl[] = "__kernel void prelu_FP32FP32toFP32\n\ @@ -58487,7 +62296,164 @@ __kernel void resize_nearest_U8toU8(\n\ }\n\ "; /* end of resize_nearest_cl*/ -static const char roi_align_cl[] = "\n\ +static const char reversesequence_cl[] = "#define REVERSESEQUENCE_axis2(name,src_type,readimage_type,\\\n\ + convert_type,dst_type,writeimage_type) \\\n\ +__kernel void reversesequence_##name( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + float inoutScale, \\\n\ + float inoutTail \\\n\ + ) \\\n\ +{ \\\n\ + uint gidx = get_global_id(0); \\\n\ + uint gidy = get_global_id(1); \\\n\ + uint gidz = get_global_id(2); \\\n\ +\\\n\ + int4 coord_in = (int4)(gidx, gidy, gidz, 0); \\\n\ + int4 coord_out = coord_in; \\\n\ + src_type src = readimage_type(input0, coord_in); \\\n\ + int src_index = read_imagei(input1, (int2)(gidz, 0)).x; \\\n\ + float4 src_temp = convert_float4(src); \\\n\ + dst_type dst = convert_type(src_temp * inoutScale + inoutTail); \\\n\ + if (gidy >= src_index) \\\n\ + { \\\n\ + writeimage_type(output, coord_out, dst); \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + coord_out.y = src_index - 1 - coord_out.y; \\\n\ + writeimage_type(output, coord_out, dst); \\\n\ + } \\\n\ +}\n\ +REVERSESEQUENCE_axis2(F32toF32_axis2,float4,read_imagef,\\\n\ + convert_float4,float4,write_imagef)\n\ +REVERSESEQUENCE_axis2(F32toU32_axis2,float4,read_imagef,\\\n\ + convert_uint4, uint4, write_imageui)\n\ +REVERSESEQUENCE_axis2(F32toI32_axis2,float4,read_imagef,\\\n\ + convert_int4, int4, write_imagei)\n\ +REVERSESEQUENCE_axis2(I32toF32_axis2,int4, read_imagei,\\\n\ + convert_float4,float4,write_imagef)\n\ +REVERSESEQUENCE_axis2(I32toU32_axis2,int4, read_imagei,\\\n\ + convert_uint4, uint4, write_imageui)\n\ +REVERSESEQUENCE_axis2(I32toI32_axis2,int4, read_imagei,\\\n\ + convert_int4, int4, write_imagei)\n\ +REVERSESEQUENCE_axis2(U32toF32_axis2,uint4, read_imageui,\\\n\ + convert_float4,float4,write_imagef)\n\ +REVERSESEQUENCE_axis2(U32toU32_axis2,uint4, read_imageui,\\\n\ + convert_uint4, uint4, write_imageui)\n\ +REVERSESEQUENCE_axis2(U32toI32_axis2,uint4, read_imageui,\\\n\ + convert_int4, int4, write_imagei)\n\ +\n\ +__kernel void reversesequence_BF16toBF16_axis2(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float inoutScale,\n\ + float inoutTail\n\ + )\n\ +{\n\ + uint gidx = get_global_id(0);\n\ + uint gidy = get_global_id(1);\n\ + uint gidz = get_global_id(2);\n\ +\n\ + int4 coord_in = (int4)(gidx, gidy, gidz, 0);\n\ + int4 coord_out = coord_in;\n\ + uint4 src = read_imageui(input0, coord_in);\n\ + int src_index = read_imagei(input1, (int2)(gidz, 0)).x;\n\ + uint4 dst = src;\n\ + if (gidy >= src_index)\n\ + {\n\ + write_imageui(output, coord_out, dst);\n\ + }\n\ + else\n\ + {\n\ + coord_out.y = src_index - 1 - coord_out.y;\n\ + write_imageui(output, coord_out, dst);\n\ + }\n\ +}\n\ +\n\ +\n\ +#define REVERSESEQUENCE_axis1(name,src_type,readimage_type,\\\n\ + convert_type,dst_type,writeimage_type) \\\n\ +__kernel void reversesequence_##name( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + float inoutScale, \\\n\ + float 
inoutTail \\\n\ + ) \\\n\ +{ \\\n\ + uint gidx = get_global_id(0); \\\n\ + uint gidy = get_global_id(1); \\\n\ + uint gidz = get_global_id(2); \\\n\ +\\\n\ + int4 coord_in = (int4)(gidx, gidy, gidz, 0); \\\n\ + int4 coord_out = coord_in; \\\n\ + src_type src = readimage_type(input0, coord_in); \\\n\ + int src_index = read_imagei(input1, (int2)(gidy, 0)).x; \\\n\ + float4 src_temp = convert_float4(src); \\\n\ + dst_type dst = convert_type(src_temp * inoutScale + inoutTail ); \\\n\ + if (gidz >= src_index) \\\n\ + { \\\n\ + writeimage_type(output, coord_out, dst); \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + coord_out.z = src_index - 1 - coord_out.z; \\\n\ + writeimage_type(output, coord_out, dst); \\\n\ + } \\\n\ +}\n\ +REVERSESEQUENCE_axis1(F32toF32_axis1,float4,read_imagef,\\\n\ + convert_float4,float4,write_imagef)\n\ +REVERSESEQUENCE_axis1(F32toU32_axis1,float4,read_imagef,\\\n\ + convert_uint4, uint4, write_imageui)\n\ +REVERSESEQUENCE_axis1(F32toI32_axis1,float4,read_imagef,\\\n\ + convert_int4, int4, write_imagei)\n\ +REVERSESEQUENCE_axis1(I32toF32_axis1,int4, read_imagei,\\\n\ + convert_float4,float4,write_imagef)\n\ +REVERSESEQUENCE_axis1(I32toU32_axis1,int4, read_imagei,\\\n\ + convert_uint4, uint4, write_imageui)\n\ +REVERSESEQUENCE_axis1(I32toI32_axis1,int4, read_imagei,\\\n\ + convert_int4, int4, write_imagei)\n\ +REVERSESEQUENCE_axis1(U32toF32_axis1,uint4, read_imageui,\\\n\ + convert_float4,float4,write_imagef)\n\ +REVERSESEQUENCE_axis1(U32toU32_axis1,uint4, read_imageui,\\\n\ + convert_uint4, uint4, write_imageui)\n\ +REVERSESEQUENCE_axis1(U32toI32_axis1,uint4, read_imageui,\\\n\ + convert_int4, int4, write_imagei)\n\ +\n\ +__kernel void reversesequence_BF16toBF16_axis1(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float inoutScale,\n\ + float inoutTail\n\ + )\n\ +{\n\ + uint gidx = get_global_id(0);\n\ + uint gidy = get_global_id(1);\n\ + uint gidz = get_global_id(2);\n\ +\n\ + int4 coord_in = (int4)(gidx, gidy, gidz, 0);\n\ + int4 coord_out = coord_in;\n\ + uint4 src = read_imageui(input0, coord_in);\n\ + int src_index = read_imagei(input1, (int2)(gidy, 0)).x;\n\ + uint4 dst = src;\n\ + if (gidz >= src_index)\n\ + {\n\ + write_imageui(output, coord_out, dst);\n\ + }\n\ + else\n\ + {\n\ + coord_out.z = src_index - 1 - coord_out.z;\n\ + write_imageui(output, coord_out, dst);\n\ + }\n\ +}\n\ +"; /* end of reversesequence_cl*/ + +static const char roi_align_cl[] = "#define VSI_NN_ROI_ALIGN_ANDROID 0\n\ +\n\ inline float roi_align_1x1\n\ (\n\ __read_only image2d_array_t input,\n\ @@ -58497,7 +62463,8 @@ inline float roi_align_1x1\n\ int2 grid_size,\n\ float2 rcp_of_grid_size,\n\ int pz,\n\ - int4 max_spatial_dims\n\ + int4 max_spatial_dims,\n\ + int platform_type\n\ )\n\ {\n\ float sum = 0;\n\ @@ -58512,10 +62479,21 @@ inline float roi_align_1x1\n\ int2 xy_low = convert_int2(pos);\n\ int2 xy_high = xy_low + 1;\n\ \n\ - if (xy_low.x > max_spatial_dims.x || max_spatial_dims.x < -1 ||\n\ - xy_low.y > max_spatial_dims.y || max_spatial_dims.y < -1 )\n\ + if (VSI_NN_ROI_ALIGN_ANDROID == platform_type)\n\ {\n\ - continue;\n\ + if (xy_low.x > max_spatial_dims.x || xy_low.x < -1 ||\n\ + xy_low.y > max_spatial_dims.y || xy_low.y < -1 )\n\ + {\n\ + continue;\n\ + }\n\ + }\n\ + else\n\ + {\n\ + if (pos.x > max_spatial_dims.x || pos.x < -1 ||\n\ + pos.y > max_spatial_dims.y || pos.y < -1 )\n\ + {\n\ + continue;\n\ + }\n\ }\n\ \n\ float2 lxy = pos - floor(pos);\n\ @@ -58565,7 +62543,8 @@ __kernel void 
roi_align_F32_F32toF32\n\ float sampling_x_ratio,\n\ float sampling_y_ratio,\n\ int depth,\n\ - int dtype\n\ + int dtype,\n\ + int platform_type\n\ )\n\ {\n\ int px = get_global_id(0);\n\ @@ -58611,7 +62590,8 @@ __kernel void roi_align_F32_F32toF32\n\ grid_size_xy,\n\ rcp_of_grid_size,\n\ kz,\n\ - max_spatial_dims);\n\ + max_spatial_dims,\n\ + platform_type);\n\ \n\ if (dtype == TYPE_FLOAT16)\n\ {\n\ @@ -58627,10 +62607,9 @@ __kernel void roi_align_F32_F32toF32\n\ }\n\ else\n\ {\n\ - Tensor out_t = create_tensor_from_image2d_array(output, 4);\n\ - float *output_ptr = (float *)get_tensor_ptr_from_coord(out_t, (int4)(px, py, kz1, 0));\n\ -\n\ - output_ptr[0] = interp.x;\n\ + float4 dst = (float4)(interp.x,0,0,0);\n\ + int4 coord_dst = (int4)(px, py, kz1, 0);\n\ + write_imagef(output,coord_dst,dst);\n\ }\n\ }\n\ }\n\ @@ -58646,7 +62625,8 @@ inline float roi_align_1x1_U8toF32\n\ int2 grid_size,\n\ float2 rcp_of_grid_size,\n\ int pz,\n\ - int4 max_spatial_dims\n\ + int4 max_spatial_dims,\n\ + int platform_type\n\ )\n\ {\n\ float sum = 0;\n\ @@ -58657,41 +62637,52 @@ inline float roi_align_1x1_U8toF32\n\ {\n\ float2 ixy = (float2)(ix + 0.5f, iy + 0.5f);\n\ float2 pos = region_start + ixy * bin_size * rcp_of_grid_size;\n\ - \n\ +\n\ int2 xy_low = convert_int2(pos);\n\ int2 xy_high = xy_low + 1;\n\ - \n\ +\n\ float2 lxy = pos - floor(pos);\n\ float2 zero = 0;\n\ - \n\ - if (xy_low.x > max_spatial_dims.x || max_spatial_dims.x < -1 ||\n\ - xy_low.y > max_spatial_dims.y || max_spatial_dims.y < -1 )\n\ +\n\ + if (VSI_NN_ROI_ALIGN_ANDROID == platform_type)\n\ {\n\ - continue;\n\ + if (xy_low.x > max_spatial_dims.x || xy_low.x < -1 ||\n\ + xy_low.y > max_spatial_dims.y || xy_low.y < -1 )\n\ + {\n\ + continue;\n\ + }\n\ }\n\ - \n\ + else\n\ + {\n\ + if (pos.x > max_spatial_dims.x || pos.x < -1 ||\n\ + pos.y > max_spatial_dims.y || pos.y < -1 )\n\ + {\n\ + continue;\n\ + }\n\ + }\n\ +\n\ lxy = xy_low >= max_spatial_dims.zw ? 
0.0 : lxy;\n\ - \n\ +\n\ float hy = 1.0f - lxy.y;\n\ float hx = 1.0f - lxy.x;\n\ - \n\ +\n\ float w1 = hy * hx;\n\ float w2 = lxy.x - lxy.x * lxy.y;\n\ float w3 = lxy.y - lxy.x * lxy.y;\n\ float w4 = lxy.y * lxy.x;\n\ - \n\ +\n\ uint4 data;\n\ data.x = read_imageui(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x;\n\ data.y = read_imageui(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x;\n\ data.z = read_imageui(input, (int4)(xy_low.x, xy_high.y, pz, 0)).x;\n\ data.w = read_imageui(input, (int4)(xy_high.x, xy_high.y, pz, 0)).x;\n\ - \n\ +\n\ float4 value = convert_float4(data) * input_scale + input_tail;\n\ - \n\ +\n\ sum = sum + w1 * value.x + w2 * value.y + w3 * value.z + w4 * value.w;\n\ }\n\ }\n\ - \n\ +\n\ return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y);\n\ \n\ }\n\ @@ -58715,7 +62706,8 @@ __kernel void roi_align_U8_U16toU8\n\ float sampling_x_ratio,\n\ float sampling_y_ratio,\n\ int depth,\n\ - int dtype\n\ + int dtype,\n\ + int platform_type\n\ )\n\ {\n\ int px = get_global_id(0);\n\ @@ -58763,7 +62755,8 @@ __kernel void roi_align_U8_U16toU8\n\ grid_size_xy,\n\ rcp_of_grid_size,\n\ kz,\n\ - max_spatial_dims);\n\ + max_spatial_dims,\n\ + platform_type);\n\ \n\ uchar dst;\n\ interp.x = interp.x * output_scale + output_zp;\n\ @@ -58772,7 +62765,7 @@ __kernel void roi_align_U8_U16toU8\n\ \n\ Tensor out_t = create_tensor_from_image2d_array(output, 1);\n\ uchar *output_ptr = (uchar *)get_tensor_ptr_from_coord(out_t, (int4)(px, py, kz1, 0));\n\ - \n\ +\n\ output_ptr[0] = dst;\n\ }\n\ }"; /* end of roi_align_cl*/ @@ -60457,7 +64450,7 @@ __kernel void swish_F32toF32_2D(\n\ src = convert_float4(src0) * inputScale - inputTail; \\\n\ tmp.x = sigmoid_(src.x * beta, logE); \\\n\ data.x = src.x * tmp.x; \\\n\ - uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ + uint4 dst = convert_uint4_rte(data * outputScale + outputZP); \\\n\ write_imageui(output, coord, dst);\n\ \n\ __kernel void swish_U8toU8(\n\ @@ -60525,7 +64518,42 @@ __kernel void swish_I32toI32_2D(\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ SWISH_I32_I32_PROCESS()\n\ }\n\ -"; /* end of swish_cl*/ +\n\ +#define SWISH_F32_U8_PROCESS() \\\n\ + float4 src, tmp, data; \\\n\ + src = read_imagef(input, coord); \\\n\ + tmp.x = sigmoid_(src.x * beta, logE); \\\n\ + data.x = src.x * tmp.x; \\\n\ + uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ + write_imageui(output, coord, dst);\n\ +\n\ +__kernel void swish_F32toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP,\n\ + float beta,\n\ + float logE)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + SWISH_F32_U8_PROCESS()\n\ +}\n\ +\n\ +__kernel void swish_F32toU8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP,\n\ + float beta,\n\ + float logE)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + SWISH_F32_U8_PROCESS()\n\ +}"; /* end of swish_cl*/ static const char tile_cl[] = "\n\ #define TILE_3D(name0, name1, data_type, read_image_func, write_image_func) \\\n\ @@ -60626,6 +64654,10 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag __read_only image2d_t input, \\\n\ __write_only image2d_t output, \\\n\ __write_only image2d_t indices, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + float output_scale, 
\\\n\ + float output_tail, \\\n\ int num_stages, \\\n\ int width \\\n\ ) \\\n\ @@ -60710,6 +64742,10 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag __read_only image2d_t input, \\\n\ __write_only image2d_t output, \\\n\ __write_only image2d_t indices, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + float output_scale, \\\n\ + float output_tail, \\\n\ int num_stages, \\\n\ int width \\\n\ ) \\\n\ @@ -60794,6 +64830,10 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag __read_only image2d_t input, \\\n\ __write_only image2d_t output, \\\n\ __write_only image2d_t indices, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + float output_scale, \\\n\ + float output_tail, \\\n\ int num_stages, \\\n\ int width \\\n\ ) \\\n\ @@ -60871,7 +64911,182 @@ TOPK_I32(1 << 3, 3)\n\ TOPK_I32(1 << 4, 4)\n\ TOPK_I32(1 << 5, 5)\n\ TOPK_I32(1 << 6, 6)\n\ -"; /* end of topk_cl*/ +\n\ +#define TOPK_F32toU32(LOCAL_SIZE0, STAGES) \\\n\ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toU32_I32 \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t indices, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + float output_scale, \\\n\ + float output_tail, \\\n\ + int num_stages, \\\n\ + int width \\\n\ + ) \\\n\ + { \\\n\ + uint local_id = get_local_id(0); \\\n\ + uint work_group_size = get_local_size(0); \\\n\ + uint offset = 0; \\\n\ + \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + __local float local_data[128]; \\\n\ + __local uint local_indices[128]; \\\n\ + \\\n\ + float left = read_imagef(input, coord.xy).x; \\\n\ + coord.z += work_group_size; \\\n\ + float data = read_imagef(input, coord.zy).x; \\\n\ + float right = coord.z < width ? 
data : -2147483647.0f; \\\n\ + \\\n\ + local_data[local_id] = left; \\\n\ + local_indices[local_id] = local_id; \\\n\ + local_data[local_id + work_group_size] = right; \\\n\ + local_indices[local_id + work_group_size] = local_id + work_group_size; \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\ + { \\\n\ + uint signo = (local_id >> stage) & 1; \\\n\ + \\\n\ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + uint postShift = (stage - passOfStage); \\\n\ + uint pairDistance = 1 << postShift; \\\n\ + \\\n\ + uint left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \\\n\ + uint right_id = left_id + pairDistance; \\\n\ + \\\n\ + uint left_idx = local_indices[left_id]; \\\n\ + uint right_idx = local_indices[right_id]; \\\n\ + \\\n\ + float left_elem = local_data[left_id]; \\\n\ + float right_elem = local_data[right_id]; \\\n\ + \\\n\ + if ((left_elem < right_elem) ^ signo) \\\n\ + { \\\n\ + local_data[left_id] = right_elem; \\\n\ + local_data[right_id] = left_elem; \\\n\ + \\\n\ + local_indices[left_id] = right_idx; \\\n\ + local_indices[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + uint4 dst; \\\n\ + dst.x = convert_uint(local_data[local_id] * output_scale + output_tail); \\\n\ + dst.y = convert_uint(local_data[local_id + work_group_size] * output_scale + output_tail); \\\n\ + write_imageui(output, coord.xy, dst.xxxx); \\\n\ + write_imageui(output, coord.zy, dst.yyyy); \\\n\ + \\\n\ + int4 index; \\\n\ + index.x = ((int*)local_indices)[local_id]; \\\n\ + index.y = ((int*)local_indices)[local_id + work_group_size]; \\\n\ + \\\n\ + write_imagei(indices, coord.xy, index.xxxx); \\\n\ + write_imagei(indices, coord.zy, index.yyyy); \\\n\ + }\n\ +\n\ +TOPK_F32toU32(1 << 0, 0)\n\ +TOPK_F32toU32(1 << 1, 1)\n\ +TOPK_F32toU32(1 << 2, 2)\n\ +TOPK_F32toU32(1 << 3, 3)\n\ +TOPK_F32toU32(1 << 4, 4)\n\ +TOPK_F32toU32(1 << 5, 5)\n\ +TOPK_F32toU32(1 << 6, 6)\n\ +\n\ +#define TOPK_F32toI32(LOCAL_SIZE0, STAGES) \\\n\ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toI32_I32 \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t indices, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + float output_scale, \\\n\ + float output_tail, \\\n\ + int num_stages, \\\n\ + int width \\\n\ + ) \\\n\ + { \\\n\ + uint local_id = get_local_id(0); \\\n\ + uint work_group_size = get_local_size(0); \\\n\ + uint offset = 0; \\\n\ + \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + __local float local_data[128]; \\\n\ + __local uint local_indices[128]; \\\n\ + \\\n\ + float left = read_imagef(input, coord.xy).x; \\\n\ + coord.z += work_group_size; \\\n\ + float data = read_imagef(input, coord.zy).x; \\\n\ + float right = coord.z < width ? 
data : -2147483647.0f; \\\n\ + \\\n\ + local_data[local_id] = left; \\\n\ + local_indices[local_id] = local_id; \\\n\ + local_data[local_id + work_group_size] = right; \\\n\ + local_indices[local_id + work_group_size] = local_id + work_group_size; \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\ + { \\\n\ + uint signo = (local_id >> stage) & 1; \\\n\ + \\\n\ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + uint postShift = (stage - passOfStage); \\\n\ + uint pairDistance = 1 << postShift; \\\n\ + \\\n\ + uint left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \\\n\ + uint right_id = left_id + pairDistance; \\\n\ + \\\n\ + uint left_idx = local_indices[left_id]; \\\n\ + uint right_idx = local_indices[right_id]; \\\n\ + \\\n\ + float left_elem = local_data[left_id]; \\\n\ + float right_elem = local_data[right_id]; \\\n\ + \\\n\ + if ((left_elem < right_elem) ^ signo) \\\n\ + { \\\n\ + local_data[left_id] = right_elem; \\\n\ + local_data[right_id] = left_elem; \\\n\ + \\\n\ + local_indices[left_id] = right_idx; \\\n\ + local_indices[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + int4 dst; \\\n\ + dst.x = convert_int(local_data[local_id] * output_scale + output_tail); \\\n\ + dst.y = convert_int(local_data[local_id + work_group_size] * output_scale + output_tail); \\\n\ + write_imagei(output, coord.xy, dst.xxxx); \\\n\ + write_imagei(output, coord.zy, dst.yyyy); \\\n\ + \\\n\ + int4 index; \\\n\ + index.x = ((int*)local_indices)[local_id]; \\\n\ + index.y = ((int*)local_indices)[local_id + work_group_size]; \\\n\ + \\\n\ + write_imagei(indices, coord.xy, index.xxxx); \\\n\ + write_imagei(indices, coord.zy, index.yyyy); \\\n\ + }\n\ +\n\ +TOPK_F32toI32(1 << 0, 0)\n\ +TOPK_F32toI32(1 << 1, 1)\n\ +TOPK_F32toI32(1 << 2, 2)\n\ +TOPK_F32toI32(1 << 3, 3)\n\ +TOPK_F32toI32(1 << 4, 4)\n\ +TOPK_F32toI32(1 << 5, 5)\n\ +TOPK_F32toI32(1 << 6, 6)"; /* end of topk_cl*/ static const char topk_odd_even_sort_cl[] = "#define LOCAL_SIZE_X (32)\n\ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_F32toF32_I32\n\ @@ -60881,6 +65096,10 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd image2d_t indices_t,\n\ __write_only image2d_t output,\n\ __write_only image2d_t indices,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_tail,\n\ int width\n\ )\n\ {\n\ @@ -60990,6 +65209,10 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd image2d_t indices_t,\n\ __write_only image2d_t output,\n\ __write_only image2d_t indices,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_tail,\n\ int width\n\ )\n\ {\n\ @@ -61099,6 +65322,10 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd image2d_t indices_t,\n\ __write_only image2d_t output,\n\ __write_only image2d_t indices,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_tail,\n\ int width\n\ )\n\ {\n\ @@ -61199,7 +65426,239 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd write_imagei(output, coord.xy, data);\n\ write_imagei(indices, coord.xy, index);\n\ }\n\ -}"; /* end of topk_odd_even_sort_cl*/ +}\n\ +"; /* end of topk_odd_even_sort_cl*/ + +static const char 
topk_odd_even_sort2_cl[] = "#define LOCAL_SIZE_X (32)\n\ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_F32toU32_I32\n\ + (\n\ + __read_only image2d_t input,\n\ + image2d_t input_t,\n\ + image2d_t indices_t,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t indices,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_tail,\n\ + int width\n\ + )\n\ + {\n\ + uint lid = get_local_id(0);\n\ + uint work_group_size = get_local_size(0);\n\ + uint offset = 0;\n\ +\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ +\n\ + write_imagef(input_t, coord.xy, data);\n\ + write_imagei(indices_t, coord.xy, coord.xxxx);\n\ + }\n\ +\n\ + __local int sorted[1];\n\ + int width_minus_one = width - 1;\n\ + int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;\n\ + num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);\n\ +\n\ + int x_start = lid * num_pixels_per_thread;\n\ + int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);\n\ +\n\ + sorted[0] = 0;\n\ +\n\ + while (1)\n\ + {\n\ + if (lid == 0)\n\ + {\n\ + *sorted = 0;\n\ + }\n\ + int swapped = 0;\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ +\n\ + // odd-even\n\ + coord.x = x_start;\n\ + coord.z = x_start + 1;\n\ + for (; coord.x < x_end; )\n\ + {\n\ + float4 left = read_imagef(input_t, coord.xy);\n\ + float4 right = read_imagef(input_t, coord.zy);\n\ +\n\ + if (left.x < right.x)\n\ + {\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ + swapped = 1;\n\ +\n\ + write_imagef(input_t, coord.xy, right);\n\ + write_imagef(input_t, coord.zy, left);\n\ +\n\ + write_imagei(indices_t, coord.xy, r_index);\n\ + write_imagei(indices_t, coord.zy, l_index);\n\ + }\n\ +\n\ + coord.xz = coord.xz + 2;\n\ + }\n\ +\n\ + // even-odd\n\ + coord.x = x_start + 1;\n\ + coord.z = x_start + 2;\n\ + for (; coord.x < x_end; )\n\ + {\n\ + float4 left = read_imagef(input_t, coord.xy);\n\ + float4 right = read_imagef(input_t, coord.zy);\n\ +\n\ + if (left.x < right.x)\n\ + {\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ + swapped = 1;\n\ +\n\ + write_imagef(input_t, coord.xy, right);\n\ + write_imagef(input_t, coord.zy, left);\n\ +\n\ + write_imagei(indices_t, coord.xy, r_index);\n\ + write_imagei(indices_t, coord.zy, l_index);\n\ + }\n\ +\n\ + coord.xz = coord.xz + 2;\n\ + }\n\ +\n\ + atomic_add(sorted, swapped);\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ +\n\ + if (*sorted == 0)\n\ + break;\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ + }\n\ +\n\ + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X)\n\ + {\n\ + float4 data = read_imagef(input_t, coord.xy);\n\ + int4 index = read_imagei(indices_t, coord.xy);\n\ +\n\ + uint4 dst;\n\ + dst = convert_uint4(data * output_scale + output_tail);\n\ + write_imageui(output, coord.xy, dst);\n\ + write_imagei(indices, coord.xy, index);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_F32toI32_I32\n\ + (\n\ + __read_only image2d_t input,\n\ + image2d_t input_t,\n\ + image2d_t indices_t,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t indices,\n\ + float input_scale,\n\ + float input_tail,\n\ + float 
output_scale,\n\ + float output_tail,\n\ + int width\n\ + )\n\ + {\n\ + uint lid = get_local_id(0);\n\ + uint work_group_size = get_local_size(0);\n\ + uint offset = 0;\n\ +\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ +\n\ + write_imagef(input_t, coord.xy, data);\n\ + write_imagei(indices_t, coord.xy, coord.xxxx);\n\ + }\n\ +\n\ + __local int sorted[1];\n\ + int width_minus_one = width - 1;\n\ + int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;\n\ + num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);\n\ +\n\ + int x_start = lid * num_pixels_per_thread;\n\ + int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);\n\ +\n\ + sorted[0] = 0;\n\ +\n\ + while (1)\n\ + {\n\ + if (lid == 0)\n\ + {\n\ + *sorted = 0;\n\ + }\n\ + int swapped = 0;\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ +\n\ + // odd-even\n\ + coord.x = x_start;\n\ + coord.z = x_start + 1;\n\ + for (; coord.x < x_end; )\n\ + {\n\ + float4 left = read_imagef(input_t, coord.xy);\n\ + float4 right = read_imagef(input_t, coord.zy);\n\ +\n\ + if (left.x < right.x)\n\ + {\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ + swapped = 1;\n\ +\n\ + write_imagef(input_t, coord.xy, right);\n\ + write_imagef(input_t, coord.zy, left);\n\ +\n\ + write_imagei(indices_t, coord.xy, r_index);\n\ + write_imagei(indices_t, coord.zy, l_index);\n\ + }\n\ +\n\ + coord.xz = coord.xz + 2;\n\ + }\n\ +\n\ + // even-odd\n\ + coord.x = x_start + 1;\n\ + coord.z = x_start + 2;\n\ + for (; coord.x < x_end; )\n\ + {\n\ + float4 left = read_imagef(input_t, coord.xy);\n\ + float4 right = read_imagef(input_t, coord.zy);\n\ +\n\ + if (left.x < right.x)\n\ + {\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ + swapped = 1;\n\ +\n\ + write_imagef(input_t, coord.xy, right);\n\ + write_imagef(input_t, coord.zy, left);\n\ +\n\ + write_imagei(indices_t, coord.xy, r_index);\n\ + write_imagei(indices_t, coord.zy, l_index);\n\ + }\n\ +\n\ + coord.xz = coord.xz + 2;\n\ + }\n\ +\n\ + atomic_add(sorted, swapped);\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ +\n\ + if (*sorted == 0)\n\ + break;\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ + }\n\ +\n\ + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X)\n\ + {\n\ + float4 data = read_imagef(input_t, coord.xy);\n\ + int4 index = read_imagei(indices_t, coord.xy);\n\ +\n\ + int4 dst;\n\ + dst = convert_int4(data * output_scale + output_tail);\n\ + write_imagei(output, coord.xy, dst);\n\ + write_imagei(indices, coord.xy, index);\n\ + }\n\ +}"; /* end of topk_odd_even_sort2_cl*/ static const char upsample_cl[] = "\n\ #define UPSAMPLE_PROCESS(data_type, read_fun, write_fun) \\\n\ @@ -61427,6 +65886,12 @@ static const source_map_t evis_resource[] = {"argmin_axis2_vx", argmin_axis2_vx}, {"batchnorm_single_vx", batchnorm_single_vx}, {"batchnorm_single_f32_vx", batchnorm_single_f32_vx}, + {"bilinear_grid_sample_BF16_to_BF16_vx", bilinear_grid_sample_BF16_to_BF16_vx}, + {"bilinear_grid_sample_F16_to_F16_vx", bilinear_grid_sample_F16_to_F16_vx}, + {"bilinear_grid_sample_F16_to_U8_vx", bilinear_grid_sample_F16_to_U8_vx}, + {"bilinear_grid_sample_I16_to_I16_vx", bilinear_grid_sample_I16_to_I16_vx}, + {"bilinear_grid_sample_I8_to_I8_vx", 
bilinear_grid_sample_I8_to_I8_vx}, + {"bilinear_grid_sample_U8_to_U8_vx", bilinear_grid_sample_U8_to_U8_vx}, {"bucketize_vx", bucketize_vx}, {"cast_vx", cast_vx}, {"clip_F16_vx", clip_F16_vx}, @@ -61466,6 +65931,8 @@ static const source_map_t evis_resource[] = {"gather_nd_2d_mix_vx", gather_nd_2d_mix_vx}, {"gather_nd_3d_vx", gather_nd_3d_vx}, {"gather_nd_3d_mix_vx", gather_nd_3d_mix_vx}, + {"gather_nd_batch_vx", gather_nd_batch_vx}, + {"gather_nd_batch_2d_vx", gather_nd_batch_2d_vx}, {"gather_nd_mix_vx", gather_nd_mix_vx}, {"get_matrix_vx", get_matrix_vx}, {"group_normalization_0_vx", group_normalization_0_vx}, @@ -61483,6 +65950,9 @@ static const source_map_t evis_resource[] = {"instance_normalization_1_vx", instance_normalization_1_vx}, {"instance_normalization_2_vx", instance_normalization_2_vx}, {"instance_normalization_3_vx", instance_normalization_3_vx}, + {"l1norm_axis0_vx", l1norm_axis0_vx}, + {"l1norm_axis1_vx", l1norm_axis1_vx}, + {"l1norm_axis2_vx", l1norm_axis2_vx}, {"l2normalizescale_axis0_vx", l2normalizescale_axis0_vx}, {"l2normalizescale_axis0_2d_vx", l2normalizescale_axis0_2d_vx}, {"l2normalizescale_axis1_vx", l2normalizescale_axis1_vx}, @@ -61531,6 +66001,7 @@ static const source_map_t evis_resource[] = {"matrixmul_u8_vx", matrixmul_u8_vx}, {"matrixmul_u8f16_f16_vx", matrixmul_u8f16_f16_vx}, {"matrixmul_u8f16_u8_vx", matrixmul_u8f16_u8_vx}, + {"matrixmul_u8i16_i16_vx", matrixmul_u8i16_i16_vx}, {"matrixmul_u8u8_f16_vx", matrixmul_u8u8_f16_vx}, {"maximum_0_vx", maximum_0_vx}, {"maximum_1_vx", maximum_1_vx}, @@ -61621,6 +66092,7 @@ static const source_map_t evis_resource[] = {"scatter_nd_update_vx", scatter_nd_update_vx}, {"scatter_nd_update_atom_vx", scatter_nd_update_atom_vx}, {"scatter_nd_update_big_vx", scatter_nd_update_big_vx}, + {"scatter_nd_update_special_vx", scatter_nd_update_special_vx}, {"select_vx", select_vx}, {"sequence_mask_vx", sequence_mask_vx}, {"signal_frame_vx", signal_frame_vx}, @@ -61649,7 +66121,9 @@ static const source_map_t cl_resource[] = {"argmin_axis0_cl", argmin_axis0_cl}, {"argmin_axis1_cl", argmin_axis1_cl}, {"argmin_axis2_cl", argmin_axis2_cl}, + {"avg_pool3d_cl", avg_pool3d_cl}, {"batchnorm_single_cl", batchnorm_single_cl}, + {"bilinear_grid_sample_cl", bilinear_grid_sample_cl}, {"bucketize_cl", bucketize_cl}, {"cast_cl", cast_cl}, {"clip_BF16_cl", clip_BF16_cl}, @@ -61666,10 +66140,13 @@ static const source_map_t cl_resource[] = {"erf_cl", erf_cl}, {"floordiv_cl", floordiv_cl}, {"gather_cl", gather_cl}, + {"gather_array_cl", gather_array_cl}, {"gather_batch_cl", gather_batch_cl}, {"gather_elements_cl", gather_elements_cl}, {"gather_nd_cl", gather_nd_cl}, {"gather_nd_3d_cl", gather_nd_3d_cl}, + {"gather_nd_batch_cl", gather_nd_batch_cl}, + {"globallppool_cl", globallppool_cl}, {"group_normalization_f32_cl", group_normalization_f32_cl}, {"group_normalization_i32_cl", group_normalization_i32_cl}, {"group_normalization_u8_cl", group_normalization_u8_cl}, @@ -61682,6 +66159,7 @@ static const source_map_t cl_resource[] = {"instance_normalization_f32_cl", instance_normalization_f32_cl}, {"instance_normalization_i32_cl", instance_normalization_i32_cl}, {"instance_normalization_u8_cl", instance_normalization_u8_cl}, + {"l1norm_cl", l1norm_cl}, {"l2normalizescale_axis0_cl", l2normalizescale_axis0_cl}, {"l2normalizescale_axis1_cl", l2normalizescale_axis1_cl}, {"layer_normalization_cl", layer_normalization_cl}, @@ -61718,6 +66196,7 @@ static const source_map_t cl_resource[] = {"maximum_cl", maximum_cl}, {"maxpoolwithargmax_cl", maxpoolwithargmax_cl}, 
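+    /* Note: each new kernel source string added in this change (e.g.
+       maxunpool_cl below) must also be registered in this name-to-source
+       map, which the CL kernel loader presumably uses to resolve program
+       source by resource name. */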
{"maxpoolwithargmax_2d_cl", maxpoolwithargmax_2d_cl}, + {"maxunpool_cl", maxunpool_cl}, {"minimum_cl", minimum_cl}, {"mod_cl", mod_cl}, {"moments_axis0_cl", moments_axis0_cl}, @@ -61752,6 +66231,7 @@ static const source_map_t cl_resource[] = {"resize_1d_nearest_cl", resize_1d_nearest_cl}, {"resize_bilinear_cl", resize_bilinear_cl}, {"resize_nearest_cl", resize_nearest_cl}, + {"reversesequence_cl", reversesequence_cl}, {"roi_align_cl", roi_align_cl}, {"scatter_elements_cl", scatter_elements_cl}, {"scatter_elements_add_cl", scatter_elements_add_cl}, @@ -61767,6 +66247,7 @@ static const source_map_t cl_resource[] = {"tile_cl", tile_cl}, {"topk_cl", topk_cl}, {"topk_odd_even_sort_cl", topk_odd_even_sort_cl}, + {"topk_odd_even_sort2_cl", topk_odd_even_sort2_cl}, {"upsample_cl", upsample_cl}, }; diff --git a/src/tim/vx/internal/src/makefile.linux b/src/tim/vx/internal/src/makefile.linux index e06adcf..eace064 100644 --- a/src/tim/vx/internal/src/makefile.linux +++ b/src/tim/vx/internal/src/makefile.linux @@ -32,7 +32,6 @@ OBJECTS += $(OBJ_DIR)/vsi_nn_code_generator.o \ $(OBJ_DIR)/vsi_nn_shape_util.o \ $(OBJ_DIR)/vsi_nn_dtype.o \ $(OBJ_DIR)/vsi_nn_limits.o \ - $(OBJ_DIR)/vsi_nn_vdata.o \ $(OBJ_DIR)/vsi_nn_util.o \ $(OBJ_DIR)/vsi_nn_dlfcn.o \ $(OBJ_DIR)/vsi_nn_constraint_check.o \ @@ -44,9 +43,6 @@ OBJECTS += $(OBJ_DIR)/vsi_nn_dynamic_fixed_point.o \ $(OBJ_DIR)/vsi_nn_asymmetric_affine.o \ $(OBJ_DIR)/vsi_nn_perchannel_symmetric_affine.o -vpath %.c pycc -OBJECTS += $(OBJ_DIR)/vsi_pycc_interface.o - vpath %.c post OBJECTS += $(OBJ_DIR)/vsi_nn_post_fasterrcnn.o \ $(OBJ_DIR)/vsi_nn_post_cmupose.o @@ -115,7 +111,8 @@ INCLUDES=-I. -I$(VIVANTE_SDK_DIR)/include/ \ -I$(VIVANTE_SDK_DIR)/include/VX \ -I../include/ops -I../include/utils -I../include/inference \ -I../include/client -I../include -I../include/libnnext \ - -I../include/cpu_backend + -I../include/cpu_backend \ + -I../src ifeq (1,$(DEBUG)) CFLAGS+=-g @@ -202,6 +199,7 @@ INCLUDE += -I$(VIVANTE_SDK_INC) -I$(VIVANTE_SDK_INC)/HAL -I$(AQROOT)/sdk/inc INCLUDE += -I../include/ops -I../include/utils -I../include/inference INCLUDE += -I../include/client -I../include -I../include/libnnext INCLUDE += -I../include/cpu_backend +INCLUDE += -I../src CFLAGS += $(INCLUDE) CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Werror diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c index 46e689c..078d708 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c @@ -33,6 +33,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_kernel_prv.h" static int32_t _get_input_num ( @@ -126,11 +127,15 @@ static vsi_bool op_setup vsi_nn_tensor_attr_t attr; vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_tensor_t* temp_output_tensor = NULL; + vsi_bool is_sp_supported = vx_false_e; uint32_t input_num = 0; vsi_nn_internal_init_node_wksp( self ); input_num = _get_input_num(self, inputs); + + is_sp_supported = vsi_nn_is_sp_supported_broadcast(self->graph, inputs, input_num, outputs[0]); + for(i = 0; i < input_num -1; i++) { /* loop call add for input_num -1 times */ @@ -148,16 +153,18 @@ static vsi_bool op_setup curr->inputs[1] = inputs[i+1]; /* setup output for each add */ - if(i < input_num - 2) + if (i < input_num - 2) { memset(&attr, 0, sizeof(attr)); attr.dim_num = VSI_NN_DIM_AUTO; attr.vtl = TRUE; attr.is_const = FALSE; - if (VSI_NN_TYPE_INT32 == outputs[0]->attr.dtype.vx_type){ + 
if (VSI_NN_TYPE_INT32 == outputs[0]->attr.dtype.vx_type) + { attr.dtype.vx_type = VSI_NN_TYPE_INT32; } - else if(_is_float32_data_format( self, inputs, outputs )) + else if ( _is_float32_data_format( self, inputs, outputs ) || + is_sp_supported ) { attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c index 0b76c95..d75a10a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c @@ -120,12 +120,14 @@ static vsi_bool _argmaxmin_op_setup ) { int32_t axis = 0; + vsi_bool keep_dims = FALSE; vsi_bool ret = TRUE; if (strcmp(kernel_name, "argmax") == 0) { vsi_nn_argmax_param * p = &(self->nn_param.argmax); axis = p->axis; + keep_dims = p->keep_dims; if (axis < 0) { @@ -137,6 +139,7 @@ static vsi_bool _argmaxmin_op_setup { vsi_nn_argmin_param * p = &(self->nn_param.argmin); axis = p->axis; + keep_dims = p->keep_dims; if (axis < 0) { @@ -145,19 +148,31 @@ static vsi_bool _argmaxmin_op_setup } } - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { uint32_t i = 0; - outputs[0]->attr.dim_num = inputs[0]->attr.dim_num - 1; + uint32_t i_rank = inputs[0]->attr.dim_num; + uint32_t o_rank = keep_dims ? i_rank : i_rank - 1; + int8_t is_scalar = o_rank == 0; - for (i = 0; i < (uint32_t)axis; i++) + outputs[0]->attr.dim_num = is_scalar ? 1 : o_rank; + vsi_nn_SetTensorIsScalar(outputs[0], is_scalar); + + for (i = 0; i < inputs[0]->attr.dim_num; i++) { - outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + outputs[0]->attr.size[i] = is_scalar ? 1 : inputs[0]->attr.size[i]; } - for (i = axis; i < outputs[0]->attr.dim_num; i++) + if (keep_dims) { - outputs[0]->attr.size[i] = inputs[0]->attr.size[i + 1]; + outputs[0]->attr.size[(uint32_t)axis] = 1; + } + else + { + for (i = axis; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = is_scalar ? 
1 : inputs[0]->attr.size[i + 1]; + } } if (inputs[0]->attr.dim_num == 1) @@ -179,18 +194,17 @@ static vsi_status _argmaxmin_op_init { vsi_status status = VSI_SUCCESS; - if (vsi_nn_compareVersion(self->graph, 1, 1, 11) == -1) + if (strcmp(kernel_name, "argmax") == 0) { - if (strcmp(kernel_name, "argmax") == 0) - { - vsi_nn_argmax_param * p = &(self->nn_param.argmax); - p->axis = 2; - } - else - { - vsi_nn_argmin_param * p = &(self->nn_param.argmin); - p->axis = 2; - } + vsi_nn_argmax_param* p = &(self->nn_param.argmax); + p->axis = 2; + p->keep_dims = FALSE; + } + else + { + vsi_nn_argmin_param* p = &(self->nn_param.argmin); + p->axis = 2; + p->keep_dims = FALSE; } return status; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_avg_pool3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_avg_pool3d.c new file mode 100644 index 0000000..3ca90a6 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_avg_pool3d.c @@ -0,0 +1,284 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _avg_pool3d_local_data_t { + int32_t placeholder; +} avg_pool3d_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t* param = NULL; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + uint32_t i = 0; + uint32_t new_rank = 0; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + int32_t ksize_x = (int32_t)self->nn_param.avg_pool3d.ksize[0]; + int32_t ksize_y = (int32_t)self->nn_param.avg_pool3d.ksize[1]; + int32_t ksize_z = (int32_t)self->nn_param.avg_pool3d.ksize[2]; + int32_t pad_left = (int32_t)self->nn_param.avg_pool3d.pad[0]; + int32_t pad_right = (int32_t)self->nn_param.avg_pool3d.pad[1]; + int32_t pad_top = (int32_t)self->nn_param.avg_pool3d.pad[2]; + int32_t pad_bottom = (int32_t)self->nn_param.avg_pool3d.pad[3]; + int32_t pad_front = (int32_t)self->nn_param.avg_pool3d.pad[4]; + int32_t pad_end = (int32_t)self->nn_param.avg_pool3d.pad[5]; + int32_t stride_x = (int32_t)self->nn_param.avg_pool3d.stride[0]; + int32_t stride_y = (int32_t)self->nn_param.avg_pool3d.stride[1]; + int32_t stride_z = (int32_t)self->nn_param.avg_pool3d.stride[2]; + int32_t depth_in = (int32_t)inputs[0]->attr.size[2]; + int32_t depth_out = (int32_t)outputs[0]->attr.size[2]; + int32_t count_include_pad = (int32_t)self->nn_param.avg_pool3d.count_include_pad; + new_rank = 3; + + shapes[0][0] = inputs[0]->attr.size[0]; + shapes[0][1] = inputs[0]->attr.size[1]; + shapes[0][2] = inputs[0]->attr.size[2]; + shapes[1][0] = outputs[0]->attr.size[0]; + shapes[1][1] = outputs[0]->attr.size[1]; + shapes[1][2] = outputs[0]->attr.size[2]; + + for (i = 3; i < inputs[0]->attr.dim_num; i++) + { + shapes[0][2] = shapes[0][2] * inputs[0]->attr.size[i]; + shapes[1][2] = shapes[1][2] * outputs[0]->attr.size[i]; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + outputs[0], shapes[1], new_rank ); + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32(param, "ksize_x", ksize_x); + vsi_nn_kernel_param_add_int32(param, "ksize_y", ksize_y); + vsi_nn_kernel_param_add_int32(param, "ksize_z", ksize_z); + vsi_nn_kernel_param_add_int32(param, "pad_left", pad_left); + vsi_nn_kernel_param_add_int32(param, "pad_right", pad_right); + vsi_nn_kernel_param_add_int32(param, "pad_top", pad_top); + vsi_nn_kernel_param_add_int32(param, "pad_bottom", pad_bottom); + vsi_nn_kernel_param_add_int32(param, "pad_front", pad_front); + vsi_nn_kernel_param_add_int32(param, "pad_end", pad_end); + vsi_nn_kernel_param_add_int32(param, "stride_x", stride_x); + vsi_nn_kernel_param_add_int32(param, "stride_y", stride_y); + vsi_nn_kernel_param_add_int32(param, "stride_z", stride_z); + vsi_nn_kernel_param_add_int32(param, "depth_in", depth_in); + vsi_nn_kernel_param_add_int32(param, "depth_out", depth_out); + vsi_nn_kernel_param_add_int32(param, "count_include_pad", count_include_pad); + + self->n = (vx_node)vsi_nn_kernel_selector(self->graph,"avg_pool3d", + &reshape_tensors[0],_INPUT_NUM,&reshape_tensors[1],_OUTPUT_NUM,param); + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release(¶m); + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(AVG_POOL3D, 1, 1) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + 
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F32, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_SYM, D_F32) + IO_TYPE(D_I16|Q_DFP, D_F32) + IO_TYPE(D_I8|Q_DFP, D_F32) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_F32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + END_IO_TYPE_DECL(AVG_POOL3D) + + if (!VALIDATE_OP_IO_TYPES( + AVG_POOL3D, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; +} /* op_check() */ + + + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_size_t ksize[_cnt_of_array(self->nn_param.avg_pool3d.ksize)] = {0}; + vsi_size_t i = 0; + vsi_size_t pad[_cnt_of_array(self->nn_param.avg_pool3d.pad)] = {0}; + + for (i = 0; i < _cnt_of_array(self->nn_param.avg_pool3d.ksize); i++) + { + ksize[i] = self->nn_param.avg_pool3d.ksize[i]; + } + for (i = 0; i < _cnt_of_array(self->nn_param.avg_pool3d.pad); i++) + { + pad[i] = self->nn_param.avg_pool3d.pad[i]; + } + + vsi_nn_compute_padding_3d( + inputs[0]->attr.size, + ksize, + self->nn_param.avg_pool3d.stride, + NULL, + self->nn_param.avg_pool3d.pad_type, + pad + ); + for (i = 0; i < _cnt_of_array(self->nn_param.avg_pool3d.ksize); i++) + { + self->nn_param.avg_pool3d.ksize[i] = (uint32_t)ksize[i]; + } + for (i = 0; i < _cnt_of_array(self->nn_param.avg_pool3d.pad); i++) + { + self->nn_param.avg_pool3d.pad[i] = (uint32_t)pad[i]; + } + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + self->nn_param.avg_pool3d.ksize[0], + &self->nn_param.avg_pool3d.pad[0], + self->nn_param.avg_pool3d.stride[0], + 0, + self->nn_param.avg_pool3d.round_type + ); + outputs[0]->attr.size[1] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[1], + self->nn_param.avg_pool3d.ksize[1], + &self->nn_param.avg_pool3d.pad[2], + self->nn_param.avg_pool3d.stride[1], + 0, + self->nn_param.avg_pool3d.round_type + ); + outputs[0]->attr.size[2] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[2], + self->nn_param.avg_pool3d.ksize[2], + &self->nn_param.avg_pool3d.pad[4], + self->nn_param.avg_pool3d.stride[2], + 0, + self->nn_param.avg_pool3d.round_type + ); + for (i = 3; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + return TRUE; +} /* op_setup() */ + + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_op_common_deinit(self); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ AVG_POOL3D, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ 
_OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c index b0eea1f..d1ca746 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c @@ -83,7 +83,7 @@ static vsi_status _try_set_high_presision_tensor return status; } -static vsi_bool _is_3d_batchnorm +static vsi_bool _require_reshape ( vsi_nn_node_t * self, vsi_nn_tensor_t ** inputs @@ -96,17 +96,7 @@ static vsi_bool _is_3d_batchnorm { return FALSE; } - else - { - if ( 3 == inputs[0]->attr.dim_num ) - { - return TRUE; - } - else - { - return FALSE; - } - } + return (3 == inputs[0]->attr.dim_num)||(5 == inputs[0]->attr.dim_num); } static vsi_bool _is_dynamic_batchnorm @@ -141,7 +131,7 @@ static vsi_status _static_batchnorm VSILOGE("Set tensor attr of high presision fail"); return status; } - if(_is_3d_batchnorm(self, inputs)) + if(_require_reshape(self, inputs)) { reshape_tensors[0] = self->nn_param.batch_norm.local->reshaped_input; reshape_tensors[5] = self->nn_param.batch_norm.local->reshaped_output; @@ -175,6 +165,39 @@ static vsi_status _static_batchnorm return status; } +static void _expand_dims_for_param + ( + vsi_nn_tensor_attr_t* attr, + uint32_t dim_num, + vsi_size_t* shapes_expand + ) +{ + uint32_t i = 0; + + if (attr->dim_num == 1 && dim_num > 2) + { + /* [C] reshape to [1, 1, C, 1] */ + for (i = 0; i < dim_num; i++) + { + if (i == dim_num - 2) + { + shapes_expand[i] = attr->size[0]; + } + else + { + shapes_expand[i] = 1; + } + } + } + else + { + for (i = 0; i < attr->dim_num; i++) + { + shapes_expand[i] = attr->size[i]; + } + } +} + static vsi_status _dynamic_batchnorm ( vsi_nn_node_t * self, @@ -185,7 +208,8 @@ static vsi_status _dynamic_batchnorm vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_size_t shapes[4][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; - vsi_size_t* shapes_ptr[4] = {NULL}; + vsi_size_t shapes_expand[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; + vsi_size_t *shapes_ptr[4] = {NULL}; vsi_size_t *shapes_in[3] = {NULL}; vsi_size_t rank_in[3] = {0}; uint32_t new_rank = 0; @@ -197,11 +221,16 @@ static vsi_status _dynamic_batchnorm vsi_nn_kernel_param_add_float32( param, "eps", self->nn_param.batch_norm.eps ); rank_in[0] = (vsi_size_t)inputs[0]->attr.dim_num; - rank_in[1] = (vsi_size_t)inputs[1]->attr.dim_num; - rank_in[2] = (vsi_size_t)inputs[3]->attr.dim_num; + rank_in[1] = (vsi_size_t)inputs[0]->attr.dim_num; + rank_in[2] = (vsi_size_t)inputs[0]->attr.dim_num; + + /* [C] reshape to [1, 1, C, 1] if need */ + _expand_dims_for_param(&(inputs[1]->attr), inputs[0]->attr.dim_num, shapes_expand[0]); + _expand_dims_for_param(&(inputs[3]->attr), inputs[0]->attr.dim_num, shapes_expand[1]); + shapes_in[0] = inputs[0]->attr.size; - shapes_in[1] = inputs[1]->attr.size; - shapes_in[2] = inputs[3]->attr.size; + shapes_in[1] = shapes_expand[0]; + shapes_in[2] = shapes_expand[1]; for (i = 0; i < 4; i++) { shapes_ptr[i] = shapes[i]; @@ -298,7 +327,7 @@ static vsi_status op_optimize char tensor_name[128]; dim = inputs[0]->attr.dim_num; - if(_is_3d_batchnorm(self, inputs) == FALSE) + if(_require_reshape(self, inputs) == FALSE) { return VSI_SUCCESS; } @@ -308,11 +337,21 @@ static vsi_status op_optimize reshape 3d input (xcn) --> 4d input (whcn) reshape 3d output(xcn) --> 4d output(whcn) */ - shape[0] = inputs[0]->attr.size[0]; - shape[1] = 1; - shape[2] = inputs[0]->attr.size[1]; - shape[3] = inputs[0]->attr.size[2]; dim = 4; + if (3 == inputs[0]->attr.dim_num) + { + 
shape[0] = inputs[0]->attr.size[0]; + shape[1] = 1; + shape[2] = inputs[0]->attr.size[1]; + shape[3] = inputs[0]->attr.size[2]; + } + else if (5 == inputs[0]->attr.dim_num) + { + shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1]; + shape[1] = inputs[0]->attr.size[2]; + shape[2] = inputs[0]->attr.size[3]; + shape[3] = inputs[0]->attr.size[4]; + } local = self->nn_param.batch_norm.local; if (VSI_NN_OPTIMIZE_BACKWARD == direction) { @@ -470,7 +509,7 @@ static vsi_bool op_setup VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); } - if(_is_3d_batchnorm(self, inputs)) + if(_require_reshape(self, inputs)) { local = (vsi_nn_batcnnorm_lcl_data *)malloc(sizeof(vsi_nn_batcnnorm_lcl_data)); if(NULL == local) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c index 19bc82f..c122de7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c @@ -410,7 +410,6 @@ static vsi_bool op_setup vsi_nn_tensor_t* rnncell_out1 = NULL; /* rnncell output */ - if(curr_param->merge_outputs) { vsi_nn_internal_init_tensor_attr(&attr, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bilinear_grid_sample.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bilinear_grid_sample.c new file mode 100644 index 0000000..c664a3c --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bilinear_grid_sample.c @@ -0,0 +1,161 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _bilinear_grid_sample_local_data_t { + int32_t placeholder; +} bilinear_grid_sample_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + + vsi_nn_kernel_param_t* param = NULL; + int32_t align_corners = self->nn_param.bilinear_grid_sample.align_corners; + vsi_nn_kernel_node_t n; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32(param, "align_corners", align_corners); + n = vsi_nn_kernel_selector( + self->graph, "bilinear_grid_sample", inputs, 2, outputs, 1, param); + if (n == NULL) { + vsi_nn_kernel_param_release(¶m); + status = VSI_FAILURE; + return status; + } + self->n = (vx_node)n; + vsi_nn_kernel_param_release(¶m); + if (self->n) { + status = VSI_SUCCESS; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if (NULL == self) { + return FALSE; + } + + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = inputs[1]->attr.size[1]; + outputs[0]->attr.size[1] = inputs[1]->attr.size[2]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + if (4 == inputs[0]->attr.dim_num) { + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + /* TODO + //self->nn_param.bilinear_grid_sample.local = \ + // (bilinear_grid_sample_local_data_t*)malloc(sizeof(bilinear_grid_sample_local_data_t)); + */ + + return VSI_SUCCESS; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_op_common_deinit(self); + + /* TODO + //vsi_nn_safe_free(self->nn_param.bilinear_grid_sample.local); + */ + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ BILINEAR_GRID_SAMPLE, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c index 4f55660..3e1db0e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c @@ -37,7 +37,6 @@ #include "vsi_nn_tensor_util.h" #include "kernel/vsi_nn_kernel.h" #include "vsi_nn_internal_node.h" -#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" #include "utils/vsi_nn_constraint_check.h" #include "utils/vsi_nn_dtype_util.h" @@ -62,46 +61,17 @@ static vsi_status op_compute } else { - vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; - vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; - vsi_size_t new_rank = 0; - vsi_bool ret = TRUE; vsi_nn_kernel_param_t * param = NULL; param = vsi_nn_kernel_param_create(); - if ( vsi_nn_TypeGetBits(inputs[0]->attr.dtype.vx_type) == 4 || - vsi_nn_TypeGetBits(outputs[0]->attr.dtype.vx_type) == 4 ) - { - new_rank = inputs[0]->attr.dim_num; - memcpy(shape, inputs[0]->attr.size, sizeof(inputs[0]->attr.size)); - } - else - { - ret = vsi_nn_kernel_optimize_element_shape( - inputs[0]->attr.size, inputs[0]->attr.dim_num, - shape, 
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c
index 4f55660..3e1db0e 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c
@@ -37,7 +37,6 @@
 #include "vsi_nn_tensor_util.h"
 #include "kernel/vsi_nn_kernel.h"
 #include "vsi_nn_internal_node.h"
-#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
 #include "utils/vsi_nn_constraint_check.h"
 #include "utils/vsi_nn_dtype_util.h"
 
@@ -62,46 +61,17 @@ static vsi_status op_compute
     }
     else
     {
-        vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
-        vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
-        vsi_size_t new_rank = 0;
-        vsi_bool ret = TRUE;
         vsi_nn_kernel_param_t * param = NULL;
 
         param = vsi_nn_kernel_param_create();
 
-        if ( vsi_nn_TypeGetBits(inputs[0]->attr.dtype.vx_type) == 4 ||
-             vsi_nn_TypeGetBits(outputs[0]->attr.dtype.vx_type) == 4 )
-        {
-            new_rank = inputs[0]->attr.dim_num;
-            memcpy(shape, inputs[0]->attr.size, sizeof(inputs[0]->attr.size));
-        }
-        else
-        {
-            ret = vsi_nn_kernel_optimize_element_shape(
-                    inputs[0]->attr.size, inputs[0]->attr.dim_num,
-                    shape, &new_rank );
-        }
-
-
         vsi_nn_kernel_param_add_float32( param, "min_value", min_value );
         vsi_nn_kernel_param_add_float32( param, "max_value", max_value );
 
-        if ( ret )
-        {
-            reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph,
-                    inputs[0], shape, new_rank );
-            reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
-                    outputs[0], shape, new_rank );
-
-            self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
-                    "clip",
-                    &reshape_tensors[0], 1,
-                    &reshape_tensors[1], 1, param );
-
-            vsi_safe_release_tensor( reshape_tensors[0] );
-            vsi_safe_release_tensor( reshape_tensors[1] );
-        }
+        self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
+                "clip",
+                inputs, 1,
+                outputs, 1, param );
 
         if ( self->n )
         {
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_common.c b/src/tim/vx/internal/src/ops/vsi_nn_op_common.c
index 1315bd7..354b6ce 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_common.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_common.c
@@ -29,6 +29,7 @@
 #include "vsi_nn_node.h"
 #include "vsi_nn_ops.h"
 #include "vsi_nn_tensor.h"
+#include "vsi_nn_tensor_util.h"
 
 vsi_status vsi_nn_op_common_compute
     (
@@ -63,9 +64,10 @@ vsi_bool vsi_nn_op_common_setup
     vsi_nn_tensor_t ** outputs
     )
 {
-    if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+    if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
     {
         outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
+        vsi_nn_SetTensorIsScalar(outputs[0], vsi_nn_GetTensorIsScalar(inputs[0]));
         memcpy( outputs[0]->attr.size, inputs[0]->attr.size,
             VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) );
     }
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c
index 8b8f058..bb1be6e 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c
@@ -59,6 +59,24 @@ static int32_t _get_input_num
     return num;
 }
 
+static vsi_bool _has_norm_input
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs
+    )
+{
+    uint32_t i,num;
+    num = _get_input_num(self, inputs);
+    for(i = 0; i < num; i++)
+    {
+        if(inputs[i]->attr.vtl == FALSE && inputs[i]->attr.is_const == FALSE)
+        {
+            return TRUE;
+        }
+    }
+    return FALSE;
+} /* _has_norm_input() */
+
 static vsi_bool _is_same_quant
     (
     vsi_nn_node_t * self,
@@ -99,21 +117,25 @@ static vsi_bool _is_same_quant
     return TRUE;
 } /* _is_same_quant */
 
-static vsi_bool _is_highest_dimension
+static vsi_bool _is_tensorview_support
     (
     vsi_nn_node_t * self,
     vsi_nn_tensor_t ** outputs
     )
 {
     vsi_bool ret = FALSE;
-    uint32_t axis = self->nn_param.concat.axis;
-    uint32_t dim = outputs[0]->attr.dim_num;
+#ifdef VSI_CONCAT_ENHANCE_SUPPORT
+    // Driver support concat optimize in all dimensions.
+    ret = TRUE;
+#else
     /*
         If the concat op need to be optimized to tensor view, the memory must be continues.
         1. axis is in the highest dimension
        2.
the highest dimension is 1, and axis is in the second highest dimension */ + uint32_t axis = self->nn_param.concat.axis; + uint32_t dim = outputs[0]->attr.dim_num; if(axis == dim - 1) { ret = TRUE; @@ -122,8 +144,9 @@ static vsi_bool _is_highest_dimension { ret = TRUE; } +#endif return ret; -} /* _is_highest_dimension() */ +} /* _is_tensorview_support() */ static vsi_status copy_tensor_to_view ( @@ -244,7 +267,9 @@ static vsi_status op_compute status = VSI_SUCCESS; self->n = NULL; - if(_is_highest_dimension(self, outputs) && _is_same_quant(self, inputs, outputs) + if(_is_tensorview_support(self, outputs) + && _is_same_quant(self, inputs, outputs) + && (_has_norm_input(self, inputs) == FALSE) && self->graph->ctx->options.enable_concat_optimize) { iter = self->nn_param.concat.lcl_data; @@ -398,8 +423,9 @@ static vsi_status op_optimize status = VSI_SUCCESS; /* we don't create tensor view if the axis is not the highest dimension */ - if (_is_highest_dimension(self, outputs) == FALSE || + if (_is_tensorview_support(self, outputs) == FALSE || _is_same_quant(self, inputs, outputs) == FALSE || + _has_norm_input(self, inputs) == TRUE || self->graph->ctx->options.enable_concat_optimize == 0) { return status; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c index 3b2cf21..f07a690 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c @@ -170,174 +170,89 @@ static vsi_bool op_check { vsi_bool ret = FALSE; - BEGIN_IO_TYPE_DECL(CONV1D, 3, 1) - IO_TYPE(D_F16, D_F16, D_NONE, D_F16) - IO_TYPE(D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_F16, D_F16, D_F16) - IO_TYPE(D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F32, D_BF16) - IO_TYPE(D_F32, D_F32, D_NONE, D_F32) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) - IO_TYPE(D_BF16, D_BF16, D_NONE, D_BF16) + BEGIN_IO_TYPE_DECL(CONV1D, 2, 0) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16) /* HW 9.0.1 */ - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) - 
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I4|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM_PC) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) - 
IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_U8|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_U8|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_F32) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_F32) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_BF16) - 
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_F32) + IO_TYPE(D_I16|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I16|Q_ASYM, D_U8|Q_SYM_PC) + IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I16|Q_SYM, D_U8|Q_SYM_PC) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_F16, D_F16, D_NONE, D_BF16) - IO_TYPE(D_F16, D_F16, D_NONE, D_F32) - IO_TYPE(D_F16, D_F16, D_F32, D_BF16) - IO_TYPE(D_F16, D_F16, D_F32, D_F32) - - IO_TYPE(D_BF16, D_BF16, D_NONE, D_F16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_F16) - - IO_TYPE(D_F32, D_BF16, D_NONE, D_F16) - IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) - IO_TYPE(D_F32, D_BF16, D_NONE, D_F32) - IO_TYPE(D_F32, D_BF16, D_F32, D_F16) - IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) - IO_TYPE(D_F32, D_BF16, D_F32, D_F32) + /* HW 9.2 */ + IO_TYPE(D_U16|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_U16|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U16|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_U16|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U16|Q_ASYM) + IO_TYPE(D_U16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_U16|Q_ASYM, D_I16|Q_SYM) + IO_TYPE(D_U16|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U16|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_U16|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_U16|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_U16|Q_ASYM, D_I4|Q_SYM) + IO_TYPE(D_U16|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U16|Q_ASYM, D_U4|Q_SYM) + IO_TYPE(D_U16|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_U16|Q_ASYM, D_U8|Q_SYM_PC) + IO_TYPE(D_U16|Q_ASYM, D_I16|Q_SYM_PC) END_IO_TYPE_DECL(CONV1D) - if (!VALIDATE_OP_IO_TYPES(CONV1D, self, inputs, self->input.num, outputs, self->output.num)) + if (!VALIDATE_OP_IO_TYPES(CONV1D, self, inputs, 2, outputs, 0)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c index 228b586..c82f15f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c @@ -89,29 +89,22 @@ static vsi_bool op_check ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); if (ret) { - vsi_size_t kx = 1; - vsi_size_t ky = 1; /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(CONV2D, 2, 0) /* IO_TYPE(INPUT, WEIGHT) */ - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F16, D_F16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_BF16, D_BF16) IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC) - /* HW 9.0 */ - IO_TYPE(D_F32, D_BF16) /* HW 9.0.1 */ - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) @@ -163,6 +156,30 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM_PC) IO_TYPE(D_I16|Q_SYM, D_U8|Q_SYM_PC) + /* HW 9.2 */ + IO_TYPE(D_U16|Q_ASYM, D_U16|Q_ASYM) + 
IO_TYPE(D_I16|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_U16|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U16|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_U16|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U16|Q_ASYM) + IO_TYPE(D_U16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_U16|Q_ASYM, D_I16|Q_SYM) + IO_TYPE(D_U16|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U16|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_U16|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_U16|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_U16|Q_ASYM, D_I4|Q_SYM) + IO_TYPE(D_U16|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U16|Q_ASYM, D_U4|Q_SYM) + IO_TYPE(D_U16|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_U16|Q_ASYM, D_U8|Q_SYM_PC) + IO_TYPE(D_U16|Q_ASYM, D_I16|Q_SYM_PC) + END_IO_TYPE_DECL(CONV2D) ret = VALIDATE_OP_IO_TYPES(CONV2D, self, inputs, 2, outputs, 0); if (!ret) { @@ -173,13 +190,6 @@ static vsi_bool op_check return FALSE; } - /* check parameters */ - kx = inputs[1]->attr.size[0]; - ky = inputs[1]->attr.dim_num == 3 ? 1 : inputs[1]->attr.size[1]; - if (kx * ky > 6400) { - VSILOGE("Kernel size should <= 6400."); - return FALSE; - } } return ret; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c index 5af07e2..388de95 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c @@ -82,7 +82,19 @@ static vsi_nn_internal_tensor_t * create_input_conv memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if( input->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32 && + input->attr.dtype.vx_type != VSI_NN_TYPE_BFLOAT16 ) + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } + else if (input->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16) + { + attr.dtype.vx_type = VSI_NN_TYPE_BFLOAT16; + } + else + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + } attr.dim_num = VSI_NN_DIM_AUTO; attr.vtl = TRUE; attr.is_const = FALSE; @@ -147,7 +159,19 @@ static vsi_nn_internal_tensor_t * create_recurrent_conv bias = internal_bias->t; attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if( input->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32 && + input->attr.dtype.vx_type != VSI_NN_TYPE_BFLOAT16 ) + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } + else if (input->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16) + { + attr.dtype.vx_type = VSI_NN_TYPE_BFLOAT16; + } + else + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + } attr.dim_num = VSI_NN_DIM_AUTO; attr.vtl = TRUE; attr.is_const = FALSE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c index 2b470b1..1825e3b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c @@ -258,12 +258,6 @@ static vsi_bool op_check destroy_op_io_types_desc(desc); return FALSE; } - - /* check parameters */ - if(inputs[1]->attr.size[0] * inputs[1]->attr.size[1] > 6400) { - VSILOGE("Kernel size should <= 6400."); - return FALSE; - } } return TRUE; } /* op_check() */ @@ -395,4 +389,4 @@ DEF_OP_REG /* output_num */ _OUTPUT_NUM ); -__END_DECLS \ No newline at end of file +__END_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c index 48b43b5..aef5a68 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c
@@ -23,6 +23,7 @@
 *****************************************************************************/
 #include <string.h>
 #include "vsi_nn_types.h"
+#include "vsi_nn_types_prv.h"
 #include "vsi_nn_platform.h"
 #include "vsi_nn_prv.h"
 #include "vsi_nn_log.h"
@@ -171,7 +172,7 @@ static vsi_status op_optimize
 {
     vsi_status status;
     vsi_bool ret;
-    vsi_nn_tensor_t conv_out, *pconv_out;
+    vsi_nn_tensor_prv_t conv_out, *pconv_out;
     vx_nn_convolution_relu_pooling_params_ext2_t p;
     vx_weights_biases_parameter_optimizations_t opt;
     vx_weights_biases_parameter_optimizations_t * p_opt;
@@ -184,10 +185,10 @@
     }
 
     VSILOGD("Optimize %s", vsi_nn_OpGetName(self->op));
-    memset(&conv_out, 0, sizeof(vsi_nn_tensor_t));
+    memset(&conv_out, 0, sizeof(vsi_nn_tensor_prv_t));
     pconv_out = &conv_out;
-    ret = vsi_nn_OpSetup( VSI_NN_OP_CONV2D, self, inputs, &pconv_out );
+    ret = vsi_nn_OpSetup( VSI_NN_OP_CONV2D, self, inputs, (vsi_nn_tensor_t**)(&pconv_out) );
     if(ret == FALSE)
     {
         VSILOGE("OpSetup [VSI_NN_OP_CONV2D] fail\n");
@@ -223,7 +224,7 @@
         for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
         {
             size_input0[i] = (vx_size)inputs[0]->attr.size[i];
-            size_pconv_out[i] = (vx_size)pconv_out->attr.size[i];
+            size_pconv_out[i] = (vx_size)pconv_out->pot.attr.size[i];
             size_output0[i] = (vx_size)outputs[0]->attr.size[i];
         }
         inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2(
@@ -248,7 +249,7 @@
         for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
         {
             size_u32_input0[i] = (uint32_t)inputs[0]->attr.size[i];
-            size_u32_pconv_out[i] = (uint32_t)pconv_out->attr.size[i];
+            size_u32_pconv_out[i] = (uint32_t)pconv_out->pot.attr.size[i];
             size_u32_output0[i] = (uint32_t)outputs[0]->attr.size[i];
         }
         inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2(
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c
index 8d8ff5f..d1a7785 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c
@@ -33,6 +33,7 @@
 #include "utils/vsi_nn_util.h"
 #include "utils/vsi_nn_dtype_util.h"
 #include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_tensor_util_prv.h"
 
 static vsi_status op_compute
     (
@@ -262,6 +263,8 @@ static vsi_bool op_check
         IO_TYPE(D_BF16, D_F16)
         IO_TYPE(D_BF16, D_F32)
         IO_TYPE(D_I32, D_I32)
+        IO_TYPE(D_I32, D_F32)
+        IO_TYPE(D_I32, D_F16)
         IO_TYPE(D_I32, D_I16|Q_DFP)
         IO_TYPE(D_I32, D_I8|Q_DFP)
         IO_TYPE(D_I32, D_U32)
@@ -283,6 +286,8 @@
         IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM)
         IO_TYPE(D_U8|Q_ASYM, D_I4|Q_ASYM)
         IO_TYPE(D_U8|Q_ASYM, D_I4|Q_SYM)
+        IO_TYPE(D_U8|Q_ASYM, D_U16|Q_ASYM)
+        IO_TYPE(D_U16|Q_ASYM, D_U8|Q_ASYM)
 
         /* HW 9.0.1 */
         IO_TYPE(D_I8|Q_DFP, D_BF16)
@@ -293,7 +298,6 @@
         IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM)
         IO_TYPE(D_I16|Q_DFP, D_BF16)
         IO_TYPE(D_I16|Q_DFP, D_F32)
-        IO_TYPE(D_F16, D_F32)
 
         /* HW 9.1.1 */
         IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM)
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c
index aea2f63..a7bc5d1 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c
@@ -37,6 +37,7 @@
 #include "kernel/vsi_nn_kernel.h"
 #include "kernel/vsi_nn_kernel_eltwise.h"
 #include "utils/vsi_nn_constraint_check.h"
+#include "vsi_nn_tensor_util_prv.h"
 
 vsi_bool vsi_nn_kernel_is_supported_types
     (
@@ -209,70 +210,76 @@ static vsi_bool op_check_minimum
     vsi_nn_tensor_t **
outputs ) { - /* check inputs outputs data type */ - BEGIN_IO_TYPE_DECL(MINIMUM, 2, 1) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) - IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_SYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16) - IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) - IO_TYPE(D_F16, D_F16, D_I8|Q_ASYM) - IO_TYPE(D_F16, D_I8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_F16, D_F16, D_I8|Q_SYM) - IO_TYPE(D_F16, D_I8|Q_SYM, D_F16) - IO_TYPE(D_F16, D_I8|Q_SYM, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_F16, D_F16) - IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_I8|Q_SYM) - IO_TYPE(D_F16, D_F16, D_I16|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) - IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_F16, D_F16, D_I16|Q_SYM) - IO_TYPE(D_F16, D_I16|Q_SYM, D_F16) - IO_TYPE(D_F16, D_I16|Q_SYM, D_I16|Q_SYM) - IO_TYPE(D_I16|Q_SYM, D_F16, D_F16) - IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM) - IO_TYPE(D_I16|Q_SYM, D_I16|Q_ASYM, D_I16|Q_SYM) - END_IO_TYPE_DECL(MINIMUM) - if(!VALIDATE_OP_IO_TYPES(MINIMUM, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; + vsi_bool ret = vsi_nn_is_stream_process_supported_types(self->graph, inputs, self->input.num); + + if (!ret) + { + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(MINIMUM, 2, 1) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, 
D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_SYM, D_F16) + IO_TYPE(D_F16, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_F16, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I16|Q_SYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_ASYM, D_I16|Q_SYM) + END_IO_TYPE_DECL(MINIMUM) + if (!VALIDATE_OP_IO_TYPES(MINIMUM, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } } return TRUE; @@ -285,73 +292,7 @@ static vsi_bool op_check_maximum vsi_nn_tensor_t ** outputs ) { - /* check inputs outputs data type */ - BEGIN_IO_TYPE_DECL(MAXIMUM, 2, 1) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) - IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_SYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16) - IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) - IO_TYPE(D_F16, D_F16, D_I8|Q_ASYM) - IO_TYPE(D_F16, D_I8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) - 
IO_TYPE(D_F16, D_F16, D_I8|Q_SYM) - IO_TYPE(D_F16, D_I8|Q_SYM, D_F16) - IO_TYPE(D_F16, D_I8|Q_SYM, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_F16, D_F16) - IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_I8|Q_SYM) - IO_TYPE(D_F16, D_F16, D_I16|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) - IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_F16, D_F16, D_I16|Q_SYM) - IO_TYPE(D_F16, D_I16|Q_SYM, D_F16) - IO_TYPE(D_F16, D_I16|Q_SYM, D_I16|Q_SYM) - IO_TYPE(D_I16|Q_SYM, D_F16, D_F16) - IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM) - IO_TYPE(D_I16|Q_SYM, D_I16|Q_ASYM, D_I16|Q_SYM) - END_IO_TYPE_DECL(MAXIMUM) - if(!VALIDATE_OP_IO_TYPES(MAXIMUM, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } - - return TRUE; + return vsi_nn_OpCheck(VSI_NN_OP_MINIMUM, self, inputs, outputs); } /* op_check() */ static vsi_bool op_check_pow @@ -437,145 +378,85 @@ static vsi_bool op_check_add vsi_nn_tensor_t ** outputs ) { - /* check inputs outputs data type */ - BEGIN_IO_TYPE_DECL(ADD, 2, 1) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_F32) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I16|Q_ASYM) - IO_TYPE(D_F16, D_F16, D_I16|Q_SYM) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I8|Q_ASYM) - IO_TYPE(D_F16, D_F16, D_I8|Q_SYM) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I16|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_I16|Q_SYM, D_F16) - IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_I8|Q_SYM, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_SYM, D_I16|Q_SYM) - IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_F16, D_I8|Q_SYM, D_I8|Q_SYM) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I16|Q_SYM, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I8|Q_SYM, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) - IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16) 
- IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F16) - IO_TYPE(D_F32, D_F16, D_F32) - IO_TYPE(D_F32, D_F16, D_F16) - IO_TYPE(D_F16, D_F32, D_F32) - IO_TYPE(D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_F16, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) - IO_TYPE(D_I16, D_I32, D_I32) - IO_TYPE(D_I32, D_I16, D_I32) - IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM) - IO_TYPE(D_I32, D_I32, D_I16|Q_DFP) - IO_TYPE(D_I32, D_I32, D_I8|Q_DFP) + vsi_bool ret = vsi_nn_is_stream_process_supported_types(self->graph, inputs, self->input.num); - /* HW 9.0.1 */ - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, F32) + if (!ret) + { + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(ADD, 2, 0) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_I16, D_I32) + IO_TYPE(D_I32, D_I16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, F32) + /* HW 9.0.1 */ + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, F32) + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I4|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, F32) + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, 
D_U4|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_F32) + IO_TYPE(D_U4|Q_ASYM, D_F32) + IO_TYPE(D_I4|Q_ASYM, D_F32) + IO_TYPE(D_I4|Q_ASYM, D_F32) + IO_TYPE(D_F32, D_U4|Q_ASYM) + IO_TYPE(D_F32, D_U4|Q_ASYM) + IO_TYPE(D_F32, D_I4|Q_ASYM) + IO_TYPE(D_F32, D_I4|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, F32) - - IO_TYPE(D_BF16, D_BF16, D_I8|Q_DFP) - IO_TYPE(D_BF16, D_BF16, D_U8|Q_ASYM) - IO_TYPE(D_BF16, D_BF16, D_I16|Q_DFP) - IO_TYPE(D_BF16, D_BF16, F16) - IO_TYPE(D_BF16, D_BF16, F32) - - IO_TYPE(D_F16, D_F16, D_BF16) - IO_TYPE(D_F16, D_F16, F32) - - IO_TYPE(D_F32, D_BF16, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_BF16, D_I8|Q_DFP) - IO_TYPE(D_F32, D_BF16, D_I16|Q_DFP) - IO_TYPE(D_F32, D_BF16, D_F16) - IO_TYPE(D_F32, D_BF16, D_BF16) - IO_TYPE(D_F32, D_BF16, F32) - - /* HW 9.1.1 */ - IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM, D_U4|Q_SYM) - IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM, D_I4|Q_ASYM) - IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM, D_I4|Q_SYM) - - END_IO_TYPE_DECL(ADD) - if (!VALIDATE_OP_IO_TYPES(ADD, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; + END_IO_TYPE_DECL(ADD) + if (!VALIDATE_OP_IO_TYPES(ADD, self, inputs, self->input.num, outputs, 0)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, 0); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } } return TRUE; @@ -603,72 +484,77 @@ static vsi_bool op_check_div vsi_nn_tensor_t ** outputs ) { - /* check inputs outputs data type */ - BEGIN_IO_TYPE_DECL(DIVIDE, 2, 1) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I16|Q_ASYM) - IO_TYPE(D_F16, D_F16, D_I16|Q_SYM) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I8|Q_ASYM) - IO_TYPE(D_F16, D_F16, D_I8|Q_SYM) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_I8|Q_SYM, D_F16) - IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_I8|Q_SYM, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_SYM, D_I16|Q_SYM) - IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I16|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_I16|Q_SYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) - 
IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) - IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I16|Q_SYM, D_F16, D_F16) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F16) - IO_TYPE(D_F32, D_F16, D_F32) - IO_TYPE(D_F32, D_F16, D_F16) - IO_TYPE(D_F16, D_F32, D_F32) - IO_TYPE(D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_F16, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) - IO_TYPE(D_I16, D_I32, D_I32) - IO_TYPE(D_I32, D_I16, D_I32) - END_IO_TYPE_DECL(DIVIDE) - if (!VALIDATE_OP_IO_TYPES(DIVIDE, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; + vsi_bool ret = vsi_nn_is_stream_process_supported_types(self->graph, inputs, self->input.num); + + if (!ret) + { + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(DIVIDE, 2, 1) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I8|Q_SYM, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I8|Q_SYM, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_SYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16, D_F16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16, D_I32, D_I32) + IO_TYPE(D_I32, D_I16, D_I32) + END_IO_TYPE_DECL(DIVIDE) + if (!VALIDATE_OP_IO_TYPES(DIVIDE, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = 
generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } } return TRUE; @@ -682,133 +568,11 @@ static vsi_bool op_check_mul ) { /* check inputs outputs data type */ - BEGIN_IO_TYPE_DECL(MULTIPLY, 2, 1) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_F32) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_SYM, D_I16|Q_SYM) - IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_F16, D_I8|Q_SYM, D_I8|Q_SYM) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I16|Q_SYM, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I8|Q_SYM, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) - IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16) - IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F16) - IO_TYPE(D_F32, D_F16, D_F32) - IO_TYPE(D_F32, D_F16, D_F16) - IO_TYPE(D_F16, D_F32, D_F32) - IO_TYPE(D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_F16, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) - IO_TYPE(D_I16, D_I32, D_I32) - IO_TYPE(D_I32, D_I16, D_I32) + vsi_bool ret = FALSE; - /* HW 9.0.1 */ - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, F32) + ret = vsi_nn_OpCheck(VSI_NN_OP_ADD, self, inputs, outputs); - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, F32) - - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_BF16) 
- IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, F32) - - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, F32) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, F32) - - IO_TYPE(D_BF16, D_BF16, D_I8|Q_DFP) - IO_TYPE(D_BF16, D_BF16, D_U8|Q_ASYM) - IO_TYPE(D_BF16, D_BF16, D_I16|Q_DFP) - IO_TYPE(D_BF16, D_BF16, F16) - IO_TYPE(D_BF16, D_BF16, F32) - - IO_TYPE(D_F16, D_F16, D_BF16) - IO_TYPE(D_F16, D_F16, F32) - - IO_TYPE(D_F32, D_BF16, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_BF16, D_I8|Q_DFP) - IO_TYPE(D_F32, D_BF16, D_I16|Q_DFP) - IO_TYPE(D_F32, D_BF16, D_F16) - IO_TYPE(D_F32, D_BF16, D_BF16) - IO_TYPE(D_F32, D_BF16, F32) - - /* HW 9.1.1 */ - IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM, D_U4|Q_SYM) - IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM, D_I4|Q_ASYM) - IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM, D_I4|Q_SYM) - - END_IO_TYPE_DECL(MULTIPLY) - if (!VALIDATE_OP_IO_TYPES(MULTIPLY, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } - - return TRUE; + return ret; } /* op_check() */ #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c index 7dc29af..68c6993 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c @@ -69,11 +69,15 @@ static vsi_status _eltwise_unary_op_compute alpha = self->nn_param.selu.alpha; beta = self->nn_param.selu.gamma; } - else + else if (strcmp(kernel_name, "hard_sigmoid") == 0) { alpha = self->nn_param.hard_sigmoid.alpha; beta = self->nn_param.hard_sigmoid.beta; } + else if (strcmp(kernel_name, "inverse_sigmoid") == 0) + { + alpha = self->nn_param.inverse_sigmoid.eps; + } vsi_nn_kernel_param_add_float32( param, "alpha", alpha ); vsi_nn_kernel_param_add_float32( param, "beta", beta ); @@ -218,6 +222,10 @@ static vsi_status _eltwise_unary_op_init self->nn_param.selu.alpha = 1.67326319217681884765625f; self->nn_param.selu.gamma = 1.05070102214813232421875f; } + else if (strcmp(kernel_name, "inverse_sigmoid") == 0) + { + self->nn_param.inverse_sigmoid.eps = (float)1e-5; + } return VSI_SUCCESS; } /* op_init() */ @@ -261,6 +269,10 @@ DEF_ELEMENT_WISE_UNARY_OP( CELU, celu ); DEF_ELEMENT_WISE_UNARY_OP( RCP, rcp ); DEF_ELEMENT_WISE_UNARY_OP( SIGN, sign ); DEF_ELEMENT_WISE_UNARY_OP( SOFTSIGN, softsign ); +DEF_ELEMENT_WISE_UNARY_OP( ATAN, atan ); +DEF_ELEMENT_WISE_UNARY_OP( ATANH, atanh ); +DEF_ELEMENT_WISE_UNARY_OP( ACOSH, acosh ); +DEF_ELEMENT_WISE_UNARY_OP( INVERSE_SIGMOID, inverse_sigmoid ); #undef DEF_ELEMENT_UNARY_WISE_OP diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c index 1a2a3aa..68c9fc2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c @@ -43,7 +43,7 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - return vsi_nn_internal_compute_node( self );; + return vsi_nn_internal_compute_node( self ); } 
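Among the unary ops wired up in the eltwise_unary change above, INVERSE_SIGMOID is the only newcomer with a tunable parameter: its eps (default 1e-5) is forwarded to the kernel through the existing "alpha" slot. Its reference behavior, assuming the usual logit-with-clamping definition (a sketch only; the driver kernel may clamp differently):

#include <math.h>

static float inverse_sigmoid_ref(float x, float eps)
{
    /* clamp into [eps, 1 - eps] so the log stays finite near 0 and 1 */
    float lo = eps;
    float hi = 1.0f - eps;
    float v  = x < lo ? lo : (x > hi ? hi : x);
    return logf(v / (1.0f - v));
}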
static vsi_bool op_check @@ -118,9 +118,12 @@ static vsi_bool op_setup memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); attr.dim_num = p->dim_num; - if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) { + if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && + (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || + inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16)) { attr.dtype.vx_type = VSI_NN_TYPE_INT32; - } else { + } + else { attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; } attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; @@ -140,7 +143,6 @@ static vsi_bool op_setup reshape_node = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_input_size = (vsi_size_t*)vsi_nn_internal_new_node_param(reshape_node, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); - for(i = 0; i < p->dim_num; i++) { reshape_input_size[i] = 1; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c index 8026198..0e8fcf0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c @@ -122,259 +122,95 @@ static vsi_bool op_check ret = vsi_nn_OpCheck(VSI_NN_OP_FCL_RELU, self, inputs, outputs); - if(!ret) { - /* check inputs outputs data type */ - BEGIN_IO_TYPE_DECL(FCL, 3, 1) - /* IO_TYPE(INPUT, WEIGHT, BIAS, OUTPUT) */ - IO_TYPE(D_F16, D_F16, D_NONE, D_F16) - IO_TYPE(D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_F32, D_U8|Q_ASYM) + if (!ret) + { + /* check input and weight data type */ + BEGIN_IO_TYPE_DECL(FCL, 2, 0) + /* IO_TYPE(INPUT, WEIGHT) */ + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + /* HW 9.0.1 */ + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_F16) + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I4|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F16) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM_PC) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_U8|Q_ASYM) + 
IO_TYPE(D_I16|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_U8|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_U8|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I16|Q_ASYM, D_U8|Q_SYM_PC) + IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I16|Q_SYM, D_U8|Q_SYM_PC) - /* HW 9.0.1 */ - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - 
IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) - - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_F16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_NONE, D_F32) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I16|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_F32) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_I16|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_F32) - - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_ASYM, 
D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_BF16) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_F32) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_BF16) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_F32) - - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_F32) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_F32) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_F32) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_F16, D_F16, D_NONE, D_BF16) - IO_TYPE(D_F16, D_F16, D_NONE, D_F32) - IO_TYPE(D_F16, D_F16, D_F32, D_BF16) - IO_TYPE(D_F16, D_F16, D_F32, D_F32) - - IO_TYPE(D_BF16, D_BF16, D_NONE, D_F16) - IO_TYPE(D_BF16, D_BF16, D_NONE, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_NONE, D_F32) - IO_TYPE(D_BF16, D_BF16, D_F32, D_F16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) - - IO_TYPE(D_F32, D_BF16, D_NONE, D_F16) - IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) 
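The Q_ASYM rows use the affine scheme. Again as a reference sketch (generic definition, not ovxlib code): q = round(x / scale) + zero_point, clamped to the storage range.

#include <math.h>
#include <stdint.h>

/* Affine/asymmetric (Q_ASYM) u8 quantize/dequantize. */
static uint8_t asym_quantize_u8(float x, float scale, int32_t zero_point)
{
    long q = lroundf(x / scale) + zero_point;
    if (q < 0)   q = 0;
    if (q > 255) q = 255;
    return (uint8_t)q;
}

static float asym_dequantize_u8(uint8_t q, float scale, int32_t zero_point)
{
    return ((int32_t)q - zero_point) * scale;
}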
- IO_TYPE(D_F32, D_BF16, D_NONE, D_F32) - IO_TYPE(D_F32, D_BF16, D_F32, D_F16) - IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) - IO_TYPE(D_F32, D_BF16, D_F32, D_F32) - - /* HW 9.1.1 */ - IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_ASYM, D_I4|Q_ASYM) - IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) - IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) - IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I4|Q_ASYM) - IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U4|Q_ASYM) - IO_TYPE(D_I4|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I4|Q_DFP) - - IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_U4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_I4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I4|Q_SYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM, D_I4|Q_SYM) - IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I4|Q_SYM) - IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U4|Q_ASYM) - IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_SYM) - IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I4|Q_SYM) - IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U4|Q_ASYM) + /* HW 9.2 */ + IO_TYPE(D_U16|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_U16|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U16|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_U16|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U16|Q_ASYM) + IO_TYPE(D_U16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_U16|Q_ASYM, D_I16|Q_SYM) + IO_TYPE(D_U16|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U16|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_U16|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_U16|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_U16|Q_ASYM, D_I4|Q_SYM) + IO_TYPE(D_U16|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U16|Q_ASYM, D_U4|Q_SYM) + IO_TYPE(D_U16|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_U16|Q_ASYM, D_U8|Q_SYM_PC) + IO_TYPE(D_U16|Q_ASYM, D_I16|Q_SYM_PC) END_IO_TYPE_DECL(FCL) - ret = VALIDATE_OP_IO_TYPES(FCL, self, inputs, self->input.num, outputs, self->output.num); - if(!ret) { + ret = VALIDATE_OP_IO_TYPES(FCL, self, inputs, 2, outputs, 0); + if (!ret) + { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c index f2b9142..1f3f281 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c @@ -30,6 +30,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" @@ -53,21 +54,22 @@ static vsi_status op_compute int32_t axis = self->nn_param.gather.axis; int32_t batch_dims = self->nn_param.gather.batch_dims; vsi_size_t *input_size = inputs[0]->attr.size; - 
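With the rework above, VALIDATE_OP_IO_TYPES(FCL, self, inputs, 2, outputs, 0) now checks only the (input, weight) pair instead of the full 4-tuple. Conceptually, the BEGIN_IO_TYPE_DECL / IO_TYPE / VALIDATE_OP_IO_TYPES trio reduces to a static table plus a linear search, roughly as below; toy enums stand in for the D_*|Q_* codes, and this is a sketch of the idea, not the real macro expansion.

#include <stdbool.h>
#include <stddef.h>

typedef enum { T_F16, T_F32, T_BF16, T_U8_ASYM, T_I8_DFP, T_I16_DFP } toy_dtype_e;

typedef struct { toy_dtype_e in; toy_dtype_e wt; } io_pair_t;

static const io_pair_t fcl_allowed[] = {
    { T_F16,     T_F16     },
    { T_F32,     T_F32     },
    { T_I16_DFP, T_I16_DFP },
    { T_I8_DFP,  T_I8_DFP  },
    { T_U8_ASYM, T_I8_DFP  },
    { T_U8_ASYM, T_U8_ASYM },
    { T_BF16,    T_BF16    },
};

static bool fcl_io_types_supported(toy_dtype_e in, toy_dtype_e wt)
{
    size_t i;
    for (i = 0; i < sizeof(fcl_allowed) / sizeof(fcl_allowed[0]); ++i)
    {
        if (fcl_allowed[i].in == in && fcl_allowed[i].wt == wt)
        {
            return true;
        }
    }
    return false;
}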
uint32_t dims_num = inputs[0]->attr.dim_num; + uint32_t r_rank = vsi_nn_GetTensorIsScalar(inputs[0]) ? 0 : inputs[0]->attr.dim_num; + uint32_t q_rank = vsi_nn_GetTensorIsScalar(inputs[1]) ? 0 : inputs[1]->attr.dim_num; - param =vsi_nn_kernel_param_create(); + param = vsi_nn_kernel_param_create(); - for(i = 0; i < (uint32_t)axis; ++i) + for (i = 0; i < (uint32_t)axis; ++i) { block_size *= input_size[i]; } axis_num = input_size[axis]; - for(i = axis + 1; i < dims_num - batch_dims; ++i) + for (i = axis + 1; i < r_rank - batch_dims; ++i) { block_num *= input_size[i]; } - for(i = 0; i < (uint32_t)inputs[1]->attr.dim_num - batch_dims; ++i) + for (i = 0; i < q_rank - batch_dims; ++i) { indices_num *= inputs[1]->attr.size[i]; } @@ -79,13 +81,13 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "indices_num", (int32_t)indices_num ); vsi_nn_kernel_param_add_int32( param, "batch_dims", (int32_t)batch_dims ); n = vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, outputs, 1, param ); - if( n != NULL ) + if ( n != NULL ) { self->n = (vx_node)n; status = VSI_SUCCESS; } - if(param != NULL) + if (param != NULL) { vsi_nn_kernel_param_release( ¶m ); } @@ -165,23 +167,38 @@ static vsi_bool op_setup if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { uint32_t j = 0; + uint32_t r_rank = vsi_nn_GetTensorIsScalar(inputs[0]) ? 0 : inputs[0]->attr.dim_num; + uint32_t q_rank = vsi_nn_GetTensorIsScalar(inputs[1]) ? 0 : inputs[1]->attr.dim_num; + uint32_t o_rank = r_rank + q_rank - 1; + p = &(self->nn_param.gather); - outputs[0]->attr.dim_num = inputs[0]->attr.dim_num + inputs[1]->attr.dim_num - 1; - for (i = 0; i < (uint32_t)p->axis; i++) + + outputs[0]->attr.dim_num = o_rank ? o_rank : 1; + + if (o_rank == 0) { - outputs[0]->attr.size[j] = inputs[0]->attr.size[i]; - j++; + outputs[0]->attr.size[0] = 1; + vsi_nn_SetTensorIsScalar(outputs[0], TRUE); } - for (i = 0; i < inputs[1]->attr.dim_num; i++) + else { - outputs[0]->attr.size[j] = inputs[1]->attr.size[i]; - j++; - } - for (i = (uint32_t)p->axis + 1; i < inputs[0]->attr.dim_num; i++) - { - outputs[0]->attr.size[j] = inputs[0]->attr.size[i]; - j++; + for (i = 0; i < (uint32_t)p->axis; i++) + { + outputs[0]->attr.size[j] = inputs[0]->attr.size[i]; + j++; + } + for (i = 0; i < inputs[1]->attr.dim_num; i++) + { + outputs[0]->attr.size[j] = inputs[1]->attr.size[i]; + j++; + } + for (i = (uint32_t)p->axis + 1; i < inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[j] = inputs[0]->attr.size[i]; + j++; + } } + } return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c index e77633f..4246ee6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c @@ -50,6 +50,7 @@ static vsi_status op_compute vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; vsi_size_t i = 0; + int32_t batch_dims = self->nn_param.gather_nd.batch_dims == 0 ? 
0 : 1; vsi_size_t block_size = 1, coord_dim = 1; vsi_size_t *input_size = inputs[0]->attr.size; vsi_size_t dims_num = inputs[0]->attr.dim_num; @@ -58,7 +59,8 @@ static vsi_status op_compute { coord_dim = inputs[1]->attr.size[0]; } - if (coord_dim > 4 || (coord_dim > 3 && input_size[dims_num - 1] != 1)) + if (coord_dim > 4 || (coord_dim > 3 && input_size[dims_num - 1] != 1) + || (batch_dims && coord_dim >= 3)) { CHECK_STATUS(status); return status; @@ -66,13 +68,14 @@ static vsi_status op_compute param = vsi_nn_kernel_param_create(); - for(i = 0; i < dims_num - coord_dim; ++i) + for(i = 0; i < dims_num - coord_dim - batch_dims; ++i) { block_size *= input_size[i]; } vsi_nn_kernel_param_add_int32( param, "block_size", (int32_t)block_size ); vsi_nn_kernel_param_add_int32( param, "coord_dim", (int32_t)coord_dim ); + vsi_nn_kernel_param_add_int32( param, "batch_dims", (int32_t)batch_dims ); n = vsi_nn_kernel_selector( self->graph, "gather_nd", inputs, 2, outputs, 1, param ); if ( n != NULL ) { @@ -133,6 +136,18 @@ static vsi_bool op_check return TRUE; } /* op_check() */ +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.gather_nd.batch_dims = 0; + + return status; +} /* op_init() */ + static vsi_bool op_setup ( vsi_nn_node_t * self, @@ -146,12 +161,14 @@ static vsi_bool op_setup if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { vsi_size_t j = 0, coord_dim = 1; + int32_t batch_dims = self->nn_param.gather_nd.batch_dims == 0 ? 0 : 1; + if (inputs[1]->attr.dim_num > 1) { coord_dim = inputs[1]->attr.size[0]; } - for (i = 0; i < (uint32_t)inputs[0]->attr.dim_num - coord_dim; i++) + for (i = 0; i < (uint32_t)inputs[0]->attr.dim_num - coord_dim - batch_dims; i++) { outputs[0]->attr.size[j++] = inputs[0]->attr.size[i]; } @@ -185,7 +202,7 @@ extern "C" { DEF_OP_REG ( /* op_name */ GATHER_ND, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_globallppool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_globallppool.c new file mode 100644 index 0000000..de9059e --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_globallppool.c @@ -0,0 +1,199 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
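Backing up to the two gather hunks above: the output-shape rule gather's op_setup now implements, including the new scalar (rank-0) bookkeeping via vsi_nn_GetTensorIsScalar / vsi_nn_SetTensorIsScalar, is compact enough to state directly. A standalone sketch, assuming axis < input rank and r + q >= 1:

#include <stdint.h>

/* With input rank r, indices rank q and gather axis a, the output rank is
 * r + q - 1 and the output dims are input[0..a) ++ indices[0..q) ++
 * input[a+1..r). Returns the stored rank of `out`. */
static uint32_t gather_out_shape(const uint32_t* in, uint32_t r,
                                 const uint32_t* idx, uint32_t q,
                                 uint32_t axis, uint32_t* out)
{
    uint32_t i, j = 0;
    uint32_t o_rank = r + q - 1;   /* assumes r + q >= 1 */

    if (o_rank == 0)
    {
        /* Scalar result: keep a rank-1 {1} shape and flag the tensor as a
         * scalar, mirroring the vsi_nn_SetTensorIsScalar() path above. */
        out[0] = 1;
        return 1;
    }
    for (i = 0; i < axis; ++i)     { out[j++] = in[i]; }
    for (i = 0; i < q; ++i)        { out[j++] = idx[i]; }
    for (i = axis + 1; i < r; ++i) { out[j++] = in[i]; }
    return o_rank;
}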
+* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _globallppool_local_data_t { + int32_t placeholder; +} globallppool_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t* param = NULL; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + uint32_t i = 0; + uint32_t new_rank = 0; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + int32_t p = (int32_t)self->nn_param.globallppool.p; + new_rank = 3; + + shapes[0][0] = inputs[0]->attr.size[0]; + shapes[0][1] = inputs[0]->attr.size[1]; + shapes[0][2] = inputs[0]->attr.size[2]; + shapes[1][0] = outputs[0]->attr.size[0]; + shapes[1][1] = outputs[0]->attr.size[1]; + shapes[1][2] = outputs[0]->attr.size[2]; + + for (i = 3; i < inputs[0]->attr.dim_num; i++) + { + shapes[0][2] = shapes[0][2] * inputs[0]->attr.size[i]; + shapes[1][2] = shapes[1][2] * outputs[0]->attr.size[i]; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + outputs[0], shapes[1], new_rank ); + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32(param, "p", p); + + self->n = (vx_node)vsi_nn_kernel_selector(self->graph,"globallppool", + &reshape_tensors[0],_INPUT_NUM,&reshape_tensors[1],_OUTPUT_NUM,param); + if ( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release(¶m); + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(GLOBALLPPOOL, 1, 1) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F32, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_SYM, D_F32) + IO_TYPE(D_I16|Q_DFP, D_F32) + IO_TYPE(D_I8|Q_DFP, D_F32) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_F32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + END_IO_TYPE_DECL(GLOBALLPPOOL) + + if (!VALIDATE_OP_IO_TYPES( + GLOBALLPPOOL, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_size_t i = 0; + + if ( 
VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = 1; + outputs[0]->attr.size[1] = 1; + for (i = 2; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + self->nn_param.globallppool.p = 2; + + return status; +} /* op_init() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GLOBALLPPOOL, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c new file mode 100644 index 0000000..d8c99aa --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c @@ -0,0 +1,165 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + + +/* + Declare number of input and output. 
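For the new GLOBALLPPOOL op above: the reduction it performs per channel is y = (sum |x|^p)^(1/p) over the spatial dims (dims 0 and 1 in this layout), and because every dim from index 2 upward is just an independent slice, op_compute can legally fold dims >= 3 into dim 2 before invoking the kernel. A plain-C reference of the math, not the GPU kernel:

#include <math.h>
#include <stddef.h>

/* x holds `channels` contiguous planes of w*h elements each; y receives one
 * pooled value per plane. p >= 1 assumed. */
static void global_lp_pool_ref(const float* x, size_t w, size_t h,
                               size_t channels, int p, float* y)
{
    size_t c, i;
    for (c = 0; c < channels; ++c)
    {
        double acc = 0.0;
        const float* plane = x + c * w * h;
        for (i = 0; i < w * h; ++i)
        {
            acc += pow(fabs((double)plane[i]), (double)p);
        }
        y[c] = (float)pow(acc, 1.0 / (double)p);
    }
}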
+ */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + + status = vsi_nn_internal_compute_node(self); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if (VSI_NN_INTERPOLATION_BILINEAR != self->nn_param.gridsample.mode) { + VSILOGE("Only support bilinear_grid_sample now!"); + return FALSE; + } + + if (!((VSI_NN_PAD_MODE_CONSTANT == + self->nn_param.gridsample.padding_mode) && + (0 == self->nn_param.gridsample.const_val))) { + VSILOGE("Only support padding const 0 now!"); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_internal_node_t* curr = NULL; + + if (NULL == self) { + return FALSE; + } + + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = inputs[1]->attr.size[1]; + outputs[0]->attr.size[1] = inputs[1]->attr.size[2]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + if (4 == inputs[0]->attr.dim_num) { + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + } + + if (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.gridsample.mode) { + vsi_nn_internal_init_node_wksp(self); + curr = vsi_nn_internal_new_node( + self, VSI_NN_OP_BILINEAR_GRID_SAMPLE, 2, 1); + curr->node->nn_param.bilinear_grid_sample.align_corners = + self->nn_param.gridsample.align_corners; + curr->node->nn_param.bilinear_grid_sample.padding_mode = + self->nn_param.gridsample.padding_mode; + curr->node->nn_param.bilinear_grid_sample.const_val = + self->nn_param.gridsample.const_val; + curr->inputs[0] = inputs[0]; + curr->inputs[1] = inputs[1]; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, curr); + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + /* TODO + //self->nn_param.grid_sample.local = \ + // (grid_sample_local_data_t*)malloc(sizeof(grid_sample_local_data_t)); + */ + + return VSI_SUCCESS; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_internal_deinit_node_wksp(self); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GRID_SAMPLE, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c index 3522896..5cfeddf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c @@ -37,6 +37,7 @@ #include "vsi_nn_internal_node.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) @@ -124,8 +125,35 @@ static vsi_bool op_setup p->local->input = _expand_tensor_dim( self->graph, inputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 ); - p->local->weight = _expand_tensor_dim( self->graph, inputs[1], + if (inputs[1]->attr.dtype.qnt_type != 
VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + { + p->local->weight = _expand_tensor_dim( self->graph, inputs[1], inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 ); + } + else + { + uint32_t i = 0; + uint8_t * data = NULL; + vsi_nn_tensor_attr_t attr; + + data = vsi_nn_ConvertTensorToData( self->graph, inputs[1] ); + CHECK_PTR_FAIL_GOTO( data, "Convert data fail.", final ); + + memcpy(&attr, &inputs[1]->attr, sizeof(vsi_nn_tensor_attr_t)); + + attr.size[0] = 1; + attr.size[1] = inputs[1]->attr.size[0]; + for (i = 2; i <= inputs[1]->attr.dim_num; i++) + { + attr.size[i] = inputs[1]->attr.size[i - 1]; + } + attr.dim_num = inputs[1]->attr.dim_num + 1; + attr.dtype.channel_dim = inputs[1]->attr.dtype.channel_dim + 1; + + p->local->weight = vsi_nn_CreateTensorFromData(self->graph, data, &attr); + vsi_nn_safe_free( data ); + } + p->local->output = _expand_tensor_dim( self->graph, outputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num, 0 ); @@ -140,8 +168,8 @@ static vsi_bool op_setup curr->node->nn_param.grouped_conv2d.dilation[0] = 1; curr->node->nn_param.grouped_conv2d.dilation[1] = p->dilation; curr->node->nn_param.grouped_conv2d.pad[0] = 0; - curr->node->nn_param.grouped_conv2d.pad[1] = p->pad[0]; - curr->node->nn_param.grouped_conv2d.pad[2] = 0; + curr->node->nn_param.grouped_conv2d.pad[1] = 0; + curr->node->nn_param.grouped_conv2d.pad[2] = p->pad[0]; curr->node->nn_param.grouped_conv2d.pad[3] = p->pad[1]; curr->node->nn_param.grouped_conv2d.stride[0] = 1; curr->node->nn_param.grouped_conv2d.stride[1] = p->stride; @@ -153,6 +181,7 @@ static vsi_bool op_setup vsi_nn_internal_setup_node(self, curr); +final: return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c index 5afb30b..5ac947b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c @@ -186,6 +186,7 @@ static vsi_bool op_setup_default vsi_size_t batch_size = 0; vsi_size_t time_step = 0; vsi_size_t i = 0; + vsi_bool ret = FALSE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_node_wksp( self ); @@ -306,7 +307,7 @@ static vsi_bool op_setup_default curr->outputs[GRUCELL_OUTPUT_OUTPUT] = grucell_out0; curr->outputs[GRUCELL_OUTPUT_H_STATE] = grucell_out1; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); last_step_h_state = grucell_out1; @@ -353,7 +354,7 @@ final: vsi_nn_safe_free( split_output_tensors ); vsi_nn_safe_free( grucell_reshape_output_tensors ); - return TRUE; + return ret; } /* op_setup_default() */ static vsi_bool op_setup_optimized diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c index 31df29c..020ab32 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c @@ -889,6 +889,10 @@ static vsi_bool op_setup_default inputs[GRUCELL_INPUT_BIAS_H2R + i], &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2R + i], use_virtual_tensor); + if (hstate_gate_fc_outputs[i] == NULL) + { + goto error; + } } } else @@ -1165,6 +1169,10 @@ static vsi_bool op_setup_default vsi_nn_internal_setup_node(self, curr); return TRUE; + +error: + return FALSE; + } /* op_setup() */ static vsi_bool op_setup diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c index be1f3f5..53c12ae 100644 --- 
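The grouped_conv1d change above is worth a note: lifting the 1D weight into 2D conv layout inserts a leading unit dim, so for per-channel (Q_SYM_PC) weights the quantization axis has to shift by one as well, which is exactly what the new branch does before rebuilding the tensor from raw data. A toy sketch of the attribute rewrite, assuming src->dim_num + 1 still fits the dim array:

#include <stdint.h>
#include <string.h>

#define MAX_DIMS 8

typedef struct {
    uint32_t size[MAX_DIMS];
    uint32_t dim_num;
    uint32_t channel_dim;   /* axis carrying the per-channel scales */
} toy_weight_attr_t;

static void expand_weight_dim0(const toy_weight_attr_t* src,
                               toy_weight_attr_t* dst)
{
    uint32_t i;
    memset(dst, 0, sizeof(*dst));
    dst->size[0] = 1;                      /* new leading unit dim */
    for (i = 0; i < src->dim_num; ++i)
    {
        dst->size[i + 1] = src->size[i];   /* shift the old dims up by one */
    }
    dst->dim_num = src->dim_num + 1;
    dst->channel_dim = src->channel_dim + 1; /* scales follow their axis */
}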
a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c @@ -34,6 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" +#include "vsi_nn_tensor_util_prv.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" @@ -173,51 +174,56 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(INSTANCE_NORM, 3, 1) - IO_TYPE(D_F16, D_F32, D_F16, D_F16) - IO_TYPE(D_F16, D_F32, D_F32, D_F16) - IO_TYPE(D_F16, D_F16, D_F16, D_F16) - IO_TYPE(D_F32, D_F32, D_F16, D_F32) - IO_TYPE(D_F32, D_F16, D_F16, D_F32) - IO_TYPE(D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_I32, D_F32, D_F16, D_I32) - IO_TYPE(D_I32, D_F32, D_F16, D_F32) - IO_TYPE(D_BF16, D_F32, D_F32, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F16) - IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_I8|Q_SYM) - IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_SYM) - IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F16) - IO_TYPE(D_F16, D_F32, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F16) - IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F16) - IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F16) - IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_SYM) - IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_I16|Q_ASYM) - IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_I16|Q_SYM) - END_IO_TYPE_DECL(INSTANCE_NORM) - if (!VALIDATE_OP_IO_TYPES(INSTANCE_NORM, self, inputs, self->input.num, outputs, self->output.num)) + vsi_bool ret = vsi_nn_is_stream_process_supported_types(self->graph, inputs, self->input.num); + + if (!ret) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; + BEGIN_IO_TYPE_DECL(INSTANCE_NORM, 3, 1) + IO_TYPE(D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F16) + IO_TYPE(D_F32, D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F16, D_F16, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_F32, D_F16, D_I32) + IO_TYPE(D_I32, D_F32, D_F16, D_F32) + IO_TYPE(D_BF16, D_F32, D_F32, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, 
D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_I16|Q_SYM) + END_IO_TYPE_DECL(INSTANCE_NORM) + if (!VALIDATE_OP_IO_TYPES(INSTANCE_NORM, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } } return TRUE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c index 2df9bc2..cff1507 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c @@ -33,6 +33,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" #include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_tensor_util_prv.h" static vsi_status op_compute ( @@ -51,7 +52,7 @@ static vsi_status op_compute self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "l2_norm", inputs, 1, - outputs, 1, param );; + outputs, 1, param ); if( NULL != self->n ) { @@ -85,30 +86,35 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(L2_NORMALIZE, 1, 1) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_I32, D_I32) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - END_IO_TYPE_DECL(L2_NORMALIZE) - if (!VALIDATE_OP_IO_TYPES(L2_NORMALIZE, self, inputs, self->input.num, outputs, self->output.num)) + vsi_bool ret = vsi_nn_is_stream_process_supported_types(self->graph, inputs, self->input.num); + + if (!ret) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; + BEGIN_IO_TYPE_DECL(L2_NORMALIZE, 1, 1) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + END_IO_TYPE_DECL(L2_NORMALIZE) + if (!VALIDATE_OP_IO_TYPES(L2_NORMALIZE, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } } return TRUE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c 
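Whichever path op_check picks above (the new stream-processor fast path or the IO_TYPE table), the math INSTANCE_NORM implements is unchanged: each channel of each instance is normalized over its spatial extent as y = gamma * (x - mean) / sqrt(var + eps) + beta. A plain-C reference over one channel's spatial slice, sketch only:

#include <math.h>
#include <stddef.h>

static void instance_norm_ref(const float* x, size_t spatial, float gamma,
                              float beta, float eps, float* y)
{
    size_t i;
    double mean = 0.0, var = 0.0;
    for (i = 0; i < spatial; ++i) mean += x[i];
    mean /= (double)spatial;
    for (i = 0; i < spatial; ++i)
    {
        double d = x[i] - mean;
        var += d * d;
    }
    var /= (double)spatial;   /* biased variance, as is usual for norms */
    for (i = 0; i < spatial; ++i)
    {
        y[i] = (float)(gamma * ((x[i] - mean) / sqrt(var + eps)) + beta);
    }
}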
b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c index f8330b7..75354a7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -34,6 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" +#include "vsi_nn_tensor_util_prv.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" @@ -154,61 +155,66 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(LAYER_NORM, 3, 1) - IO_TYPE(D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F16, D_F32, D_F16, D_F16) - IO_TYPE(D_F16, D_F32, D_F32, D_F16) - IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F32, D_F16, D_I8|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F16, D_I8|Q_SYM) - IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_SYM) - IO_TYPE(D_F16, D_F32, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F32, D_F16, D_I16|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F16, D_I16|Q_SYM) - IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_SYM) - IO_TYPE(D_BF16, D_F32, D_F32, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_I16|Q_ASYM) - IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_I16|Q_SYM) - IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F16) - IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F16) - IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F16) - IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_I16|Q_ASYM) - IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_I16|Q_SYM) - IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F16) - IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F16) - IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F16) - IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F16) - END_IO_TYPE_DECL(LAYER_NORM) - if (!VALIDATE_OP_IO_TYPES(LAYER_NORM, self, inputs, self->input.num, outputs, self->output.num)) + vsi_bool ret = vsi_nn_is_stream_process_supported_types(self->graph, inputs, self->input.num); + + if (!ret) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; + BEGIN_IO_TYPE_DECL(LAYER_NORM, 3, 1) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F32, 
D_I8|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_SYM) + IO_TYPE(D_BF16, D_F32, D_F32, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F16) + END_IO_TYPE_DECL(LAYER_NORM) + if (!VALIDATE_OP_IO_TYPES(LAYER_NORM, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } } return TRUE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c index a607f70..e44440e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c @@ -34,7 +34,6 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_math.h" #include "kernel/vsi_nn_kernel.h" -#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" #include "utils/vsi_nn_constraint_check.h" #define _INPUT_NUM (1) @@ -48,37 +47,19 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; - vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; - vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; - vsi_size_t new_rank = 0; - vsi_bool ret; - if( NULL == self ) + if ( NULL == self ) { return status; } // TODO: This optimzie is a hack for gpu path, // it should be moved to gpu kernel setup. 
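For context on the code being deleted below: vsi_nn_kernel_optimize_element_shape collapses the shape of a purely elementwise op into as few dims as possible (ideally one), since a pointwise kernel only cares about the element count, and per the comment above that collapse now belongs in kernel setup rather than in each op_compute. The essence of the transform, as a sketch:

#include <stdint.h>

/* A pointwise op over a contiguous tensor can always be viewed as rank 1:
 * fold every dim into a single element count. Returns the new rank. */
static uint32_t collapse_to_1d(const uint32_t* shape, uint32_t rank,
                               uint32_t* out_shape)
{
    uint32_t i;
    uint32_t elements = 1;
    for (i = 0; i < rank; ++i)
    {
        elements *= shape[i];
    }
    out_shape[0] = elements;
    return 1;
}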
- ret = vsi_nn_kernel_optimize_element_shape( - inputs[0]->attr.size, inputs[0]->attr.dim_num, - shape, &new_rank ); - if( ret ) - { - reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], shape, new_rank ); - reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - outputs[0], shape, new_rank ); - - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, - "logical_not", - &reshape_tensors[0], _INPUT_NUM, - &reshape_tensors[1], _OUTPUT_NUM, NULL ); - - vsi_nn_ReleaseTensor( &reshape_tensors[0] ); - vsi_nn_ReleaseTensor( &reshape_tensors[1] ); - } - if( self->n ) + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "logical_not", + inputs, _INPUT_NUM, + outputs, _OUTPUT_NUM, NULL ); + if ( self->n ) { status = VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c index 7b2d441..01695c4 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c @@ -34,7 +34,6 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "kernel/vsi_nn_kernel_eltwise.h" #include "utils/vsi_nn_constraint_check.h" #define _INPUT_NUM (2) @@ -49,53 +48,23 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; - vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; - vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; - vsi_size_t new_rank = 0; - vsi_bool ret; - if( NULL == self ) + if ( NULL == self ) { return VSI_FAILURE; } - ret = vsi_nn_kernel_optimize_eltwise_shape( - inputs[0]->attr.size, inputs[0]->attr.dim_num, - inputs[1]->attr.size, inputs[1]->attr.dim_num, - outputs[0]->attr.size, outputs[0]->attr.dim_num, - shapes[0], shapes[1], shapes[2], &new_rank ); + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "ops_type", self->nn_param.logical_ops.op ); - if( ret ) - { - param =vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_int32( param, "ops_type", self->nn_param.logical_ops.op ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "logical_ops", + inputs, _INPUT_NUM, + outputs, _OUTPUT_NUM, param ); - reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], shapes[0], new_rank ); - reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - inputs[1], shapes[1], new_rank ); - reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, - outputs[0], shapes[2], new_rank ); + vsi_nn_kernel_param_release( ¶m ); - if (shapes[1][3] > shapes[0][3] && new_rank == 4) - { - vsi_nn_tensor_t* reshape_tmp; - reshape_tmp = reshape_tensors[0]; - reshape_tensors[0] = reshape_tensors[1]; - reshape_tensors[1] = reshape_tmp; - } - - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "logical_ops", - &reshape_tensors[0], _INPUT_NUM, - &reshape_tensors[2], _OUTPUT_NUM, param ); - vsi_nn_ReleaseTensor( &reshape_tensors[0] ); - vsi_nn_ReleaseTensor( &reshape_tensors[1] ); - vsi_nn_ReleaseTensor( &reshape_tensors[2] ); - - vsi_nn_kernel_param_release( ¶m ); - } - - if( self->n ) + if ( self->n ) { status = VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lpnorm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lpnorm.c new file mode 100644 index 0000000..7a3eb91 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lpnorm.c @@ -0,0 +1,205 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of 
charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _lpnorm_local_data_t { + int32_t placeholder; +} lpnorm_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + uint32_t new_rank = 0; + int32_t new_axis = 0; + vsi_size_t shapes[1][VSI_NN_MAX_DIM_NUM] = {{0}}; + int32_t p = (int32_t)self->nn_param.lpnorm.p; + int32_t axis = (int32_t)self->nn_param.lpnorm.axis; + int32_t dim = (int32_t)inputs[0]->attr.dim_num; + + if (axis == -1) axis = dim - 1; + vsi_nn_kernel_optimize_softmax_shape(inputs[0]->attr.size, + inputs[0]->attr.dim_num, + axis, + shapes[0], + &new_rank, + &new_axis); + + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + outputs[0], shapes[0], new_rank ); + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32(param, "p", p); + + if (p == 1) + { + vsi_nn_kernel_param_add_int32(param, "axis", new_axis); + self->n = (vx_node)vsi_nn_kernel_selector(self->graph,"l1norm", + &reshape_tensors[0], _INPUT_NUM, &reshape_tensors[1], _OUTPUT_NUM, param); + } + else + { + vsi_nn_kernel_param_add_int32(param, "axis", axis); + self->n = (vx_node)vsi_nn_kernel_selector(self->graph,"l2_norm", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param); + } + + if ( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release(¶m); + vsi_safe_release_tensor( reshape_tensors[0] ); + vsi_safe_release_tensor( reshape_tensors[1] ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(LPNORM, 1, 1) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) 
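For the LPNORM op being added here: along the chosen axis the op computes y = x / (sum |x|^p)^(1/p), and op_compute above dispatches p == 1 to the "l1norm" kernel and everything else to "l2_norm", so in practice p is 1 or 2. A contiguous-axis reference sketch (note there is no epsilon guard here, so an all-zero slice would divide by zero):

#include <math.h>
#include <stddef.h>

static void lp_normalize_ref(const float* x, size_t n, int p, float* y)
{
    size_t i;
    double acc = 0.0;
    for (i = 0; i < n; ++i)
    {
        acc += pow(fabs((double)x[i]), (double)p);
    }
    acc = pow(acc, 1.0 / (double)p);   /* the Lp norm of the slice */
    for (i = 0; i < n; ++i)
    {
        y[i] = (float)((double)x[i] / acc);
    }
}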
+ IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F32, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_SYM, D_F32) + IO_TYPE(D_I16|Q_DFP, D_F32) + IO_TYPE(D_I8|Q_DFP, D_F32) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_F32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + END_IO_TYPE_DECL(LPNORM) + + if (!VALIDATE_OP_IO_TYPES( + LPNORM, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_size_t i = 0; + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + for (i = 0; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + self->nn_param.lpnorm.p = 2; + self->nn_param.lpnorm.axis = -1; + return VSI_SUCCESS; +} /* op_init() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ LPNORM, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lppool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lppool.c index 1758ac1..baca369 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lppool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lppool.c @@ -215,7 +215,7 @@ static vsi_bool op_setup ( inputs[0]->attr.size[1], self->nn_param.lppool.ksize[1], - &self->nn_param.lppool.pad[1], + &self->nn_param.lppool.pad[2], self->nn_param.lppool.stride[1], 0, VSI_NN_ROUND_CEIL diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c index bcdc2d9..27b5457 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c @@ -258,6 +258,8 @@ static vsi_status op_deinit self->nn_param.lstmunit_activation.local.lstmunit_param = NULL; } + status = vsi_nn_op_common_deinit(self); + return status; } /* op_deinit() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c index 5da258f..8463390 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c @@ -150,6 +150,7 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) @@ -175,6 +176,7 @@ static vsi_bool 
op_check IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_I32, D_I32, D_I32) END_IO_TYPE_DECL(MATRIXMUL) if (!VALIDATE_OP_IO_TYPES(MATRIXMUL, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_maxunpool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_maxunpool.c new file mode 100644 index 0000000..b1e0676 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_maxunpool.c @@ -0,0 +1,248 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _maxunpool_local_data_t { + int32_t placeholder; +} maxunpool_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t* param = NULL; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + uint32_t i = 0; + uint32_t new_rank = 0; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + int32_t ksize_x = (int32_t)self->nn_param.maxunpool.ksize[0]; + int32_t ksize_y = (int32_t)self->nn_param.maxunpool.ksize[1]; + int32_t pad_left = (int32_t)self->nn_param.maxunpool.pad[0]; + int32_t pad_right = (int32_t)self->nn_param.maxunpool.pad[1]; + int32_t pad_top = (int32_t)self->nn_param.maxunpool.pad[2]; + int32_t pad_bottom = (int32_t)self->nn_param.maxunpool.pad[3]; + int32_t stride_x = (int32_t)self->nn_param.maxunpool.stride[0]; + int32_t stride_y = (int32_t)self->nn_param.maxunpool.stride[1]; + new_rank = 3; + + shapes[0][0] = inputs[0]->attr.size[0]; + shapes[0][1] = inputs[0]->attr.size[1]; + shapes[0][2] = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; + shapes[1][0] = outputs[0]->attr.size[0]; + shapes[1][1] = outputs[0]->attr.size[1]; + shapes[1][2] = outputs[0]->attr.dim_num > 2 ? 
outputs[0]->attr.size[2] : 1; + + for (i = 3; i < inputs[0]->attr.dim_num; i++) + { + shapes[0][2] = shapes[0][2] * inputs[0]->attr.size[i]; + shapes[1][2] = shapes[1][2] * outputs[0]->attr.size[i]; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + inputs[1], shapes[0], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + outputs[0], shapes[1], new_rank ); + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32(param, "ksize_x", ksize_x); + vsi_nn_kernel_param_add_int32(param, "ksize_y", ksize_y); + vsi_nn_kernel_param_add_int32(param, "pad_left", pad_left); + vsi_nn_kernel_param_add_int32(param, "pad_right", pad_right); + vsi_nn_kernel_param_add_int32(param, "pad_top", pad_top); + vsi_nn_kernel_param_add_int32(param, "pad_bottom", pad_bottom); + vsi_nn_kernel_param_add_int32(param, "stride_x", stride_x); + vsi_nn_kernel_param_add_int32(param, "stride_y", stride_y); + + self->n = (vx_node)vsi_nn_kernel_selector(self->graph,"maxunpool", + &reshape_tensors[0],_INPUT_NUM,&reshape_tensors[2],_OUTPUT_NUM,param); + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release(¶m); + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(MAXUNPOOL, 2, 1) + IO_TYPE(D_F32, D_I32, D_F32) + IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_BF16, D_I32, D_BF16) + IO_TYPE(D_I16|Q_SYM, D_I32, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I32, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) + IO_TYPE(D_F32, D_I32, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I32, D_I16|Q_SYM) + IO_TYPE(D_F32, D_I32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_I32, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_F32) + IO_TYPE(D_I8|Q_SYM, D_I32, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I32, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I32, D_F32) + IO_TYPE(D_F32, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_I32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I32, D_F16) + IO_TYPE(D_F32, D_I32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_I32, D_I8|Q_SYM) + IO_TYPE(D_F32, D_I32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I32, D_I8|Q_DFP) + END_IO_TYPE_DECL(MAXUNPOOL) + + if (!VALIDATE_OP_IO_TYPES( + MAXUNPOOL, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_size_t i = 0; + vsi_size_t stride[_cnt_of_array(self->nn_param.maxunpool.stride)] = {0}; + vsi_size_t ksize[_cnt_of_array(self->nn_param.maxunpool.ksize)] = {0}; + vsi_size_t pad[_cnt_of_array(self->nn_param.maxunpool.pad)] = {0}; + vsi_size_t unpool_size[2] = {0}; + + for (i = 0; i < _cnt_of_array(self->nn_param.maxunpool.stride); i++) + { + stride[i] = self->nn_param.maxunpool.stride[i]; + } + for (i = 0; i < _cnt_of_array(self->nn_param.maxunpool.ksize); i++) + { + ksize[i] = self->nn_param.maxunpool.ksize[i]; + } + + unpool_size[0] = 
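/*
 * Sanity check of the inverse-pooling extent computed here, assuming the
 * usual forward max-pool relation out = (in + pads - ksize) / stride + 1:
 * solving for the pre-padding input extent gives
 *     unpool = in * stride + ksize - stride
 * e.g. in = 4, stride = 2, ksize = 3 -> unpool = 4 * 2 + 3 - 2 = 9, and a
 * 9-wide map pooled with ksize 3 / stride 2 indeed comes back as 4.
 */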
inputs[0]->attr.size[0] * stride[0] + ksize[0] - stride[0]; + unpool_size[1] = inputs[0]->attr.size[1] * stride[1] + ksize[1] - stride[1]; + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + + if ( self->nn_param.maxunpool.output_size == NULL ) + { + for (i = 0; i < _cnt_of_array(self->nn_param.maxunpool.pad); i++) + { + pad[i] = self->nn_param.maxunpool.pad[i]; + } + + outputs[0]->attr.size[0] = unpool_size[0] + pad[0] + pad[1]; + outputs[0]->attr.size[1] = unpool_size[1] + pad[2] + pad[3]; + } + else + { + vsi_size_t total_pads = self->nn_param.maxunpool.output_size[0] - unpool_size[0]; + pad[0] = total_pads / 2; + pad[1] = total_pads - pad[0]; + + total_pads = self->nn_param.maxunpool.output_size[1] - unpool_size[1]; + pad[2] = total_pads / 2; + pad[3] = total_pads - pad[2]; + + outputs[0]->attr.size[0] = self->nn_param.maxunpool.output_size[0]; + outputs[0]->attr.size[1] = self->nn_param.maxunpool.output_size[1]; + } + + for (i = 2; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + + for (i = 0; i < _cnt_of_array(self->nn_param.maxunpool.ksize); i++) + { + self->nn_param.maxunpool.ksize[i] = (uint32_t)ksize[i]; + } + for (i = 0; i < _cnt_of_array(self->nn_param.maxunpool.pad); i++) + { + self->nn_param.maxunpool.pad[i] = (uint32_t)pad[i]; + } + return TRUE; +} /* op_setup() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ MAXUNPOOL, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c b/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c index eb15ccc..8276c0f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c @@ -146,7 +146,7 @@ static vsi_status op_compute ); if(NULL == node) { - vxReleaseKernel(&kernel); + vxRemoveKernel(kernel); return status; } @@ -191,7 +191,7 @@ static vsi_status op_deinit kernel = self->nn_param.nbg.local.kernel; if(kernel) { - vxReleaseKernel(&kernel); + vxRemoveKernel(kernel); kernel = self->nn_param.nbg.local.kernel = NULL; } vsi_nn_op_common_deinit(self); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_noop.c b/src/tim/vx/internal/src/ops/vsi_nn_op_noop.c index cbd71ed..71a5e07 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_noop.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_noop.c @@ -24,6 +24,7 @@ #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_platform.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" @@ -95,13 +96,13 @@ static vsi_bool op_setup { if( NULL == inputs[0]->t ) { - memcpy( inputs[0], outputs[i], sizeof( vsi_nn_tensor_t ) ); + memcpy( inputs[0], outputs[i], sizeof( vsi_nn_tensor_prv_t ) ); } else { VSILOGE( "Invalid NOOP tensors." 
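/*
 * Note on the sizeof change in the memcpy calls here: vsi_nn_types_prv.h is
 * now included because tensors are allocated as a private type that extends
 * the public one. A minimal sketch of the assumed layout (field names
 * hypothetical):
 *
 *     typedef struct _vsi_nn_tensor_prv
 *     {
 *         vsi_nn_tensor_t pub;   // public view, must be the first member
 *         // ... driver-internal bookkeeping fields ...
 *     } vsi_nn_tensor_prv_t;
 *
 * Copying only sizeof(vsi_nn_tensor_t) would truncate the internal fields,
 * so the full private object is copied instead.
 */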
); vxReleaseTensor( &outputs[i]->t ); - memcpy( outputs[i], inputs[0], sizeof( vsi_nn_tensor_t ) ); + memcpy( outputs[i], inputs[0], sizeof( vsi_nn_tensor_prv_t ) ); } } } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c index c7f47af..aa5b46c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -92,6 +92,7 @@ static vsi_bool op_setup if ( p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444 || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV21 || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_GRAY || @@ -108,6 +109,11 @@ static vsi_bool op_setup for (i = 0; i < p->dim_num; i++) { + if (p->perm == NULL) + { + i = p->dim_num; + break; + } _axis = p->perm[i]; if (_axis != i) break; @@ -438,6 +444,7 @@ static vsi_bool op_setup vsi_nn_internal_setup_node(self, curr); } break; + case VSI_NN_SOURCE_FORMAT_IMAGE_NV21: case VSI_NN_SOURCE_FORMAT_IMAGE_NV12: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_NV12, 0, 0 ); @@ -455,6 +462,15 @@ static vsi_bool op_setup curr->node->nn_param.pre_process_nv12.b_mean = p->norm.mean[2]; } + if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12) + { + curr->node->nn_param.pre_process_nv12.nv_type = VSI_NN_YUV_TYPE_NV12; + } + else + { + curr->node->nn_param.pre_process_nv12.nv_type = VSI_NN_YUV_TYPE_NV21; + } + curr->node->nn_param.pre_process_nv12.rgb_scale = p->norm.scale; curr->node->nn_param.pre_process_nv12.reverse_channel = p->reverse_channel; curr->node->nn_param.pre_process_nv12.rect.left = p->rect.left; @@ -544,6 +560,7 @@ static vsi_bool op_setup p->type == VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422 || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444 || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV21 || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_GRAY || diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c index a60f446..09eb682 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c @@ -60,6 +60,7 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_nv12.reverse_channel ); vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_nv12.local->enable_perm ); vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_nv12.local->enable_copy ); + vsi_nn_kernel_param_add_int32( param, "nv_type", self->nn_param.pre_process_nv12.nv_type ); n = vsi_nn_kernel_selector( self->graph, "pre_process_nv12", inputs, 2, outputs, 1, param ); if( n != NULL ) { @@ -198,6 +199,8 @@ static vsi_status op_init self->nn_param.pre_process_nv12.local = (vsi_nn_pre_process_nv12_lcl_data *)malloc(sizeof(vsi_nn_pre_process_nv12_lcl_data)); + self->nn_param.pre_process_nv12.nv_type = VSI_NN_YUV_TYPE_NV12; + if (NULL == self->nn_param.pre_process_nv12.local) { return VX_ERROR_NO_MEMORY; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index b5489bf..a7a5494 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -33,8 +33,8 @@ #include "utils/vsi_nn_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" +#include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" #define _ARG_NUM (6) @@ -153,6 +153,7 @@ static vsi_bool _check_is_sp_supported_type ( vsi_nn_node_t * self, vsi_nn_tensor_t * input, + vsi_nn_tensor_t * output, vsi_enum type ) { @@ -168,14 +169,16 @@ static vsi_bool _check_is_sp_supported_type vsi_bool ret = FALSE; if ( !self->graph->ctx->config.support_stream_processor || - (type != VSI_NN_REDUCE_SUM && type != VSI_NN_REDUCE_MEAN) ) + (type != VSI_NN_REDUCE_SUM && type != VSI_NN_REDUCE_MEAN && type != VSI_NN_REDUCE_MAX) ) { return FALSE; } - if ( (VSI_NN_TYPE_FLOAT64 == input->attr.dtype.vx_type) - || (VSI_NN_TYPE_UINT32 == input->attr.dtype.vx_type) - || (VSI_NN_TYPE_UINT64 == input->attr.dtype.vx_type) + if ( (VSI_NN_TYPE_FLOAT64 == input->attr.dtype.vx_type) || + (VSI_NN_TYPE_UINT32 == input->attr.dtype.vx_type) || + (VSI_NN_TYPE_UINT64 == input->attr.dtype.vx_type) || + (vsi_nn_TypeGetBits(input->attr.dtype.vx_type) == 4) || + (vsi_nn_TypeGetBits(output->attr.dtype.vx_type) == 4) ) { return FALSE; @@ -770,13 +773,25 @@ static vsi_bool op_set_reduce_axis( } *out_rank_x = inputs[0]->attr.dim_num; } - else + else if (!self->graph->ctx->config.support_stream_processor || + resolved_dim_count > 2) { optimzation_input_size( inputs[0]->attr.size, inputs[0]->attr.dim_num, out_shape_x, out_rank_x, (vsi_size_t*)resolved_dim, resolved_dim_count, (vsi_size_t*)resolved_dim2, &resolved_dim_count2 ); } + else + { + resolved_dim2[0] = resolved_dim[0]; + resolved_dim2[1] = resolved_dim[1]; + resolved_dim_count2 = resolved_dim_count; + for (i = 0; i < inputs[0]->attr.dim_num; i++) + { + out_shape_x[i] = (int32_t)(inputs[0]->attr.size[i]); + } + *out_rank_x = inputs[0]->attr.dim_num; + } for (i = 0; i < (uint32_t)resolved_dim_count2; i++) { @@ -849,6 +864,7 @@ static vsi_bool op_set_sp_reduce_internal new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes, outputs[0]->attr.dim_num); tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_REDUCE_MEAN_INTERNAL, 0, 0 ); + new_axis = (int32_t *)vsi_nn_internal_new_node_param(tmp_inode, axes_num * sizeof(int32_t)); for (i = 0; i < axes_num; i++) @@ -859,13 +875,15 @@ static vsi_bool op_set_sp_reduce_internal tmp_inode->outputs[0] = new_output; tmp_inode->node->nn_param.reduce_mean_internal.axis = new_axis; tmp_inode->node->nn_param.reduce_mean_internal.axis_num = axes_num; + tmp_inode->node->nn_param.reduce_mean_internal.type = type_name; if (type_name == VSI_NN_REDUCE_SUM) { tmp_inode->node->nn_param.reduce_mean_internal.scale = 1.0f; } else { - tmp_inode->node->nn_param.reduce_mean_internal.scale = 1.0f / (float)reduce_size; + tmp_inode->node->nn_param.reduce_mean_internal.scale = + 1.0f / (float)reduce_size; } vsi_nn_internal_setup_node(self, tmp_inode); @@ -1199,7 +1217,7 @@ static vsi_bool op_setup outputs[0], shape, new_rank ); self->nn_param.reduce.local2->reshaped_output1 = reshape_out_t[0]; - if (_check_is_sp_supported_type(self, reshape_in_t[0], self->nn_param.reduce.type)) + if (_check_is_sp_supported_type(self, reshape_in_t[0], reshape_out_t[0], self->nn_param.reduce.type)) { self->nn_param.reduce.local2->use_internal_node = TRUE; ret = op_set_sp_reduce_internal(self, reshape_in_t, reshape_out_t, self->nn_param.reduce.type); diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_internal.c index 081f287..85047b0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_internal.c @@ -66,8 +66,6 @@ static vsi_status _reduce_internal_op_compute } status = VSI_FAILURE; - - param =vsi_nn_kernel_param_create(); if (strcmp(kernel_name, "reducemax_internal") == 0) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c index 04c4271..4f50228 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c @@ -54,48 +54,28 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; - int32_t * axis = self->nn_param.reduce_mean_internal.axis; int32_t axis_num = self->nn_param.reduce_mean_internal.axis_num; float scale = self->nn_param.reduce_mean_internal.scale; - vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { {0} }; - int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; - vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; - uint32_t axis_size = 0; - uint32_t rank_in = 0; - uint32_t rank_out = 0; - vsi_bool ret = FALSE; + vsi_enum type = self->nn_param.reduce_mean_internal.type; vsi_nn_kernel_param_t * param = NULL; - ret = vsi_nn_kernel_optimize_reduce_shape( - inputs[0]->attr.size, inputs[0]->attr.dim_num, - axis, axis_num, - outputs[0]->attr.size, outputs[0]->attr.dim_num, - shapes[0], &rank_in, shapes[1], &rank_out, - new_axis, &axis_size); - param = vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_int32( param, "axis_num", axis_size ); + vsi_nn_kernel_param_add_int32( param, "axis_num", axis_num ); vsi_nn_kernel_param_add_float32( param, "scale", scale ); - if (ret) + if (type == VSI_NN_REDUCE_MAX) + { + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "reduce_max", + inputs, 1, + outputs, 1, param ); + } + else { - uint32_t i = 0; - reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], shapes[0], rank_in ); - for (i = 0; i < axis_size; i++) - { - shapes[0][i] = 1; - } - reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - outputs[0], shapes[0], rank_in ); - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "reduce_mean", - &reshape_tensors[0], 1, - &reshape_tensors[1], 1, param ); - - vsi_nn_ReleaseTensor( &reshape_tensors[0] ); - vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + inputs, 1, + outputs, 1, param ); } if ( self->n ) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c index fffe060..0629226 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c @@ -35,7 +35,6 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_math.h" #include "kernel/vsi_nn_kernel.h" -#include "kernel/vsi_nn_kernel_eltwise.h" #include "utils/vsi_nn_constraint_check.h" static vsi_status _comparisons_op_compute @@ -47,14 +46,10 @@ static vsi_status _comparisons_op_compute ) { vsi_status status; - vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; - vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; - vsi_size_t new_rank = 0; - vsi_bool ret; vsi_nn_kernel_param_t * param = NULL; vsi_nn_relational_ops_type_t op_type; - if( NULL == self ) + if ( NULL == self ) { return VSI_FAILURE; } @@ -62,61 +57,16 @@ static vsi_status _comparisons_op_compute op_type = self->nn_param.relational_ops.op; - 
// TODO: This optimzie is a hack for gpu path, - // it should be moved to gpu kernel setup. - ret = vsi_nn_kernel_optimize_eltwise_shape( - inputs[0]->attr.size, inputs[0]->attr.dim_num, - inputs[1]->attr.size, inputs[1]->attr.dim_num, - outputs[0]->attr.size, outputs[0]->attr.dim_num, - shapes[0], shapes[1], shapes[2], &new_rank ); - if ( ret ) - { - // Add params - reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], shapes[0], new_rank ); - reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - inputs[1], shapes[1], new_rank ); - reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, - outputs[0], shapes[2], new_rank ); + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "operation", op_type ); - if (shapes[1][3] > shapes[0][3] && new_rank == 4) - { - vsi_nn_tensor_t* reshape_tmp; - reshape_tmp = reshape_tensors[0]; - reshape_tensors[0] = reshape_tensors[1]; - reshape_tensors[1] = reshape_tmp; - if (VSI_NN_RELATIONAL_OPS_GREAT == op_type) - { - op_type = VSI_NN_RELATIONAL_OPS_LESS; - } - else if (VSI_NN_RELATIONAL_OPS_LESS == op_type) - { - op_type = VSI_NN_RELATIONAL_OPS_GREAT; - } - else if (VSI_NN_RELATIONAL_OPS_GREAT_EQUAL == op_type) - { - op_type = VSI_NN_RELATIONAL_OPS_LESS_EQUAL; - } - else if (VSI_NN_RELATIONAL_OPS_LESS_EQUAL == op_type) - { - op_type = VSI_NN_RELATIONAL_OPS_GREAT_EQUAL; - } - } + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + kernel_name, + inputs, 2, + outputs, 1, param ); - param = vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_int32( param, "operation", op_type ); + vsi_nn_kernel_param_release( ¶m ); - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, - kernel_name, - &reshape_tensors[0], 2, - &reshape_tensors[2], 1, param ); - - vsi_nn_ReleaseTensor( &reshape_tensors[0] ); - vsi_nn_ReleaseTensor( &reshape_tensors[1] ); - vsi_nn_ReleaseTensor( &reshape_tensors[2] ); - - vsi_nn_kernel_param_release( ¶m ); - } if ( self->n ) { status = VSI_SUCCESS; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c index 9deb02e..4395961 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c @@ -37,6 +37,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" +#include "vsi_nn_error.h" static vsi_status op_compute ( @@ -81,8 +82,16 @@ static vsi_status op_compute inputs[0]->t, &reshape_param, sizeof(reshape_param), outputs[0]->t); vsi_safe_release_tensor(dims_tensor); #else + vsi_nn_tensor_t *tmp_tensor = NULL; + tmp_tensor = vsi_nn_reshape_tensor( self->graph, + outputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num ); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "create tensor fail.", final ); + self->n = vxTensorCopyNode(self->graph->g, - inputs[0]->t, outputs[0]->t); + inputs[0]->t, tmp_tensor->t); + +final: + vsi_safe_release_tensor(tmp_tensor); #endif if (NULL == self->n) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reversesequence.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reversesequence.c new file mode 100644 index 0000000..a75dcef --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reversesequence.c @@ -0,0 +1,210 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without 
restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _reversesequence_local_data_t { + int32_t placeholder; +} reversesequence_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t* param = NULL; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + uint32_t i = 0; + uint32_t new_rank = 3; + vsi_size_t shapes[VSI_NN_MAX_DIM_NUM] = { 1 }; + int32_t batch_axis = (int32_t)self->nn_param.reversesequence.batch_axis; + int32_t time_axis = (int32_t)self->nn_param.reversesequence.time_axis; + + if (inputs[0]->attr.dim_num == 2) + { + shapes[0] = 1; + shapes[1] = inputs[0]->attr.size[0]; + shapes[2] = inputs[0]->attr.size[1]; + } + if (inputs[0]->attr.dim_num > 2) + { + shapes[2] = inputs[0]->attr.size[inputs[0]->attr.dim_num - 1]; + shapes[1] = inputs[0]->attr.size[inputs[0]->attr.dim_num - 2]; + for (i = 0;i < inputs[0]->attr.dim_num - 2; i++) + { + shapes[0] = shapes[0] * inputs[0]->attr.size[i]; + } + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], shapes, new_rank ); + reshape_tensors[1] = inputs[1]; + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + outputs[0], shapes, new_rank ); + + param = vsi_nn_kernel_param_create(); + if (batch_axis == (int32_t)inputs[0]->attr.dim_num - 1) + { + batch_axis = 2; + } + else + { + batch_axis = 1; + } + if (time_axis == (int32_t)inputs[0]->attr.dim_num - 1) + { + time_axis = 2; + } + else + { + time_axis = 1; + } + + vsi_nn_kernel_param_add_int32(param, "batch_axis", batch_axis); + vsi_nn_kernel_param_add_int32(param, "time_axis", time_axis); + + self->n = (vx_node)vsi_nn_kernel_selector(self->graph,"reversesequence", + &reshape_tensors[0],_INPUT_NUM,&reshape_tensors[2],_OUTPUT_NUM,param); + if ( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release(¶m); + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + int32_t batch_axis = (int32_t)self->nn_param.reversesequence.batch_axis; + int32_t time_axis = 
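/*
 * Context for the axis validation below: op_compute above collapses the
 * input to rank 3 as {outer, size[dim_num - 2], size[dim_num - 1]} and
 * remaps the two reversible axes onto {1, 2}. For example, an input with
 * dim_num = 4 and sizes {s0, s1, s2, s3} is reshaped to {s0 * s1, s2, s3},
 * and batch_axis = 3 is remapped to 2; this is why only the two innermost
 * axes are accepted.
 */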
(int32_t)self->nn_param.reversesequence.time_axis; + BEGIN_IO_TYPE_DECL(REVERSESEQUENCE, 2, 1) + IO_TYPE(D_F32, D_I32, D_F32) + IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_BF16, D_I32, D_BF16) + IO_TYPE(D_I16|Q_SYM, D_I32, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I32, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) + IO_TYPE(D_F32, D_I32, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I32, D_I16|Q_SYM) + IO_TYPE(D_F32, D_I32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_I32, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_F32) + IO_TYPE(D_I8|Q_SYM, D_I32, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I32, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I32, D_F32) + IO_TYPE(D_F32, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_I32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I32, D_F16) + IO_TYPE(D_F32, D_I32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_I32, D_I8|Q_SYM) + IO_TYPE(D_F32, D_I32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I32, D_I8|Q_DFP) + END_IO_TYPE_DECL(REVERSESEQUENCE) + + if (!VALIDATE_OP_IO_TYPES( + REVERSESEQUENCE, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + if (inputs[0]->attr.dim_num < 2) + { + VSILOGE("inputs[0] dim_num should not be less than 2"); + return FALSE; + } + if ((batch_axis != (int32_t)inputs[0]->attr.dim_num - 1 && + batch_axis != (int32_t)inputs[0]->attr.dim_num - 2) || + (time_axis != (int32_t)inputs[0]->attr.dim_num - 1 && + time_axis != (int32_t)inputs[0]->attr.dim_num - 2)) + { + VSILOGE("batch_axis must be inputs[0]->attr.dim_num - 1 or inputs[0]->attr.dim_num - 2, and the same applies to time_axis"); + return FALSE; + } + if (inputs[1]->attr.size[0] != inputs[0]->attr.size[batch_axis]) + { + VSILOGE("inputs[1] should have shape `[batch_size]`"); + return FALSE; + } + + return TRUE; +} /* op_check() */ + + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ REVERSESEQUENCE, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c index 282de4e..a5f8261 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c @@ -311,6 +311,8 @@ static vsi_bool op_setup /* activation */ curr = vsi_nn_internal_new_node( self, vsi_nn_rnn_get_act_op_type(p->activation), 0, 0 ); + curr->node->nn_param.tanh.scale_a = 1.0; + curr->node->nn_param.tanh.scale_b = 1.0; curr->inputs[0] = gate_fc_outputs->t; curr->outputs[0] = outputs[RNNCELL_OUTPUT_OUTPUT]; vsi_nn_internal_setup_node(self, curr); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c index 49dbd7b..725ef90 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c @@ -54,12 +54,14 @@ static vsi_status op_compute float height_ratio = self->nn_param.roi_align.height_ratio; int32_t width_sample_num =
self->nn_param.roi_align.width_sample_num; int32_t height_sample_num = self->nn_param.roi_align.height_sample_num; + int32_t platform_type = self->nn_param.roi_align.platform_type; param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_float32( param, "width_ratio", width_ratio ); vsi_nn_kernel_param_add_float32( param, "height_ratio", height_ratio ); vsi_nn_kernel_param_add_int32( param, "width_sample_num", width_sample_num ); vsi_nn_kernel_param_add_int32( param, "height_sample_num", height_sample_num ); + vsi_nn_kernel_param_add_int32( param, "platform_type", platform_type ); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "roi_align", @@ -121,6 +123,17 @@ static vsi_bool op_setup } return TRUE; +} + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + self->nn_param.roi_align.platform_type = VSI_NN_ROI_ALIGN_ANDROID; + + return status; } /* op_init() */ @@ -131,7 +144,7 @@ extern "C" { DEF_OP_REG ( /* op_name */ ROI_ALIGN, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, /* deinit */ vsi_nn_op_common_deinit, /* check */ op_check, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c index e6ba3bf..4c36426 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c @@ -35,6 +35,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" #include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_tensor_util_prv.h" static vsi_status op_compute ( @@ -63,70 +64,133 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(RSQRT, 1, 1) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I16|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_SYM) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_ASYM) - IO_TYPE(D_F16, D_I8|Q_SYM) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_SYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_ASYM, D_F16) - IO_TYPE(D_I16|Q_SYM, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F32) + vsi_bool ret = vsi_nn_is_stream_process_supported_types(self->graph, inputs, self->input.num); - /* HW 9.0.1 */ - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + if (!ret) + { + BEGIN_IO_TYPE_DECL(RSQRT, 1, 1) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + 
IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_F32) + /* HW 9.0.1 */ + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_F32) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_F32) - IO_TYPE(D_F16, D_BF16) - IO_TYPE(D_F16, D_F32) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_F32) - /* HW 9.1.1 */ - IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) - IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) - IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) + IO_TYPE(D_F16, D_BF16) + IO_TYPE(D_F16, D_F32) - END_IO_TYPE_DECL(RSQRT) - if(!VALIDATE_OP_IO_TYPES(RSQRT, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_DFP) + IO_TYPE(D_U4|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I16|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I16|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I16|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_U4|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_I4|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_I4|Q_SYM, D_I16|Q_DFP) + IO_TYPE(D_U4|Q_ASYM, D_F16) + IO_TYPE(D_I4|Q_ASYM, D_F16) + IO_TYPE(D_I4|Q_SYM, D_F16) + IO_TYPE(D_U4|Q_ASYM, D_BF16) + IO_TYPE(D_I4|Q_ASYM, D_BF16) + IO_TYPE(D_I4|Q_SYM, D_BF16) + + IO_TYPE(D_U8|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I4|Q_SYM) + IO_TYPE(D_I8|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I4|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_U4|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I4|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I4|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_U4|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I4|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I4|Q_SYM) + IO_TYPE(D_I16|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_I4|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_U4|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I4|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I4|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_U4|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I4|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I4|Q_SYM) + IO_TYPE(D_F16, D_U4|Q_ASYM) + IO_TYPE(D_F16, D_I4|Q_ASYM) + IO_TYPE(D_F16, D_I4|Q_SYM) + IO_TYPE(D_BF16, D_U4|Q_ASYM) + IO_TYPE(D_BF16, D_I4|Q_ASYM) + IO_TYPE(D_BF16, D_I4|Q_SYM) + IO_TYPE(D_I32, D_I32) + + END_IO_TYPE_DECL(RSQRT) + if (!VALIDATE_OP_IO_TYPES(RSQRT, self, 
inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } } + return TRUE; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c index 14b3250..d8c9842 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c @@ -58,11 +58,6 @@ static vsi_status op_compute { coord_dim = (uint32_t)inputs[0]->attr.size[0]; } - if ( coord_dim > 3 ) - { - CHECK_STATUS(status); - return status; - } for (i = 0; i < inputs[0]->attr.dim_num; i++) { idx_num *= (uint32_t)inputs[0]->attr.size[i]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c index 384cb7f..63900eb 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c @@ -58,18 +58,13 @@ static vsi_status op_compute { coord_dim = (uint32_t)inputs[1]->attr.size[0]; } - if ( coord_dim > 4 && input_size[dims_num - 1] > 1) - { - CHECK_STATUS(status); - return status; - } for (i = 0; i < inputs[1]->attr.dim_num; i++) { idx_num *= (uint32_t)inputs[1]->attr.size[i]; } idx_num /= coord_dim; - param =vsi_nn_kernel_param_create(); + param = vsi_nn_kernel_param_create(); for (i = 0; i < dims_num; ++i) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_select.c b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c index 94e0110..485dcd5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_select.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c @@ -55,13 +55,16 @@ static vsi_status op_compute vsi_size_t rank_in[_INPUT_NUM]; uint32_t new_rank = 0; int32_t i = 0; - vsi_bool ret; + vsi_bool ret = FALSE; + vsi_nn_context_t ctx = NULL; - if( NULL == self ) + if ( NULL == self ) { return VSI_FAILURE; } + ctx = self->graph->ctx; + for (i = 0; i < _IO_NUM; i++) { shapes_ptr[i] = shapes[i]; @@ -78,7 +81,7 @@ static vsi_status op_compute outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes_ptr, shapes[_INPUT_NUM], &new_rank); - if( ret ) + if ( ret && !ctx->config.support_stream_processor ) { for (i = 0; i < _INPUT_NUM; i++) { @@ -98,11 +101,17 @@ static vsi_status op_compute for (i = 0; i < _IO_NUM; i++) { - vsi_nn_ReleaseTensor( &reshape_tensors[i] ); + vsi_safe_release_tensor( reshape_tensors[i] ); } } + else + { + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "select", + inputs, _INPUT_NUM, + outputs, _OUTPUT_NUM, NULL ); + } - if( self->n ) + if ( self->n ) { status = VSI_SUCCESS; } @@ -243,7 +252,7 @@ static vsi_bool op_setup in2_rank = inputs[2]->attr.dim_num; out_rank = vsi_nn_max(in0_rank, vsi_nn_max( in1_rank, in2_rank )); - for(i = 0; i < out_rank; i++) + for (i = 0; i < out_rank; i++) { vsi_size_t sz0, sz1, sz2; sz0 = i < in0_rank ? 
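/*
 * Standard multi-way broadcast rule, spelled out: a dimension missing from
 * an operand counts as 1, and each output dimension is the maximum over
 * condition, x and y. E.g. sizes {1, 4}, {3, 1} and {3, 4} produce
 * shape[0] = max(1, 3, 3) = 3 and shape[1] = max(4, 1, 4) = 4.
 */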
inputs[0]->attr.size[i] : 1; @@ -252,7 +261,7 @@ static vsi_bool op_setup shape[i] = vsi_nn_max(vsi_nn_max(sz0, sz1), sz2); } - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = (uint32_t)out_rank; memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) ); @@ -264,7 +273,7 @@ static vsi_bool op_setup total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, outputs[0]->attr.dim_num ); - if( total_size_expected != total_size_got ) + if ( total_size_expected != total_size_got ) { VSILOGW("Output size mismatch, expect %"VSI_SIZE_T_SPECIFIER", but got %"VSI_SIZE_T_SPECIFIER"", total_size_expected, total_size_got); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c index b8cd921..c816399 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c @@ -36,6 +36,7 @@ #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_math.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_tensor_util_prv.h" static vsi_status op_compute ( @@ -54,55 +55,60 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - /* check inputs outputs data type */ - BEGIN_IO_TYPE_DECL(SOFTMAX, 1, 1) - /* IO_TYPE(INPUT, OUTPUT) */ - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I16|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_SYM) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_SYM) - IO_TYPE(D_F16, D_I8|Q_ASYM) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_BF16, D_F16) + vsi_bool ret = vsi_nn_is_stream_process_supported_types(self->graph, inputs, self->input.num); - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F32) + if (!ret) + { + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(SOFTMAX, 1, 1) + /* IO_TYPE(INPUT, OUTPUT) */ + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_BF16, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_SYM, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_F32) - IO_TYPE(D_I8|Q_SYM, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32) + + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_SYM, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32) - IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) - IO_TYPE(D_I16|Q_ASYM, D_F16) - IO_TYPE(D_I16|Q_SYM, D_F16) - IO_TYPE(D_I16|Q_ASYM, D_F32) - IO_TYPE(D_I16|Q_SYM, D_F32) - IO_TYPE(D_I16|Q_DFP, D_F32) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - END_IO_TYPE_DECL(SOFTMAX) - if (!VALIDATE_OP_IO_TYPES(SOFTMAX, self, inputs, self->input.num, outputs, self->output.num)) { 
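/*
 * Same gating pattern as in rsqrt above: when
 * vsi_nn_is_stream_process_supported_types() reports that the stream
 * processor can consume these tensor types, the static IO_TYPE table is
 * bypassed. A minimal sketch of the resulting control flow:
 *
 *     if (vsi_nn_is_stream_process_supported_types(self->graph, inputs, self->input.num))
 *     {
 *         return TRUE;   // SP path performs its own type validation
 *     }
 *     // otherwise fall through to the BEGIN_IO_TYPE_DECL table
 */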
- char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32) + IO_TYPE(D_I16|Q_SYM, D_F32) + IO_TYPE(D_I16|Q_DFP, D_F32) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + END_IO_TYPE_DECL(SOFTMAX) + if (!VALIDATE_OP_IO_TYPES(SOFTMAX, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } } return TRUE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c index 81a0afd..0dbe88c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c @@ -32,9 +32,9 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "vsi_nn_log.h" +#include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_link_list.h" -#include "kernel/vsi_nn_kernel.h" #define MAX_SOFTMAX_BATCH 65520 @@ -150,7 +150,7 @@ vsi_status op_compute self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "softmax", inputs, 1, - outputs, 1, kernel_param );; + outputs, 1, kernel_param ); if( NULL != self->n ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c index 1dbe3ca..9b59d99 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c @@ -83,8 +83,9 @@ static vsi_bool op_setup vsi_size_t output_shape[2] = {1, 1}; vsi_nn_internal_node_t* curr = NULL; vsi_nn_tensor_t *output_rs = NULL; - vsi_nn_stack_lcl_data * data; + vsi_nn_stack_lcl_data * data = NULL; vsi_bool ret = TRUE; + vx_int8 is_scalar = vsi_nn_GetTensorIsScalar(inputs[0]); vsi_nn_internal_init_node_wksp( node ); @@ -101,11 +102,11 @@ static vsi_bool op_setup block_num *= inputs[0]->attr.size[i]; } - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { - outputs[0]->attr.dim_num = inputs[0]->attr.dim_num + 1; + outputs[0]->attr.dim_num = is_scalar ? 
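/*
 * Scalars are stored as rank-1, size-1 tensors and flagged via
 * vsi_nn_GetTensorIsScalar(), so stacking N scalar inputs must yield a
 * rank-1 vector of length N rather than rank dim_num + 1; hence the
 * ternary here.
 */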
1 : inputs[0]->attr.dim_num + 1; - for(i = 0, j = 0; j < outputs[0]->attr.dim_num; j++) + for (i = 0, j = 0; j < outputs[0]->attr.dim_num; j++) { if (j == p->axis) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c index aa22120..1cf2891 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -243,7 +243,9 @@ static vsi_status copy_tensor_to_view ( vsi_nn_node_t * self, vx_tensor src_tensor, - vsi_nn_tensor_t * dst_in + vsi_nn_tensor_t * dst_in, + vsi_size_t * shape, + vsi_bool is_same_shape ) { vsi_status ret; @@ -253,10 +255,15 @@ static vsi_status copy_tensor_to_view /* Malloc ptr */ data = self->nn_param.strided_slice.lcl2_data; data->src_tensor = src_tensor; - if (dst_in->t) + data->is_same_shape = is_same_shape; + if (dst_in->t && !is_same_shape) { - data->dst_tensor = vsi_nn_safe_reshape_tensor(dst_in->t, (void*)dst_in->attr.size, - (vsi_size_t)dst_in->attr.dim_num, sizeof(dst_in->attr.size[0])); + data->dst_tensor = vsi_nn_safe_reshape_tensor(dst_in->t, (void*)shape, + (vsi_size_t)dst_in->attr.dim_num, sizeof(shape[0])); + } + else if (dst_in->t) + { + data->dst_tensor = dst_in->t; } data->is_dataconvert_op = TRUE; @@ -734,24 +741,33 @@ static vsi_status op_optimize int32_t i = 0; vx_tensor in_view_tensor = NULL; vsi_nn_strided_slice_param *p = &(self->nn_param.strided_slice); - vsi_size_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; - vsi_size_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; - vsi_ssize_t start_dims[VSI_NN_MAX_DIM_NUM] = { 0 }; - vsi_ssize_t stop_dims[VSI_NN_MAX_DIM_NUM] = { 0 }; - vsi_ssize_t stride_dims[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_ssize_t start_dims[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_ssize_t stop_dims[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_ssize_t stride_dims[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool is_same_quant_type = FALSE; + vsi_bool is_same_shape = TRUE; /* Only forward run stride_slice's optimize */ - if( direction == VSI_NN_OPTIMIZE_BACKWARD ) + if ( direction == VSI_NN_OPTIMIZE_BACKWARD ) { return status; } - for(i = 0; i< VSI_NN_MAX_DIM_NUM; i++) + for (i = 0; i< VSI_NN_MAX_DIM_NUM; i++) { start_dims[i] = p->lcl2_data->begin_dims[i]; stop_dims[i] = p->lcl2_data->end_dims[i]; stride_dims[i] = p->lcl2_data->stride_dims[i]; + + shape[i] = (vsi_size_t)stop_dims[i] - (vsi_size_t)start_dims[i]; + if (shape[i] != outputs[0]->attr.size[i] && + i < (int32_t)outputs[0]->attr.dim_num) + { + is_same_shape = FALSE; + } } if (_check_is_same_shape(inputs, start_dims, stop_dims, stride_dims) == FALSE) @@ -782,7 +798,7 @@ static vsi_status op_optimize { VSILOGI( "stride slice copy tensor."); // Copy old tensor values to the new address. 
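/*
 * copy_tensor_to_view() now also receives the sliced shape and an
 * is_same_shape flag. Ownership sketch of the destination handle under
 * this scheme (mirrored by the op_deinit change further down):
 *
 *     if (!is_same_shape)
 *         data->dst_tensor = vsi_nn_safe_reshape_tensor(...); // new reference,
 *                                                             // released in op_deinit
 *     else
 *         data->dst_tensor = dst_in->t;                       // borrowed, must not be released
 */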
- status = copy_tensor_to_view( self, in_view_tensor, outputs[0]); + status = copy_tensor_to_view( self, in_view_tensor, outputs[0], shape, is_same_shape); if ( VSI_FAILURE == status ) { goto OnError; @@ -835,7 +851,7 @@ static vsi_status op_deinit vxReleaseTensor( &lcl2_data->src_tensor ); } - if (lcl2_data->dst_tensor) + if (lcl2_data->dst_tensor && !lcl2_data->is_same_shape) { vxReleaseTensor( &lcl2_data->dst_tensor ); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c index e1e2615..d797af2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c @@ -73,6 +73,8 @@ static vsi_bool op_check { BEGIN_IO_TYPE_DECL(TOPK, _INPUT_NUM, _OUTPUT_NUM) IO_TYPE(D_F16, D_F16, D_I32) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_I32) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I32) IO_TYPE(D_F32, D_F32, D_I32) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32) IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c index 6c04ec6..a6d5266 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c @@ -244,7 +244,7 @@ static vsi_bool op_setup curr_param->internal_dtype[k] = reshape_output->attr.dtype; } } - } + } memcpy( curr->node->nn_param.rnncell_ovxlib.internal_dtype, curr_param->internal_dtype, sizeof( curr_param->internal_dtype ) ); curr->inputs[RNNCELL_INPUT_INPUT] = reshape_output; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c index 676326b..7e57e32 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c @@ -91,58 +91,63 @@ static vsi_bool op_setup vsi_size_t block_size = 1; vsi_size_t block_num = 1; uint32_t axis = 0; - uint32_t i, j; + uint32_t i = 0, j = 0; + uint32_t rank = inputs[0]->attr.dim_num; + int8_t is_scalar = (rank - 1) == 0 ? TRUE : FALSE; vsi_nn_internal_init_node_wksp( self ); - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { p = (vsi_nn_unstack_param *)&(self->nn_param.unstack); - if(p->axis == 0) + + if (p->axis == 0) { - for(i = 0; i < inputs[0]->attr.dim_num - 1; i++) + for (j = 0; j < self->output.num; j++) { - for(j = 0; j < self->output.num; j++) + for (i = 0; i < inputs[0]->attr.dim_num - 1; i++) { outputs[j]->attr.size[i] = inputs[0]->attr.size[i + 1]; } + outputs[j]->attr.size[0] = is_scalar ? 1 : outputs[j]->attr.size[0]; } - for(j = 0; j < self->output.num; j++) + for (j = 0; j < self->output.num; j++) { - outputs[j]->attr.dim_num = inputs[0]->attr.dim_num - 1; + outputs[j]->attr.dim_num = is_scalar ? 
1 : (rank - 1); + vsi_nn_SetTensorIsScalar(outputs[j], is_scalar); } } - else if(p->axis == 1) + else if (p->axis == 1) { - for(j = 0; j < self->output.num; j++) + for (j = 0; j < self->output.num; j++) { outputs[j]->attr.size[0] = inputs[0]->attr.size[0]; - for(i = 1; i < inputs[0]->attr.dim_num-1; i++) + for (i = 1; i < inputs[0]->attr.dim_num-1; i++) { outputs[j]->attr.size[i] = inputs[0]->attr.size[i + 1]; } outputs[j]->attr.dim_num = inputs[0]->attr.dim_num - 1; } } - else if(p->axis == 2) + else if (p->axis == 2) { - for(j = 0; j < self->output.num; j++) + for (j = 0; j < self->output.num; j++) { outputs[j]->attr.size[0] = inputs[0]->attr.size[0]; outputs[j]->attr.size[1] = inputs[0]->attr.size[1]; - for(i = 2; i < inputs[0]->attr.dim_num - 1; i++) + for (i = 2; i < inputs[0]->attr.dim_num - 1; i++) { outputs[j]->attr.size[i] = inputs[0]->attr.size[i + 1]; } outputs[j]->attr.dim_num = inputs[0]->attr.dim_num - 1; } } - else if(p->axis == 3) + else if (p->axis == 3) { - for(j = 0; j < self->output.num; j++) + for (j = 0; j < self->output.num; j++) { outputs[j]->attr.size[0] = inputs[0]->attr.size[0]; outputs[j]->attr.size[1] = inputs[0]->attr.size[1]; diff --git a/src/tim/vx/internal/src/post/vsi_nn_post_cmupose.c b/src/tim/vx/internal/src/post/vsi_nn_post_cmupose.c new file mode 100644 index 0000000..7d1b9cf --- /dev/null +++ b/src/tim/vx/internal/src/post/vsi_nn_post_cmupose.c @@ -0,0 +1,2046 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include +#include +#include + +#include "vsi_nn_context.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_node_attr_template.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_util.h" +#include "post/vsi_nn_post_cmupose.h" +#include "vsi_nn_error.h" + +static const int32_t limbSeq[19][2] = {{2,3}, {2,6}, {3,4}, {4,5}, {6,7}, + {7,8}, {2,9}, {9,10}, {10,11}, {2,12}, + {12,13}, {13,14}, {2,1}, {1,15}, {15,17}, + {1,16}, {16,18}, {3,17}, {6,18}}; +static const int32_t mapIdx[19][2] = {{31,32}, {39,40}, {33,34}, {35,36}, {41,42}, + {43,44}, {19,20}, {21,22}, {23,24}, {25,26}, + {27,28}, {29,30}, {47,48}, {49,50}, {53,54}, + {51,52}, {55,56}, {37,38}, {45,46}}; +uint32_t peak_id = 0; + +// this is a temp test function +#if 0 +static void savetxt(char *filename, float *buffer, uint32_t sz) +{ +#define _MAX_BUFFER_SZ (512) + FILE * fp; + const float c_flush_th = 0.7f; + uint8_t buf[_MAX_BUFFER_SZ]; + uint32_t count,i; + + count = 0; + fp = vsi_nn_fopen( filename, "w" ); + for( i = 0; i < sz; i ++ ) + { + count += snprintf( (char *)&buf[count], _MAX_BUFFER_SZ - count, + "%f\n", buffer[i]); + if( ((float)count / _MAX_BUFFER_SZ) > c_flush_th ) + { + fwrite( buf, count, 1, fp ); + count = 0; + } + } + + fwrite( buf, count, 1, fp ); + fclose( fp ); +} +#endif + +static void _init_subset(vsi_nn_link_list_t *node) +{ + vsi_nn_subset_t *ptr = NULL; + ptr = (vsi_nn_subset_t *)node; + ptr->link_list.next = NULL; + ptr->link_list.prev = NULL; + memset(&ptr->data.idx, 0, sizeof(float) * 20); +} + +static void _init_candidate(vsi_nn_link_list_t *node) +{ + vsi_nn_con_candidate_t *ptr = NULL; + ptr = (vsi_nn_con_candidate_t *)node; + ptr->link_list.next = NULL; + ptr->link_list.prev = NULL; + memset(&ptr->data, 0, sizeof(vsi_nn_con_candidate_data_t)); +} + +static void _init_connection(vsi_nn_link_list_t *node) +{ + vsi_nn_connection_t *ptr = NULL; + ptr = (vsi_nn_connection_t *)node; + ptr->link_list.next = NULL; + ptr->link_list.prev = NULL; + memset(&ptr->data, 0, sizeof(vsi_nn_connection_data_t)); +} + +static void _init_peak(vsi_nn_link_list_t *node) +{ + vsi_nn_peaks_t *box = NULL; + box = (vsi_nn_peaks_t *)node; + box->link_list.next = NULL; + box->link_list.prev = NULL; + memset(&box->peak, 0, sizeof(vsi_nn_peaks_data_t)); +} + +static vsi_status _cmupose_init_multiplier + ( + vsi_nn_cmupose_config_t *config, + vsi_nn_cmupose_multiplier_t *multiplier + ) +{ + uint32_t i,num; + uint32_t boxsize,width; + vsi_status status = VSI_FAILURE; + + if(NULL == config || NULL == multiplier) + { + return VSI_FAILURE; + } + + num = config->param.scale_search.num; + multiplier->size = (float *)malloc(num * sizeof(float)); + CHECK_PTR_FAIL_GOTO( multiplier->size, "Create buffer fail.", final ); + status = VSI_SUCCESS; + multiplier->num = num; + + boxsize = config->model.boxsize; + width = config->image.width; + memset(multiplier->size, 0, sizeof(float) * num); + for(i=0; iparam.scale_search.size[i]; + multiplier->size[i] = x * boxsize / width; + } + +final: + + return status; +} + +static vsi_status _cmupose_init_heatmap_avg + ( + vsi_nn_cmupose_config_t *config, + float **heatmap_avg + ) +{ + uint32_t w,h,channel; + vsi_size_t sz; + vsi_status status = VSI_FAILURE; + + if(NULL == config || NULL == heatmap_avg) + { + return VSI_FAILURE; + } + +#define 
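/*
 * Channel-count conventions assumed from the CMU-pose model: the heatmap
 * holds 19 channels (18 body-part keypoints plus one background map), and
 * the PAF output defined below holds 38 channels, i.e. an (x, y) vector
 * field for each of the 19 limbs enumerated in limbSeq[].
 */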
VSI_NN_POST_CMUPOSE_DEF_HEATMAP 19 + channel = VSI_NN_POST_CMUPOSE_DEF_HEATMAP; + w = config->image.width; + h = config->image.height; + sz = channel * w * h; + *heatmap_avg = (float *)malloc(sizeof(float) * sz); + CHECK_PTR_FAIL_GOTO( *heatmap_avg, "Create buffer fail.", final ); + status = VSI_SUCCESS; + memset(*heatmap_avg, 0, sizeof(float) * sz); +final: + + return status; +} + +static vsi_status _cmupose_init_paf_avg + ( + vsi_nn_cmupose_config_t *config, + float **paf_avg + ) +{ + uint32_t w,h,channel; + vsi_size_t sz; + vsi_status status = VSI_FAILURE; + + if(NULL == config || NULL == paf_avg) + { + return VSI_FAILURE; + } + +#define VSI_NN_POST_CMUPOSE_DEF_PAF 38 + channel = VSI_NN_POST_CMUPOSE_DEF_PAF; + w = config->image.width; + h = config->image.height; + sz = channel * w * h; + *paf_avg = (float *)malloc(sizeof(float) * sz); + CHECK_PTR_FAIL_GOTO( *paf_avg, "Create buffer fail.", final ); + status = VSI_SUCCESS; + memset(*paf_avg, 0, sizeof(float) * sz); + +final: + + return status; +} + +static void _cmupose_deinit + ( + vsi_nn_cmupose_multiplier_t *multiplier, + float *heatmap_avg, + float *paf_avg + ) +{ + if(multiplier->size)free(multiplier->size); + if(heatmap_avg)free(heatmap_avg); + if(paf_avg)free(paf_avg); +} + +static vsi_status _cmupose_init + ( + vsi_nn_cmupose_config_t *config, + vsi_nn_cmupose_multiplier_t *multiplier, + float **heatmap_avg, + float **paf_avg + ) +{ + vsi_status status = VSI_FAILURE; + + if(NULL == config || NULL == multiplier + || NULL == heatmap_avg || NULL == paf_avg) + { + return status; + } + + status = _cmupose_init_multiplier(config, multiplier); + if(VSI_SUCCESS != status) + { + goto error; + } + status = _cmupose_init_heatmap_avg(config, heatmap_avg); + if(VSI_SUCCESS != status) + { + goto error; + } + status = _cmupose_init_paf_avg(config, paf_avg); + if(VSI_SUCCESS != status) + { + goto error; + } + + return status; +error: + _cmupose_deinit(multiplier, *heatmap_avg, *paf_avg); + return status; +} + +#if 0 +static vx_status resize_binlinear + ( + float *src, + uint32_t *src_size, + float *dst, + uint32_t *dst_size + ) +{ + vsi_status status = VSI_SUCCESS; + uint32_t i,j,n,k; + float *src_ptr = NULL, *dst_ptr = NULL; + uint32_t src_w, src_h, dst_w, dst_h; + float xRatio, yRatio; + uint32_t LineSize,GrayStride; + uint32_t index00,index01,index10,index11; + float value00,value01,value10,value11; + double temp; + + src_ptr = src; + dst_ptr = dst; + + src_w = src_size[0]; + src_h = src_size[1]; + dst_w = dst_size[0]; + dst_h = dst_size[1]; + + xRatio = (float)src_w / dst_w; + yRatio = (float)src_h / dst_h; + + GrayStride = 19; /* in gray, channel = 1*/ + LineSize = src_w * GrayStride; + + n = 1; + for(i = 0; i < dst_h; i++) + { + float srcY = i * yRatio; + uint32_t IntY = (uint32_t)srcY; + float v = srcY - IntY; + float v1 = 1.f - v; + + for(j = 0; j < dst_w; j++) + { + float srcX = j * xRatio; + uint32_t IntX = (uint32_t)srcX; + float u = srcX - IntX; + float u1 = 1.f - u; + + /* + index00 -------- index01 + | | + | index(x,y) | + | | + | | + index10 -------- index11 + */ + index00 = IntY * LineSize + IntX * GrayStride; + index10; + if(IntY < src_h - 1) + index10 = index00 + LineSize; + else + index10 = index00; + + index01,index11; + if(IntX < src_w) + { + index01 = index00 + GrayStride; + index11 = index10 + GrayStride; + } + else + { + index01 = index00; + index11 = index10; + } + + for(k=0; k<19; k++) + { + value00 = src_ptr[index00 + k]; + value01 = src_ptr[index01 + k]; + value10 = src_ptr[index10 + k]; + value11 = src_ptr[index11 + 
k]; + temp = v1 * (u * value01 + u1 * value00) + v * (u * value11 + u1 * value10); + + *dst_ptr = (float)temp; + dst_ptr++; + } + + n++; + } + } + + return VX_SUCCESS; +} +#endif + +static vx_status resize_nearest + ( + float *src, + uint32_t *src_size, + float *dst, + uint32_t *dst_size + ) +{ + uint32_t w = 0, h = 0; + + uint32_t output_depth = dst_size[2]; + uint32_t output_height = dst_size[1]; + uint32_t output_width = dst_size[0]; + + uint32_t input_depth = src_size[2]; + uint32_t input_height = src_size[1]; + uint32_t input_width = src_size[0]; + + float width_scale = (input_width * 1.0f)/output_width; + float height_scale = (input_height * 1.0f)/output_height; + + uint32_t depthf = output_depth*sizeof(float); + uint32_t stride_out = output_depth * output_width; + uint32_t stride_in = input_depth * input_width; + + for(h = 0; h < output_height; h ++) + { + uint32_t in_y = vsi_nn_min((uint32_t)(h * height_scale), input_height - 1); + for (w = 0; w < output_width; w ++) + { + uint32_t in_x = vsi_nn_min((uint32_t)(w * width_scale), input_width - 1); + + uint32_t index_out,index_in; + index_out = stride_out * h + output_depth * w; + index_in = stride_in * in_y + input_depth * in_x; + + memcpy(dst + index_out, src + index_in, depthf); + } + } + + return VSI_SUCCESS; +} + +static double *create_gaussian_kernel + ( + float sigma, + int32_t *size + ) +{ + double *kernel = NULL; + int32_t ksz,i; + double sum,scale2X; + static double kernel_25[25] = {0.000045, 0.000160, 0.000514, 0.001477, 0.003799, + 0.008741, 0.017997, 0.033160, 0.054672, 0.080659, + 0.106486, 0.125798, 0.132985, 0.125798, 0.106486, + 0.080659, 0.054672, 0.033160, 0.017997, 0.008741, + 0.003799, 0.001477, 0.000514, 0.000160, 0.000045}; + + ksz = vsi_nn_max(1, ((int32_t)(4.0 * sigma + 1.0 - 1e-8))) * 2 + 1; + kernel = (double *)malloc(sizeof(double) * ksz); + CHECK_PTR_FAIL_GOTO( kernel, "Create buffer fail.", final ); + memset(kernel, 0, sizeof(double) * ksz); + + if(ksz == 25) + { + memcpy(kernel, kernel_25, sizeof(double) * ksz); + goto final; + } + + sum = 0.f; + scale2X = -0.5 / (sigma * sigma); + for(i=0; iimage.width; + h = config->image.height; + sz = w * h; + szf = sizeof(float) * sz; + szwf = sizeof(float) * w; + szhf = sizeof(float) * h; + temp = (float *)malloc(szf); + CHECK_PTR_FAIL_GOTO( temp, "Create buffer fail.", final ); + memset(temp, 0, szf); + + rows = NULL; + conv_buffer = (float *)malloc(szwf); + CHECK_PTR_FAIL_GOTO( conv_buffer, "Create buffer fail.", final ); + for(i=0; iparam.thre1; + width = config->image.width; + height = config->image.height; + + for(i=0; i= map_left[index] && + map[index] >= map_right[index] && + map[index] >= map_up[index] && + map[index] >= map_down[index] && + map[index] >= thre1) + { + peak = (vsi_nn_peaks_t *) + vsi_nn_LinkListNewNode(sizeof(vsi_nn_peaks_t), _init_peak); + CHECK_PTR_FAIL_GOTO( peak, "get point fail.", final ); + score = map_ori[i * width + j]; + + peak->peak.id = peak_id; + peak->peak.score = score; + peak->peak.location[0] = j; + peak->peak.location[1] = i; + + vsi_nn_LinkListPushEnd( + (vsi_nn_link_list_t **)&peak_list, + (vsi_nn_link_list_t *)peak ); + + #if 0 + printf("peak[%u %u %f %u]\n", + peak->peak.location[0], + peak->peak.location[1], + peak->peak.score, + peak->peak.id); + #endif + peak_id++; + } + } + } + +final: + return peak_list; +} + +static vsi_status _get_score_mid + ( + float *paf_avg, + const int32_t *mapIdx_k, + vsi_nn_cmupose_config_t *config, + float *score_mid + ) +{ + uint32_t w,h,width,height; + uint32_t s1,s2,index_src,index_out; 
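/* [Editor's note - illustrative, not part of the patch]
 * _get_score_mid() extracts the two PAF channels that belong to limb k from
 * the packed 38-channel paf_avg buffer. mapIdx_k holds channel indices into
 * the full 57-channel network output (19 heatmap channels plus 38 PAF
 * channels), so subtracting 19 maps them into the PAF block. A minimal
 * sketch of the copy, assuming the HWC layout used by resize_nearest():
 *
 *     for(h = 0; h < height; h++)
 *         for(w = 0; w < width; w++)
 *         {
 *             score_mid[(h * width + w) * 2 + 0] = paf_avg[(h * width + w) * 38 + s1];
 *             score_mid[(h * width + w) * 2 + 1] = paf_avg[(h * width + w) * 38 + s2];
 *         }
 */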
+ uint32_t c1,c2; + uint32_t sz_c1_w,sz_c2_w; + + if(NULL == paf_avg || NULL == mapIdx_k || NULL == config || NULL == score_mid) + { + return VSI_FAILURE; + } + + s1 = mapIdx_k[0] - 19; + s2 = mapIdx_k[1] - 19; + width = config->image.width; + height = config->image.height; + + c1 = 2; + c2 = 38; + sz_c1_w = c1 * width; + sz_c2_w = c2 * width; + memset(score_mid, 0, sizeof(float) * width * height * c1); + for(h=0; hpeak.location[0], iter->peak.location[1], iter->peak.score, iter->peak.id); + #endif + iter = (vsi_nn_peaks_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)iter); + n++; + } + *candX = peak; + *num = n; + + return VSI_SUCCESS; +} + +static vsi_status _get_peak_data + ( + vsi_nn_peaks_t *peaks, + uint32_t index, + vsi_nn_peaks_data_t *data + ) +{ + vsi_nn_peaks_t *iter; + uint32_t n; + vsi_status status; + + status = VSI_FAILURE; + if(NULL == peaks || NULL == data) + { + return status; + } + + n = 0; + iter = peaks; + memset(data, 0, sizeof(vsi_nn_peaks_data_t)); + while (iter) + { + if(n == index) + { + data->id = iter->peak.id; + data->score = iter->peak.score; + data->location[0] = iter->peak.location[0]; + data->location[1] = iter->peak.location[1]; + status = VSI_SUCCESS; + break; + } + iter = (vsi_nn_peaks_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)iter); + n++; + } + + return status; +} + +static vsi_status _line_space + ( + float start, + float end, + uint32_t num, + float *outputs + ) +{ + vsi_status status; + float step,sum; + uint32_t i; + + status = VSI_FAILURE; + if(NULL == outputs) + { + return status; + } + + memset(outputs, 0, sizeof(float) * num); + step = (end - start) / (num - 1); + sum = start; + for(i=0; i thre2) + { + nonzero++; + } + } + + if(nonzero > (0.8 * num)) + { + return TRUE; + } + + return FALSE; +} + +static void _sort_con_candidate + ( + vsi_nn_con_candidate_t *node + ) +{ + vsi_nn_con_candidate_t *p = NULL, *q = NULL; + vsi_nn_con_candidate_data_t temp; + + p = node; + while (p) + { + q = (vsi_nn_con_candidate_t *)p->link_list.next; + while (q) + { + if(q->data.score > p->data.score) + { + memmove(&temp, &q->data, sizeof(vsi_nn_con_candidate_data_t)); + memmove(&q->data, &p->data, sizeof(vsi_nn_con_candidate_data_t)); + memmove(&p->data, &temp, sizeof(vsi_nn_con_candidate_data_t)); + } + q = (vsi_nn_con_candidate_t *)q->link_list.next; + } + p = (vsi_nn_con_candidate_t *)p->link_list.next; + } +} + +static vsi_nn_con_candidate_t *_get_connection_candidate + ( + float *score_mid, + vsi_nn_peaks_t *candA, + vsi_nn_peaks_t *candB, + uint32_t nA, + uint32_t nB, + vsi_nn_cmupose_config_t *config, + uint32_t *candidate_sum + ) +{ + uint32_t i,j,x,sum; + vsi_nn_peaks_data_t candA_data,candB_data; + float norm,vec[2],vec_x[10],vec_y[10],score_midpts[10]; + float score_midpts_sum,score_with_dist_prior; + int32_t veci[2],r0,r1; + uint32_t mid_num; + uint32_t height,width,score_mid_depth,stride; + vsi_bool criterion1 = FALSE, criterion2 = FALSE; + float linespace1[10],linespace2[10],startend[10][2]; + vsi_nn_con_candidate_t *con_candidate,*con_candidate_list; + + height = config->image.height; + width = config->image.width; + mid_num = _cnt_of_array(linespace1); //config->param.mid_num; + score_mid_depth = 2; + stride = width * score_mid_depth; + vsi_nn_LinkListInitRoot(con_candidate_list); + sum = 0; + for(i=0; iparam.thre2); + if(score_with_dist_prior > 0) + { + criterion2 = TRUE; + } + + if(criterion1 && criterion2) + { + con_candidate = (vsi_nn_con_candidate_t *) + vsi_nn_LinkListNewNode(sizeof(vsi_nn_con_candidate_t), _init_candidate); + + sum++; + 
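/* [Editor's note - illustrative, not part of the patch]
 * In the reference CMUPose post-processing, the score computed for each
 * candidate pair (A, B) is a line integral of the part-affinity field
 * sampled at mid_num points along the A->B segment, plus a distance prior:
 *
 *     v = (B - A) / |B - A|
 *     score_midpts[n] = dot(paf(p_n), v),   p_n on the segment A->B
 *     score_with_dist_prior = mean(score_midpts)
 *                           + min(0.5 * height / |B - A| - 1, 0)
 *
 * criterion1 requires more than 80% of the samples to exceed thre2 (the
 * 0.8 * num check in the helper above); criterion2 requires the overall
 * score to stay positive.
 */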
con_candidate->data.i = i; + con_candidate->data.j = j; + con_candidate->data.score = score_with_dist_prior; + con_candidate->data.candAB = + score_with_dist_prior + candA_data.score + candB_data.score; + + vsi_nn_LinkListPushEnd( + (vsi_nn_link_list_t **)&con_candidate_list, + (vsi_nn_link_list_t *)con_candidate ); + } + } + } + + *candidate_sum = sum; + return con_candidate_list; +} + +static vsi_status _get_connection_candidate_data + ( + vsi_nn_con_candidate_t *con_candidate, + uint32_t index, + vsi_nn_con_candidate_data_t *data + ) +{ + vsi_nn_con_candidate_t *iter; + uint32_t n; + + if(NULL == con_candidate || NULL == data) + { + return VSI_FAILURE; + } + + n = 0; + iter = con_candidate; + memset(data, 0, sizeof(vsi_nn_con_candidate_data_t)); + while(iter) + { + if(n == index) + { + data->i = iter->data.i; + data->j = iter->data.j; + data->score = iter->data.score; + data->candAB = iter->data.candAB; + } + n++; + iter = (vsi_nn_con_candidate_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)iter); + } + + return VSI_SUCCESS; +} + +static vsi_bool _check_connection_candidate_ij + ( + vsi_nn_con_candidate_data_t *data, + vsi_nn_connection_t *connection + ) +{ + vsi_nn_connection_t *iter; + uint32_t i,j; + if(NULL == data) + { + return FALSE; + } + + i = data->i; + j = data->j; + iter = connection; + while (iter) + { + if(iter->data.i == i || iter->data.j == j) + { + return FALSE; + } + iter = (vsi_nn_connection_t *)vsi_nn_LinkListNext((vsi_nn_link_list_t *)iter); + } + + return TRUE; +} + +static vsi_nn_connection_t *_get_connection + ( + vsi_nn_con_candidate_t *con_candidate, + uint32_t candidate_sum, + vsi_nn_peaks_t *candA, + vsi_nn_peaks_t *candB, + uint32_t nA, + uint32_t nB, + uint32_t *connection_sum + ) +{ + uint32_t c,sum; + vsi_nn_connection_t *connection_list,*connection; + vsi_nn_con_candidate_data_t candidate_data; + vsi_nn_peaks_data_t candA_data,candB_data; + vsi_bool ret; + + if(NULL == con_candidate || + NULL == candA || + NULL == candB || + NULL == connection_sum) + { + return NULL; + } + + sum = 0; + vsi_nn_LinkListInitRoot(connection_list); + for(c=0; cdata.i = candidate_data.i; + connection->data.j = candidate_data.j; + connection->data.score = candidate_data.score; + connection->data.x = candA_data.id; + connection->data.y = candB_data.id; + + vsi_nn_LinkListPushEnd( + (vsi_nn_link_list_t **)&connection_list, + (vsi_nn_link_list_t *)connection ); + sum++; + + #if 0 + connection = connection_list; + printf("======================= c=%u\n",c); + while (connection) + { + printf("[ %u %u %f %u %u ]\n", + connection->data.x, connection->data.y, connection->data.score, + connection->data.i, connection->data.j); + connection = (vsi_nn_connection_t *) + vsi_nn_LinkListNext( (vsi_nn_link_list_t *)connection); + } + #endif + + if(sum >= vsi_nn_min(nA, nB)) + { + break; + } + } + } + + #if 0 + connection = connection_list; + printf("============================\n"); + while (connection) + { + printf("[ %u %u %f %u %u ]\n", + connection->data.x, connection->data.y, connection->data.score, + connection->data.i, connection->data.j); + connection = (vsi_nn_connection_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)connection); + } + #endif + +final: + return connection_list; +} + +static vsi_nn_peaks_data_t *_get_peak_candidate + ( + vsi_nn_peaks_t **all_peaks, + uint32_t all_peaks_num, + uint32_t peak_counter + ) +{ + uint32_t i,j; + vsi_nn_peaks_data_t *candidate,*iter; + vsi_nn_peaks_t *peaks; + + if(NULL == all_peaks) + { + return NULL; + } + candidate = (vsi_nn_peaks_data_t 
*)malloc(sizeof(vsi_nn_peaks_data_t) * peak_counter); + CHECK_PTR_FAIL_GOTO( candidate, "Create buffer fail.", final ); + + iter = candidate; + for(i=0,j=0; i< all_peaks_num; i++,j++) + { + peaks = all_peaks[i]; + while (peaks) + { + memcpy(iter, &peaks->peak, sizeof(vsi_nn_peaks_data_t)); + peaks = (vsi_nn_peaks_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)peaks); + iter++; + } + } + +final: + return candidate; +} + +static vsi_bool _check_special_k + ( + int32_t *special_k, + uint32_t special_k_num, + uint32_t index + ) +{ + uint32_t i; + + for(i=0; idata.x; + } + else if(index == 1) + { + parts[num++] = iter->data.y; + } + else + { + return VSI_FAILURE; + } + iter = (vsi_nn_connection_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)iter); + } + + return VSI_SUCCESS; +} + +static vsi_bool _need_merge + ( + vsi_nn_subset_t *j1_subset, + vsi_nn_subset_t *j2_subset + ) +{ + uint32_t i,n; + + if(NULL == j1_subset || NULL == j2_subset) + { + return FALSE; + } + + n = _cnt_of_array(j1_subset->data.idx); + + for(i=0; idata.idx[i] >= 0) + { + idx1 = 1; + } + if(j2_subset->data.idx[i] >= 0) + { + idx2 = 1; + } + + if(idx1 == 1 && idx2 == 1) + { + return TRUE; + } + } + + return FALSE; +} + +static void _release_all_connection + ( + vsi_nn_connection_t **all_connection, + uint32_t connection_list_num + ) +{ + uint32_t i; + vsi_nn_connection_t *connection; + for(i=0; idata.idx[indexA] == partAs[i] || + sig_subset->data.idx[indexB] == partBs[i]) + { + subset_idx[found] = j; + found += 1; + } + } + + if(found == 1) + { + j = subset_idx[0]; + sig_subset= (vsi_nn_subset_t *) + vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)subset_list, j); + CHECK_PTR_FAIL_GOTO( sig_subset, "get point fail.", final ); + if(sig_subset->data.idx[indexB] != partBs[i]) + { + int32_t ii = partBs[i]; + sig_connect = (vsi_nn_connection_t *) + vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)connection_k, i); + sig_subset->data.idx[indexB] = (float)ii; + sig_subset->data.idx[20 - 1] += 1; + sig_subset->data.idx[20 - 2] += + candidate[ii].score + sig_connect->data.score; + } + } + else if(found == 2) + { + int32_t j1 = subset_idx[0]; + int32_t j2 = subset_idx[1]; + vsi_nn_subset_t *j1_subset = (vsi_nn_subset_t *) + vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)subset_list, j1); + vsi_nn_subset_t *j2_subset = NULL; + CHECK_PTR_FAIL_GOTO( j1_subset, "get point fail.", final ); + j2_subset = (vsi_nn_subset_t *) + vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)subset_list, j2); + CHECK_PTR_FAIL_GOTO( j2_subset, "get point fail.", final ); + if(_need_merge(j1_subset, j2_subset) == FALSE) + { + uint32_t ii; + vsi_nn_subset_t *j1_iter = j1_subset; + vsi_nn_subset_t *j2_iter = j2_subset; + sig_connect = (vsi_nn_connection_t *) + vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)connection_k, i); + for(ii=0; ii<(20-2); ii++) + { + j1_iter->data.idx[ii] += j2_iter->data.idx[ii] + 1; + } + for(ii=(20-2); ii<20; ii++) + { + j1_iter->data.idx[ii] += j2_iter->data.idx[ii]; + } + j1_iter->data.idx[20 - 2] += sig_connect->data.score; + vsi_nn_LinkListDelIndexNode((vsi_nn_link_list_t **)&subset_list, j2); + num--; + } + else + { + float sum = 0.f; + int32_t ii = partBs[i]; + sig_connect = (vsi_nn_connection_t *) + vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)connection_k, i); + sum = candidate[ii].score + sig_connect->data.score; + j1_subset->data.idx[indexB] = (float)ii; + j1_subset->data.idx[20 - 1] += 1; + j1_subset->data.idx[20 - 2] += sum; + } + } + else if(found == 0 && k < 17) + { + float sum = 0.f; + float row[20] = 
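/* [Editor's note - not part of the patch]
 * C aggregate initialization makes `float row[20] = {-1.0};` set only
 * row[0] to -1.0; the other 19 elements become 0.0. That is why the loop a
 * few lines below re-fills every element with -1.0 before the slots for
 * indexA/indexB and the two bookkeeping fields (count, total score) are
 * written.
 */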
{-1.0}; + uint32_t t1 = 0, t2 = 0; + uint32_t l = 0; + float s1 = 0.f, s2 = 0.f; + sig_connect = (vsi_nn_connection_t *) + vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)connection_k, i); + CHECK_PTR_FAIL_GOTO( sig_connect, "get point fail.", final ); + t1 = sig_connect->data.x; + t2 = sig_connect->data.y; + s1 = candidate[t1].score; + s2 = candidate[t2].score; + sum = s1 + s2 + sig_connect->data.score; + + for(l=0; l<_cnt_of_array(row); l++) + { + row[l] = -1.0; + } + row[indexA] = (float)partAs[i]; + row[indexB] = (float)partBs[i]; + row[20 - 1] = 2.f; + row[20 - 2] = sum; + + subset = (vsi_nn_subset_t *) + vsi_nn_LinkListNewNode(sizeof(vsi_nn_subset_t), _init_subset); + + memcpy(&subset->data, row, sizeof(float) * 20); + + vsi_nn_LinkListPushEnd( + (vsi_nn_link_list_t **)&subset_list, + (vsi_nn_link_list_t *)subset ); + num++; + } + + connection_k = (vsi_nn_connection_t *) + vsi_nn_LinkListNext( (vsi_nn_link_list_t *)connection_k); + } /* end for(i=0; idata.idx[20 - 1]; + float tmp2 = subset->data.idx[20 - 2]; + if(tmp1 < 4 || (tmp2 / tmp1) < 0.4) + { + deleteIdx[j++] = i; + } + subset = (vsi_nn_subset_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)subset); + } + for(i=0; idata.idx[i]); + } + subset = (vsi_nn_subset_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)subset); + n++; + } + #endif + +final: + if(deleteIdx)free(deleteIdx); + return subset_list; +} + +static vsi_nn_connection_t **_compute_all_connetion + ( + float *paf_avg, + vsi_nn_peaks_t **all_peaks, + vsi_nn_cmupose_config_t *config, + uint32_t *connection_list_num, + int32_t **special + ) +{ + uint32_t k; + float *score_mid; + vsi_nn_peaks_t *candA = NULL, *candB = NULL; + uint32_t nA,nB; + uint32_t height,width,score_mid_depth; + vsi_nn_con_candidate_t *con_candidate_list; + vsi_nn_connection_t **connection_all = NULL; + int32_t *special_k = NULL; + uint32_t mapIdx_len,candidate_sum,connection_sum; + + mapIdx_len = _cnt_of_array(mapIdx); + height = config->image.height; + width = config->image.width; + score_mid_depth = 2; + + score_mid = (float *)malloc(sizeof(float) * height * width * score_mid_depth); + CHECK_PTR_FAIL_GOTO( score_mid, "Create buffer fail.", final ); + connection_all = (vsi_nn_connection_t **)malloc(sizeof(vsi_nn_connection_t *) * mapIdx_len); + special_k = (int32_t *)malloc(sizeof(int32_t) * mapIdx_len); + CHECK_PTR_FAIL_GOTO( special_k, "Create buffer fail.", final ); + + memset(connection_all, 0, sizeof(vsi_nn_connection_t *) * mapIdx_len); + memset(special_k, -1, sizeof(int32_t) * mapIdx_len); + for(k=0; kdata.i,iter->data.j,iter->data.score,iter->data.candAB); + iter = (vsi_nn_con_candidate_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)iter); + } +#endif + _sort_con_candidate(con_candidate_list); +#if 0 + printf("======================================= k=%u\n",k); + iter = con_candidate_list; + while (iter) + { + printf("con_candidate[ %u %u %f %f ]\n", + iter->data.i,iter->data.j,iter->data.score,iter->data.candAB); + iter = (vsi_nn_con_candidate_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)iter); + } +#endif + + connection_sum = 0; + connection_all[k] = _get_connection(con_candidate_list, + candidate_sum, + candA, candB, + nA, nB, + &connection_sum); +#if 0 + printf("=======================================\n",k); + iter = connection_all[k]; + while (iter) + { + printf("connection[%u] = [ %u %u %f %u %u ]\n", + iter->data.x,iter->data.y,iter->data.score,iter->data.i,iter->data.j); + iter = (vsi_nn_connection_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)iter); + } +#endif + + 
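/* [Editor's note - illustrative, not part of the patch]
 * Connection assembly for one limb is a greedy bipartite matching:
 * _sort_con_candidate() orders candidates by descending score, then
 * _get_connection() accepts a candidate only when neither endpoint has been
 * used yet (_check_connection_candidate_ij) and stops after
 * vsi_nn_min(nA, nB) pairs. In sketch form:
 *
 *     accepted = 0;
 *     for each candidate c in descending-score order:
 *         if(c.i unused && c.j unused)
 *         {
 *             accept(c);
 *             if(++accepted >= vsi_nn_min(nA, nB)) break;
 *         }
 */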
vsi_nn_LinkListDeinit((vsi_nn_link_list_t *)con_candidate_list, NULL); + } + else + { + special_k[k] = k; + connection_all[k] = NULL; + } + } + + *connection_list_num = mapIdx_len; + *special = special_k; + +final: + vsi_nn_safe_free(score_mid); + return connection_all; +} + +static vsi_nn_peaks_t **_compute_all_peaks + ( + float *heatmap_avg, + vsi_nn_cmupose_config_t *config, + uint32_t *peak_conter, + uint32_t *peak_list_num + ) +{ + uint32_t i, part, loop; + uint32_t width,height; + vsi_size_t sz, j; + float *map_ori = NULL, *map = NULL; + float *pheatmap_avg = NULL, *pmap_ori = NULL; + float *map_left = NULL, *map_right = NULL, *map_up = NULL, *map_down = NULL; + vsi_nn_peaks_t **all_peaks = NULL; + + vsi_size_t szf; + + if(NULL == heatmap_avg || NULL == peak_conter) + { + return NULL; + } + + width = config->image.width; + height = config->image.height; + sz = width * height; + szf = sizeof(float) * sz; + map_ori = (float *)malloc(szf); + CHECK_PTR_FAIL_GOTO( map_ori, "Create buffer fail.", final ); + map = (float *)malloc(szf); + CHECK_PTR_FAIL_GOTO( map, "Create buffer fail.", final ); + map_left = (float *)malloc(szf); + CHECK_PTR_FAIL_GOTO( map_left, "Create buffer fail.", final ); + map_right = (float *)malloc(szf); + CHECK_PTR_FAIL_GOTO( map_right, "Create buffer fail.", final ); + map_up = (float *)malloc(szf); + CHECK_PTR_FAIL_GOTO( map_up, "Create buffer fail.", final ); + map_down = (float *)malloc(szf); + CHECK_PTR_FAIL_GOTO( map_down, "Create buffer fail.", final ); + + loop = 19 -1; // why? + all_peaks = (vsi_nn_peaks_t **)malloc(sizeof(vsi_nn_peaks_t *) * loop); + CHECK_PTR_FAIL_GOTO( all_peaks, "Create buffer fail.", final ); + memset(all_peaks, 0, sizeof(vsi_nn_peaks_t *) * loop); + peak_id = 0; + + //heatmap_avg[320][320][19] + for(part=0; partinputs.net_out->attr.size[1]; + size1[1] = (uint32_t)config->inputs.net_out->attr.size[0]; + size1[2] = 19; + + size2[0] = config->image.height; + size2[1] = config->image.width; + size2[2] = 19; + + net_out_c = 57; + width = size1[1]; + height = size1[0]; + channel = 19; + sz = width * height * channel; + szf = sizeof(float) * sz; + channelf = channel*sizeof(float); + buffer = (float *)malloc(szf); + CHECK_PTR_FAIL_GOTO( buffer, "Create buffer fail.", final ); + memset(buffer, 0, szf); + + stride_h_nc = height * net_out_c; + stride_h_c = height * channel; + for(i=0; iimage.height*config->image.width*19); + +final: + vsi_nn_safe_free(buffer); +} + +static void _fill_paf_avg + ( + float *net_out, + vsi_nn_cmupose_config_t *config, + float *paf_avg + ) +{ + uint32_t net_out_c,width,height,channel; + vsi_size_t i,j,index_net_out,index_buffer; + float *buffer = NULL; + + vsi_size_t sz,szf,cf,stride_h_nc,stride_h_c; + + //[width, height, channel] + uint32_t size1[3] = {0}; + uint32_t size2[3] = {0}; + size1[0] = (uint32_t)config->inputs.net_out->attr.size[1]; + size1[1] = (uint32_t)config->inputs.net_out->attr.size[0]; + size1[2] = 38; + + size2[0] = config->image.height; + size2[1] = config->image.width; + size2[2] = 38; + + net_out_c = 57; + width = size1[1]; + height = size1[0]; + channel = 38; + sz = width * height * channel; + szf = sizeof(float) * sz; + cf = channel*sizeof(float); + buffer = (float *)malloc(szf); + CHECK_PTR_FAIL_GOTO( buffer, "Create buffer fail.", final ); + memset(buffer, 0, szf); + + stride_h_nc = height * net_out_c; + stride_h_c = height * channel; + for(i=0; iimage.width*config->image.height*38); + +final: + vsi_nn_safe_free(buffer); +} + +vsi_status vsi_nn_CMUPose_Post_Process + ( + float *net_out, + 
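/* [Editor's note - illustrative, not part of the patch]
 * vsi_nn_CMUPose_Post_Process() below runs the classic three-stage CMUPose
 * pipeline on the raw 57-channel network output: (1) unpack/resize it into
 * a 19-channel heatmap buffer and a 38-channel PAF buffer, (2) find
 * per-part peaks above thre1 and score per-limb connections from the PAF,
 * (3) merge the per-limb connections into per-person subsets. The
 * all_peaks/subset/candidate out-parameters return the intermediate and
 * final results to the caller.
 */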
vsi_nn_cmupose_config_t *config, + vsi_nn_peaks_t ***all_peaks_out, + uint32_t *all_peaks_num_out, + vsi_nn_subset_t **subset_list_out, + vsi_nn_peaks_data_t **peak_candidate_out, + uint32_t *peak_candidate_num_out + ) +{ + vsi_status status; + vsi_nn_cmupose_multiplier_t multiplier; + float *heatmap_avg = NULL, *paf_avg = NULL; + vsi_nn_peaks_t **all_peaks = NULL; + vsi_nn_peaks_data_t *peak_candidate = NULL; + vsi_nn_connection_t **all_connection = NULL; + vsi_nn_subset_t *subset_list = NULL; + int32_t *special_k = NULL; + uint32_t peak_counter = 0; + uint32_t peak_list_num = 0, connection_list_num = 0, subset_num = 0; + //uint32_t n; + + status = VSI_FAILURE; + if(NULL == config || + NULL == all_peaks_out || + NULL == all_peaks_num_out || + NULL == subset_list_out || + NULL == peak_candidate_out || + NULL == peak_candidate_num_out) + { + return status; + } + + status = _cmupose_init(config, &multiplier, &heatmap_avg, &paf_avg); + if(VSI_SUCCESS != status) + { + return status; + } + + _fill_heatmap_avg(net_out, config, heatmap_avg); + _fill_paf_avg(net_out, config, paf_avg); + + all_peaks = _compute_all_peaks(heatmap_avg, config, &peak_counter, &peak_list_num); +#if 0 + for(n=0; npeak.location[0], iter->peak.location[1], iter->peak.score, iter->peak.id); + iter = (vsi_nn_peaks_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)iter); + } + } +#endif + + all_connection = _compute_all_connetion(paf_avg, all_peaks, config, &connection_list_num, &special_k); +#if 0 + for(n=0; ndata.x, iter->data.y, iter->data.score, iter->data.i, iter->data.j); + iter = (vsi_nn_connection_t *)vsi_nn_LinkListNext( (vsi_nn_link_list_t *)iter); + } + } +#endif + + peak_candidate = _get_peak_candidate(all_peaks, peak_list_num, peak_counter); +#if 0 + for(n=0; ninputs.net_out; + memset(&tensor, 0, sizeof(vsi_nn_tensor_t)); + tensor.attr.dim_num = net_out->attr.dim_num; + tensor.attr.size[0] = net_out->attr.size[2]; + tensor.attr.size[1] = net_out->attr.size[1]; + tensor.attr.size[2] = net_out->attr.size[0]; + memcpy(&tensor.attr.dtype, &net_out->attr.dtype, sizeof(vsi_nn_dtype_t)); + ret = vsi_nn_ReshapeTensor(graph, + net_out, + &tensor, + tensor.attr.size, + tensor.attr.dim_num); + if(ret != TRUE) + { + return NULL; + } + + vsi_nn_TransposeTensor(graph, &tensor, perm, tensor.attr.dim_num, NULL); + buffer = vsi_nn_ConvertTensorToFloat32Data(graph, &tensor); + + // for test + //savetxt("sheldon/net_out.txt", buffer, (net_out->attr.size[0]*net_out->attr.size[1]*57)); + + //vxReleaseTensor(&tensor.t); + return buffer; +} + +static vsi_status _auto_fill_cmupose + ( + vsi_nn_graph_t *graph, + vsi_nn_cmupose_inputs_t *inputs, + vsi_nn_cmupose_image_t *image, + vsi_nn_cmupose_param_t *param, + vsi_nn_cmupose_model_t *model, + vsi_nn_cmupose_config_t *cmupose_config + ) +{ + vsi_status status; + vsi_nn_tensor_t *net_in = NULL; + static float default_scale_search[1] = {1}; + + status = VSI_FAILURE; + if(NULL == graph) + { + return status; + } + + // fill input + if(inputs == NULL) + { + cmupose_config->inputs.net_out = vsi_nn_GetTensor(graph, graph->output.tensors[0]); + } + else + { + cmupose_config->inputs.net_out = inputs->net_out; + } + + // fill image + net_in = vsi_nn_GetTensor(graph, graph->input.tensors[0]); + CHECK_PTR_FAIL_GOTO( net_in, "Create tensor fail.", final ); + cmupose_config->image.width = (uint32_t)net_in->attr.size[0]; + cmupose_config->image.height = (uint32_t)net_in->attr.size[1]; + cmupose_config->image.channel = (uint32_t)net_in->attr.size[2]; + + // fill param + 
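/* [Editor's note - illustrative sketch, not part of the patch]
 * The defaults below appear to mirror the published CMUPose demo settings
 * (scale_search = {1}, thre1/thre2/thre3, boxsize 368, stride 8,
 * padValue 128). A typical caller lets the wrapper auto-fill everything
 * from the graph; all names below are local to the sketch:
 *
 *     vsi_nn_peaks_t **all_peaks = NULL;
 *     vsi_nn_peaks_data_t *candidate = NULL;
 *     vsi_nn_subset_t *subset = NULL;
 *     uint32_t all_peaks_num = 0, candidate_num = 0;
 *     status = vsi_nn_CMUPose_PostProcess(graph, NULL, NULL, NULL, NULL,
 *         &all_peaks, &all_peaks_num, &candidate, &candidate_num, &subset);
 */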
cmupose_config->param.scale_search.size = default_scale_search; + cmupose_config->param.scale_search.num = _cnt_of_array(default_scale_search); + cmupose_config->param.mid_num = 10; + cmupose_config->param.thre1 = 0.1f; + cmupose_config->param.thre2 = 0.05f; + cmupose_config->param.thre3 = 0.5f; + + // fill model + cmupose_config->model.boxsize = 368; + cmupose_config->model.stride = 8; + cmupose_config->model.padValue = 128; + + status = VSI_SUCCESS; + +final: + return status; +} + +vsi_status vsi_nn_CMUPose_PostProcess + ( + vsi_nn_graph_t *graph, + vsi_nn_cmupose_inputs_t *inputs, + vsi_nn_cmupose_image_t *image, + vsi_nn_cmupose_param_t *param, + vsi_nn_cmupose_model_t *model, + vsi_nn_peaks_t ***all_peaks, + uint32_t *all_peaks_num, + vsi_nn_peaks_data_t **candidate, + uint32_t *candidate_num, + vsi_nn_subset_t **subset + ) +{ + vsi_status status; + float *net_out; + vsi_nn_cmupose_config_t cmupose_config; + + status = VSI_FAILURE; + net_out = NULL; + + memset(&cmupose_config, 0, sizeof(vsi_nn_cmupose_config_t)); + status = _auto_fill_cmupose(graph, + inputs, + image, + param, + model, + &cmupose_config); + if(VSI_SUCCESS != status) + { + goto final; + } + + net_out = _get_net_out_data(graph, &cmupose_config); + if(NULL == net_out) + { + goto final; + } + + status = vsi_nn_CMUPose_Post_Process(net_out, + &cmupose_config, + all_peaks, + all_peaks_num, + subset, + candidate, + candidate_num); + if(VSI_SUCCESS != status) + { + goto final; + } + + status = VSI_SUCCESS; +final: + if(net_out)free(net_out); + return status; +} diff --git a/src/tim/vx/internal/src/post/vsi_nn_post_fasterrcnn.c b/src/tim/vx/internal/src/post/vsi_nn_post_fasterrcnn.c new file mode 100644 index 0000000..2a9ac0d --- /dev/null +++ b/src/tim/vx/internal/src/post/vsi_nn_post_fasterrcnn.c @@ -0,0 +1,702 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> + +#include "vsi_nn_context.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_node_attr_template.h" +#include "vsi_nn_log.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_util.h" +#include "post/vsi_nn_post_fasterrcnn.h" +#include "vsi_nn_error.h" + +/* + faster-rcnn default image classes -- 21 +*/ +static const char* FASTER_RCNN_CLASSES[] = + {"__background__", + "aeroplane", "bicycle", "bird", "boat", + "bottle", "bus", "car", "cat", "chair", + "cow", "diningtable", "dog", "horse", + "motorbike", "person", "pottedplant", + "sheep", "sofa", "train", "tvmonitor"}; +#define VSI_NN_FASTERRCNN_CLASSES_NUM _cnt_of_array(FASTER_RCNN_CLASSES) + +static void _dump_boxes + ( + vsi_nn_fasterrcnn_box_t **box, + vsi_nn_fasterrcnn_param_t *param + ); + +static vsi_status _fill_fasterrcnn_param + ( + vsi_nn_graph_t *graph, + vsi_nn_fasterrcnn_param_t *param + ); + +static vsi_status _fill_fasterrcnn_inputs + ( + vsi_nn_graph_t *graph, + vsi_nn_fasterrcnn_param_t *param, + vsi_nn_fasterrcnn_inputs_t *inputs + ); + +static vsi_status _unscale_roi + ( + float *rois, + vsi_nn_fasterrcnn_param_t *param + ); + +static vsi_status _bbox_transform_inv + ( + float *rois, + float *bbox, + vsi_nn_fasterrcnn_param_t *param, + float **boxes + ); + +static float detection_box_iou + ( + float *A, + float *B + ); + +static void detection_box_nms + ( + float *box, + float thresh, + uint32_t rois_num, + uint32_t *keep, + uint32_t *num + ); + +static void detection_box_qsort + ( + float *box, + int32_t start, + int32_t end + ); + +static void _init_box + ( + vsi_nn_link_list_t *node + ); + +static void _dump_boxes + ( + vsi_nn_fasterrcnn_box_t **box, + vsi_nn_fasterrcnn_param_t *param + ); + +static vsi_status _fill_fasterrcnn_param + ( + vsi_nn_graph_t *graph, + vsi_nn_fasterrcnn_param_t *param + ) +{ + vsi_status status; + uint32_t i; + vsi_nn_node_t *node; + vsi_nn_tensor_t *tensor; + + if(NULL == graph || NULL == param) + { + return VSI_FAILURE; + } + + status = VSI_FAILURE; + tensor = NULL; + + for(i=0; i<graph->node_num; i++) + { + node = vsi_nn_GetNode( graph, (vsi_nn_node_id_t)i ); + //printf("i[%u] op[%s]\n", i, vsi_nn_OpGetName(node->op)); + if(node->op == VSI_NN_OP_PROPOSAL) + { + memcpy(&param->iminfo, &node->nn_param.proposal.im_info, + sizeof(vsi_nn_proposal_im_info)); + tensor = vsi_nn_GetTensor(graph,node->output.tensors[0]); + param->rois_num = (uint32_t)tensor->attr.size[1]; + } + } + + if(0 == param->rois_num) + { + VSILOGE("Can not find [Proposal] layer in network"); + return status; + } + status = VSI_SUCCESS; + + /* fill default parameters */ +#define VSI_NN_FASTERRCNN_DEF_CONF_THRESH (0.7f) +#define VSI_NN_FASTERRCNN_DEF_NMS_THRESH (0.3f) + param->conf_thresh = VSI_NN_FASTERRCNN_DEF_CONF_THRESH; + param->nms_thresh = VSI_NN_FASTERRCNN_DEF_NMS_THRESH; + param->classes_num = VSI_NN_FASTERRCNN_CLASSES_NUM; + param->classes = FASTER_RCNN_CLASSES; + + return status; +} /* _fill_fasterrcnn_param() */ + +static vsi_status _fill_fasterrcnn_inputs + ( + vsi_nn_graph_t *graph, + vsi_nn_fasterrcnn_param_t *param, + vsi_nn_fasterrcnn_inputs_t *inputs + ) +{ + vsi_status status; + uint32_t i,rois_num,size[2],dim; + vsi_nn_tensor_t *tensor; + + if(NULL == graph || NULL == inputs) + { + return VSI_FAILURE; + } + + status = VSI_FAILURE; + tensor = NULL; + rois_num = 
param->rois_num; + for(i=0; i<graph->output.num; i++) + { + /* bbox [84,rois] */ + /* cls [21,rois] */ + /* rois [5,rois] */ + tensor = vsi_nn_GetTensor(graph, graph->output.tensors[i]); + CHECK_PTR_FAIL_GOTO( tensor, "get tensor fail.", final ); + size[0] = (uint32_t)tensor->attr.size[0]; + size[1] = (uint32_t)tensor->attr.size[1]; + dim = tensor->attr.dim_num; + if(dim == 2 && size[1] == rois_num) + { + switch (size[0]) + { + case 5: + inputs->rois = tensor; + break; + case 21: + inputs->cls = tensor; + break; + case 84: + inputs->bbox = tensor; + break; + default: + break; + } + } + } + +final: + if(inputs->rois == NULL || + inputs->cls == NULL || + inputs->bbox == NULL) + { + VSILOGE("Can not find [rois,cls,bbox] tensor in network"); + return status; + } + status = VSI_SUCCESS; + + return status; +} /* _fill_fasterrcnn_inputs() */ + +static vsi_status _unscale_roi + ( + float *rois, + vsi_nn_fasterrcnn_param_t *param + ) +{ + uint32_t i; + float *data; + + data = rois; + for(i=0; i<param->rois_num; i++) + { + data[1] = data[1] / param->iminfo.scale[0]; + data[2] = data[2] / param->iminfo.scale[1]; + data[3] = data[3] / param->iminfo.scale[0]; + data[4] = data[4] / param->iminfo.scale[1]; + + data += 5; + } + + return VSI_SUCCESS; +} + +static vsi_status _bbox_transform_inv + ( + float *rois, + float *bbox, + vsi_nn_fasterrcnn_param_t *param, + float **boxes + ) +{ + float *pred_boxes = NULL, *ppred = NULL; + float *proi,*pbbox; + uint32_t i,j,rois_num,bbox_num,class_num; + float img_w,img_h; + vsi_status status = VSI_FAILURE; + + float w,h,ctr_x,ctr_y; + float dx,dy,dw,dh; + float pred_ctr_x,pred_ctr_y,pred_w,pred_h; + + img_w = param->iminfo.size[0]; + img_h = param->iminfo.size[1]; + rois_num = param->rois_num; + class_num = param->classes_num; + bbox_num = class_num * 4; + pred_boxes = (float *)malloc(sizeof(float) * rois_num * bbox_num); + CHECK_PTR_FAIL_GOTO( pred_boxes, "Create buffer fail.", final ); + status = VSI_SUCCESS; + + proi = rois; + pbbox = bbox; + ppred = pred_boxes; + for(i=0; i<rois_num; i++) + { + /* roi layout: [batch_id, x1, y1, x2, y2] */ + w = proi[3] - proi[1] + 1.0f; + h = proi[4] - proi[2] + 1.0f; + ctr_x = proi[1] + 0.5f * w; + ctr_y = proi[2] + 0.5f * h; + for(j=0; j<class_num; j++) + { + dx = pbbox[j * 4 + 0]; + dy = pbbox[j * 4 + 1]; + dw = pbbox[j * 4 + 2]; + dh = pbbox[j * 4 + 3]; + pred_ctr_x = dx * w + ctr_x; + pred_ctr_y = dy * h + ctr_y; + pred_w = expf(dw) * w; + pred_h = expf(dh) * h; + ppred[j * 4 + 0] = vsi_nn_max(vsi_nn_min(pred_ctr_x - 0.5f * pred_w, img_w - 1.0f), 0.0f); + ppred[j * 4 + 1] = vsi_nn_max(vsi_nn_min(pred_ctr_y - 0.5f * pred_h, img_h - 1.0f), 0.0f); + ppred[j * 4 + 2] = vsi_nn_max(vsi_nn_min(pred_ctr_x + 0.5f * pred_w, img_w - 1.0f), 0.0f); + ppred[j * 4 + 3] = vsi_nn_max(vsi_nn_min(pred_ctr_y + 0.5f * pred_h, img_h - 1.0f), 0.0f); + } + proi += 5; + pbbox += bbox_num; + ppred += bbox_num; + } + *boxes = pred_boxes; + +final: + return status; +} /* _bbox_transform_inv() */ + +static float detection_box_iou + ( + float *A, + float *B + ) +{ + float x1,y1,x2,y2; + float width,height,area; + float A_area,B_area; + + /* not overlapped */ + if(A[0] > B[2] || A[1] > B[3] || A[2] < B[0] || A[3] < B[1]) + { + return 0; + } + + /* overlapped region (=box) */ + x1 = vsi_nn_max(A[0], B[0]); + y1 = vsi_nn_max(A[1], B[1]); + x2 = vsi_nn_min(A[2], B[2]); + y2 = vsi_nn_min(A[3], B[3]); + + /* intersection area */ + width = vsi_nn_max(0.0f, x2 - x1 + 1.0f); + height = vsi_nn_max(0.0f, y2 - y1 + 1.0f); + area = width * height; + + /* area of A, B */ + A_area = (A[2] - A[0] + 1.0f) * (A[3] - A[1] + 1.0f); + B_area = (B[2] - B[0] + 1.0f) * (B[3] - B[1] + 1.0f); + + /* IOU */ + return area / (A_area + B_area - area); +} + +static void detection_box_nms + ( + float *box, + float thresh, + uint32_t rois_num, + uint32_t *keep, + uint32_t *num + ) +{ + uint32_t i,j; + uint32_t *is_dead = NULL; + + is_dead = (uint32_t *)malloc(sizeof(uint32_t) * rois_num); + CHECK_PTR_FAIL_GOTO( is_dead, "Create buffer fail.", final ); + memset(is_dead, 0, sizeof(uint32_t) * rois_num); + + for(i = 0; i < rois_num; i++) + { + if(is_dead[i]) + { + continue; + } + + for(j = i + 1; j < rois_num; ++j) + { + if(!is_dead[j] && detection_box_iou(&box[i * 5], &box[j * 5]) > thresh) + { + is_dead[j] = 1; + } + } + } + + j = 0; + for(i = 0; i < rois_num; i++) + { + if(!is_dead[i]) + { + keep[j++] = i; + } + } + *num = j; + +final: + if(is_dead)free(is_dead); +} /* detection_box_nms() */ + +static void detection_box_qsort + ( + float *box, + int32_t start, + int32_t end + ) +{ + int32_t i; + int32_t left = start + 1; + int32_t right = end; + float temp[5]; + float pivot_score = box[start * 5 + 4]; + + while (left <= right) + { + while (left <= end && box[left * 5 + 4] >= pivot_score) + ++left; + while (right > start && box[right * 5 + 4] <= pivot_score) + --right; + + if (left <= right) + { + /* swap box */ + for(i = 0; i < 5; ++i) + { + temp[i] = box[left * 5 + i]; + } + for(i = 0; i < 5; ++i) + { + box[left * 5 + i] = box[right * 5 + i]; + } + for(i = 0; i < 5; ++i) + { + box[right * 5 + i] = temp[i]; + } + + ++left; + --right; 
+ } + } + + if (right > start) + { + for(i = 0; i < 5; ++i) + { + temp[i] = box[start * 5 + i]; + } + for(i = 0; i < 5; ++i) + { + box[start * 5 + i] = box[right * 5 + i]; + } + for(i = 0; i < 5; ++i) + { + box[right * 5 + i] = temp[i]; + } + } + + if(start < right - 1) + { + detection_box_qsort(box, start, right - 1); + } + if(right + 1 < end) + { + detection_box_qsort(box, right + 1, end); + } +} + +static void _init_box(vsi_nn_link_list_t *node) +{ + vsi_nn_fasterrcnn_box_t *box = NULL; + box = (vsi_nn_fasterrcnn_box_t *)node; + memset(box, 0, sizeof(vsi_nn_fasterrcnn_box_t)); +} + +static vsi_status _fasterrcnn_post_process + ( + float *rois, + float *bbox, + float *cls, + vsi_nn_fasterrcnn_param_t *param, + vsi_nn_fasterrcnn_box_t **dets_box + ) +{ + vsi_status status; + uint32_t i,j,k; + uint32_t rois_num,classes_num; + float *pred_boxes = NULL,*dets = NULL; + float *pdets = NULL, *ppred = NULL; + vsi_nn_fasterrcnn_box_t *box = NULL; + uint32_t *keep = NULL,num; + float score; + + if(NULL == rois || NULL == bbox || NULL == cls || NULL == param) + { + return VSI_FAILURE; + } + + status = VSI_FAILURE; + status = _unscale_roi(rois, param); + if(status != VSI_SUCCESS) + { + VSILOGE("unscale roi fail"); + return status; + } + + status = _bbox_transform_inv(rois, bbox, param, &pred_boxes); + if(status != VSI_SUCCESS) + { + VSILOGE("transform bbox fail"); + return status; + } + + rois_num = param->rois_num; + classes_num = param->classes_num; + dets = (float *)malloc(sizeof(float) * 5 * rois_num); + if(NULL == dets) + { + status = VSI_FAILURE; + goto final; + } + + keep = NULL; + keep = (uint32_t *)malloc(sizeof(uint32_t) * rois_num); + if(NULL == keep) + { + status = VSI_FAILURE; + goto final; + } + + /* i=1, skip background */ + for(i=1; i<param->classes_num; i++) + { + /* pred_boxes{rois_num,84} */ + pdets = dets; + ppred = pred_boxes + 4 * i; + for(j=0; j<rois_num; j++) + { + pdets[0] = ppred[0]; + pdets[1] = ppred[1]; + pdets[2] = ppred[2]; + pdets[3] = ppred[3]; + pdets[4] = cls[j * classes_num + i]; + pdets += 5; + ppred += classes_num * 4; + } + + detection_box_qsort(dets, 0, (int32_t)rois_num - 1); + num = 0; + detection_box_nms(dets, param->nms_thresh, rois_num, keep, &num); + + for(k=0; k<num; k++) + { + score = dets[keep[k] * 5 + 4]; + if(score > param->conf_thresh) + { + if(NULL != dets_box) + { + box = (vsi_nn_fasterrcnn_box_t *) + vsi_nn_LinkListNewNode(sizeof(vsi_nn_fasterrcnn_box_t), _init_box); + box->score = dets[keep[k]*5+4]; + box->class_id = i; + box->x1 = dets[keep[k]*5+0]; + box->y1 = dets[keep[k]*5+1]; + box->x2 = dets[keep[k]*5+2]; + box->y2 = dets[keep[k]*5+3]; + vsi_nn_LinkListPushStart( + (vsi_nn_link_list_t **)dets_box, + (vsi_nn_link_list_t *)box ); + } + } + } + } + +final: + if(keep)free(keep); + if(dets)free(dets); + if(pred_boxes)free(pred_boxes); + return status; +} /* _fasterrcnn_post_process() */ + +static void _dump_boxes + ( + vsi_nn_fasterrcnn_box_t **box, + vsi_nn_fasterrcnn_param_t *param + ) +{ + vsi_nn_fasterrcnn_box_t *iter = *box; + + while (iter) + { + if(param->classes) + { + VSILOGI(" classes[%s] score[%f] coordinate[%f %f %f %f]", + param->classes[iter->class_id], + iter->score, + iter->x1, iter->y1, iter->x2, iter->y2); + } + else + { + VSILOGI(" classes_id[%u] score[%f] coordinate[%f %f %f %f]", + iter->class_id, + iter->score, + iter->x1, iter->y1, iter->x2, iter->y2); + } + + + iter = (vsi_nn_fasterrcnn_box_t *) + vsi_nn_LinkListNext( (vsi_nn_link_list_t *)iter ); + } +} + +vsi_status vsi_nn_FasterRCNN_PostProcess + ( + vsi_nn_graph_t *graph, + vsi_nn_fasterrcnn_inputs_t *inputs, + vsi_nn_fasterrcnn_param_t *param, + vsi_nn_fasterrcnn_box_t **dets_box + ) +{ + vsi_status status; + vsi_nn_fasterrcnn_inputs_t frcnn_inputs; + vsi_nn_fasterrcnn_param_t frcnn_param; + float *roi_data,*bbox_data,*cls_data; + + if(NULL == graph) + { + return VSI_FAILURE; + } + + status = VSI_FAILURE; + 
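/* [Editor's note - illustrative sketch, not part of the patch]
 * Passing NULL for inputs/param makes this wrapper configure itself:
 * _fill_fasterrcnn_param() locates the Proposal layer to recover im_info
 * and rois_num, and _fill_fasterrcnn_inputs() identifies the rois/cls/bbox
 * graph outputs by their first dimension (5/21/84). A minimal caller, all
 * names local to the sketch:
 *
 *     vsi_nn_fasterrcnn_box_t *dets = NULL, *it = NULL;
 *     if(vsi_nn_FasterRCNN_PostProcess(graph, NULL, NULL, &dets) == VSI_SUCCESS)
 *     {
 *         for(it = dets; it; it = (vsi_nn_fasterrcnn_box_t *)
 *                 vsi_nn_LinkListNext((vsi_nn_link_list_t *)it))
 *         {
 *             // consume it->class_id, it->score, it->x1 .. it->y2
 *         }
 *     }
 */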
memset(&frcnn_inputs, 0, sizeof(vsi_nn_fasterrcnn_inputs_t)); + memset(&frcnn_param, 0, sizeof(vsi_nn_fasterrcnn_param_t)); + + if(NULL == param) + { + status = _fill_fasterrcnn_param(graph, &frcnn_param); + if(status != VSI_SUCCESS) + { + VSILOGE("Auto fill faster-rcnn parameters fail"); + return status; + } + } + else + { + memcpy(&frcnn_param, param, sizeof(vsi_nn_fasterrcnn_param_t)); + } + + if(NULL == inputs) + { + status = _fill_fasterrcnn_inputs(graph, &frcnn_param, &frcnn_inputs); + if(status != VSI_SUCCESS) + { + VSILOGE("Auto fill faster-rcnn inputs fail"); + return status; + } + } + else + { + memcpy(&frcnn_inputs, inputs, sizeof(vsi_nn_fasterrcnn_inputs_t)); + } + + roi_data = NULL,bbox_data = NULL, cls_data = NULL; + roi_data = vsi_nn_ConvertTensorToFloat32Data(graph, frcnn_inputs.rois); + bbox_data = vsi_nn_ConvertTensorToFloat32Data(graph, frcnn_inputs.bbox); + cls_data = vsi_nn_ConvertTensorToFloat32Data(graph, frcnn_inputs.cls); + + status = _fasterrcnn_post_process( + roi_data, + bbox_data, + cls_data, + &frcnn_param, + dets_box + ); + if(status != VSI_SUCCESS) + { + goto final; + } + + _dump_boxes(dets_box, &frcnn_param); + + status = VSI_SUCCESS; +final: + if(roi_data)free(roi_data); + if(bbox_data)free(bbox_data); + if(cls_data)free(cls_data); + return status; +} diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index 46a5409..4ce42c9 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -455,6 +455,16 @@ static _op_param_gen_t s_op_gen[] = /* SCATTER_ELEMENTS */ NULL, /* PRE_PROCESS_YUV422 */ NULL, /* BUCKETIZE */ NULL, + /* GLOBALLPPOOL */ NULL, + /* AVG_POOL3D */ NULL, + /* ATAN */ NULL, + /* ATANH */ NULL, + /* ACOSH */ NULL, + /* MAXUNPOOL */ NULL, + /* REVERSESEQUENCE */ NULL, + /* INVERSE_SIGMOID */ NULL, + /* GRID_SAMPLE */ NULL, + /* LPNORM */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dlfcn.c b/src/tim/vx/internal/src/utils/vsi_nn_dlfcn.c new file mode 100644 index 0000000..f644649 --- /dev/null +++ b/src/tim/vx/internal/src/utils/vsi_nn_dlfcn.c @@ -0,0 +1,59 @@ +#include "vsi_nn_log.h" +#include "utils/vsi_nn_dlfcn.h" + +#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) +void * vsi_nn_dlopen( const char *file, int mode ) +{ + return NULL; +} + +int vsi_nn_dlclose( void *handle ) +{ + return -1; +} + +__declspec(noinline) +void* vsi_nn_dlsym( void *handle, const char *name ) +{ + return NULL; +} + +char *vsi_nn_dlerror( void ) +{ + return "\0"; +} +#else + +void* vsi_nn_dlsym + ( + void *handle, + const char *name + ) +{ + return dlsym( handle, name ); +} + +int vsi_nn_dlclose + ( + void *handle + ) +{ + return dlclose( handle ); +} + +void* vsi_nn_dlopen + ( + const char *file, + int mode + ) +{ + return dlopen( file, mode ); +} + +char * vsi_nn_dlerror(void) +{ + return dlerror(); +} + +#endif + diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c index 6f69616..18575b7 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c @@ -273,6 +273,9 @@ vsi_bool vsi_nn_dtype_convert_float_to_quantize_asymm case I16: return vsi_nn_dtype_convert_float_to_quantize_symm16( buffer, size, scale, zero_point, (int16_t*)out_buffer ); + case U16: + return vsi_nn_dtype_convert_float_to_quantize_asymm16( + 
buffer, size, scale, zero_point, (uint16_t*)out_buffer ); default: VSILOGE("Don't support convert float to asymm quant %d.", dtype); break; diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c index 21c8498..e6a766f 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -782,7 +782,7 @@ uint32_t vsi_nn_ShapeToString #define _NOT_PRINT_FMT (1) vsi_size_t s; uint32_t count; - const char * all_fmt[] = {" %d,", "%d_" }; + const char * all_fmt[] = {" %"VSI_SIZE_T_SPECIFIER",", "%"VSI_SIZE_T_SPECIFIER"_" }; const char * fmt; if( NULL == shape || NULL == buf || dim_num == 0 || buf_sz == 0 ) @@ -1528,3 +1528,155 @@ vsi_bool vsi_nn_is_3d_tensor return FALSE; } } + +vsi_bool vsi_nn_is_stream_process_supported_types + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t** inputs, + size_t input_num + ) +{ + size_t i = 0; + + if ( graph->ctx->config.support_stream_processor == 0 ) + { + return FALSE; + } + + if ( graph->ctx->config.sp_exec_count == 0 ) + { + return FALSE; + } + + for (i = 0; i < input_num; i++) + { + if (inputs && input_num > 0 && inputs[i] && + ( inputs[i]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || + inputs[i]->attr.dtype.vx_type == VSI_NN_TYPE_UINT32)) + { + return FALSE; + } + } + + return TRUE; +} + +vsi_bool vsi_nn_is_sp_supported_broadcast + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t** inputs, + uint32_t input_num, + vsi_nn_tensor_t* output + ) +{ +typedef enum +{ + VSI_BROADCAST_BITS_NONE = 0x0, + VSI_BROADCAST_BITS_ON_AXIS_0 = 0x1, + VSI_BROADCAST_BITS_ON_AXIS_1 = 0x2, + VSI_BROADCAST_BITS_ON_AXIS_2 = 0x4, + VSI_BROADCAST_BITS_ON_AXIS_3 = 0x8, + VSI_BROADCAST_BITS_ON_AXIS_4 = 0x10, + VSI_BROADCAST_BITS_ON_AXIS_5 = 0x20, + VSI_BROADCAST_BITS_ON_AXIS_10 = 0x3, + VSI_BROADCAST_BITS_ON_AXIS_210 = 0x7, + VSI_BROADCAST_BITS_ON_AXIS_21 = 0x6, +} vsi_broadcast_bits_status_e; +#define _PACK_ELTWISE_SP_KEY(A_BROADCAST, B_BROADCAST) \ + ( (A_BROADCAST) | (B_BROADCAST << 8)) + int32_t broadcast_bits_0 = VSI_BROADCAST_BITS_NONE; + int32_t broadcast_bits_1 = VSI_BROADCAST_BITS_NONE; + uint32_t i = 0; + uint32_t k = 0; + uint32_t rank = output->attr.dim_num; + vsi_bool is_broadcast = FALSE; + vsi_bool support = TRUE; + uint32_t key = 0; + vsi_broadcast_bits_status_e broadcast_bits_status[VSI_NN_MAX_DIM_NUM] = {VSI_BROADCAST_BITS_NONE}; + + if (vsi_nn_is_stream_process_supported_types(graph, inputs, input_num) == FALSE) + { + return FALSE; + } + + for ( k = 1; k < input_num; k++ ) + { + vsi_nn_tensor_t *input0 = inputs[k - 1]; + vsi_nn_tensor_t *input1 = inputs[k]; + uint32_t rank0 = input0->attr.dim_num; + uint32_t rank1 = input1->attr.dim_num; + + broadcast_bits_status[0] = VSI_BROADCAST_BITS_NONE; + broadcast_bits_status[1] = VSI_BROADCAST_BITS_NONE; + + for ( i = 0; i < rank; i++ ) + { + vsi_size_t sz0 = i < rank0 ? input0->attr.size[i] : 1; + vsi_size_t sz1 = i < rank1 ? input1->attr.size[i] : 1; + + if (sz0 != sz1) + { + broadcast_bits_0 |= sz0 == 1 ? (1 << i) : 0; + broadcast_bits_1 |= sz1 == 1 ? (1 << i) : 0; + + is_broadcast = vx_true_e; + } + } + + broadcast_bits_status[0] = broadcast_bits_0; + broadcast_bits_status[1] = broadcast_bits_1; + + if (broadcast_bits_status[0] == VSI_BROADCAST_BITS_ON_AXIS_1 && + broadcast_bits_status[1] == VSI_BROADCAST_BITS_NONE) + { + vsi_size_t channel = rank0 > 2 ? 
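/* [Editor's note - illustrative, not part of the patch]
 * _PACK_ELTWISE_SP_KEY() folds the two per-input broadcast masks (one bit
 * per axis where that input has size 1 while the other does not) into one
 * key, and the switch below whitelists the combinations the stream
 * processor can handle. Example: A = {32,1}, B = {32,8} sets bit 1 for A
 * (VSI_BROADCAST_BITS_ON_AXIS_1); since A's channel extent is also 1 it is
 * promoted to VSI_BROADCAST_BITS_ON_AXIS_21, one of the accepted cases.
 */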
input0->attr.size[2] : 1; + + if (channel == 1) + { + broadcast_bits_status[0] = VSI_BROADCAST_BITS_ON_AXIS_21; + } + } + else if (broadcast_bits_status[1] == VSI_BROADCAST_BITS_ON_AXIS_1 && + broadcast_bits_status[0] == VSI_BROADCAST_BITS_NONE) + { + vx_size channel = rank0 > 2 ? input0->attr.size[2] : 1; + + if (channel == 1) + { + broadcast_bits_status[1] = VSI_BROADCAST_BITS_ON_AXIS_21; + } + } + + key = _PACK_ELTWISE_SP_KEY(broadcast_bits_status[0], broadcast_bits_status[1]); + + switch ( key ) + { + case _PACK_ELTWISE_SP_KEY(VSI_BROADCAST_BITS_NONE, VSI_BROADCAST_BITS_NONE): + case _PACK_ELTWISE_SP_KEY(VSI_BROADCAST_BITS_ON_AXIS_2, VSI_BROADCAST_BITS_NONE): + case _PACK_ELTWISE_SP_KEY(VSI_BROADCAST_BITS_NONE, VSI_BROADCAST_BITS_ON_AXIS_2): + case _PACK_ELTWISE_SP_KEY(VSI_BROADCAST_BITS_ON_AXIS_21, VSI_BROADCAST_BITS_NONE): + case _PACK_ELTWISE_SP_KEY(VSI_BROADCAST_BITS_NONE, VSI_BROADCAST_BITS_ON_AXIS_21): + case _PACK_ELTWISE_SP_KEY(VSI_BROADCAST_BITS_ON_AXIS_210, VSI_BROADCAST_BITS_NONE): + case _PACK_ELTWISE_SP_KEY(VSI_BROADCAST_BITS_NONE, VSI_BROADCAST_BITS_ON_AXIS_210): + break; + case _PACK_ELTWISE_SP_KEY(VSI_BROADCAST_BITS_ON_AXIS_0, VSI_BROADCAST_BITS_NONE): + case _PACK_ELTWISE_SP_KEY(VSI_BROADCAST_BITS_ON_AXIS_10, VSI_BROADCAST_BITS_NONE): + support = support && (vsi_nn_TypeGetBits(input0->attr.dtype.vx_type) != 4); + break; + case _PACK_ELTWISE_SP_KEY(VSI_BROADCAST_BITS_NONE, VSI_BROADCAST_BITS_ON_AXIS_0): + case _PACK_ELTWISE_SP_KEY(VSI_BROADCAST_BITS_NONE, VSI_BROADCAST_BITS_ON_AXIS_10): + support = support && (vsi_nn_TypeGetBits(input1->attr.dtype.vx_type) != 4); + break; + default: + support = !is_broadcast; + break; + } + + if (support == FALSE) + { + break; + } + } + + return support; +} diff --git a/src/tim/vx/internal/src/vip/virtual_device_private.h b/src/tim/vx/internal/src/vip/virtual_device_private.h index a1fb7b6..d3d7d1e 100644 --- a/src/tim/vx/internal/src/vip/virtual_device_private.h +++ b/src/tim/vx/internal/src/vip/virtual_device_private.h @@ -21,14 +21,13 @@ * DEALINGS IN THE SOFTWARE. 
* *****************************************************************************/ #ifndef _VIP_VIRTUAL_DEVICE_PRIVATE_H #define _VIP_VIRTUAL_DEVICE_PRIVATE_H #include #include #include #include -#include #include #include #include diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c index a8a99d4..7d7636f 100644 --- a/src/tim/vx/internal/src/vsi_nn_context.c +++ b/src/tim/vx/internal/src/vsi_nn_context.c @@ -60,13 +60,16 @@ static vsi_status query_hardware_caps memset(&paramExt2, 0, sizeof(vx_hardware_caps_params_ext2_t)); status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt2), sizeof(vx_hardware_caps_params_ext2_t)); - context->config.support_stream_processor = paramExt.supportStreamProcessor; - context->config.sp_exec_count = paramExt2.streamProcessorExecCount; - context->config.sp_vector_depth = paramExt2.streamProcessorVectorSize; - if (context->config.sp_exec_count > 0) + if (context->options.enable_stream_processor) { - context->config.sp_per_core_vector_depth = - context->config.sp_vector_depth / context->config.sp_exec_count; + context->config.support_stream_processor = paramExt.supportStreamProcessor; + context->config.sp_exec_count = paramExt2.streamProcessorExecCount; + context->config.sp_vector_depth = paramExt2.streamProcessorVectorSize; + if (context->config.sp_exec_count > 0) + { + context->config.sp_per_core_vector_depth = + context->config.sp_vector_depth / context->config.sp_exec_count; + } } #endif @@ -141,6 +144,13 @@ static vsi_status vsi_nn_initOptions options->enable_dataconvert_optimize = atoi(env_s); } + env_s = NULL; + options->enable_stream_processor = 1; + if (vsi_nn_getEnv("VSI_VX_ENABLE_STREAM_PROCESSOR", &env_s) && env_s) + { + options->enable_stream_processor = atoi(env_s); + } + return VSI_SUCCESS; } @@ -164,13 +174,14 @@ vsi_nn_context_t vsi_nn_CreateContext memset(context, 0, sizeof(struct _vsi_nn_context_t)); context->c = c; - if(query_hardware_caps(context) != VSI_SUCCESS) + + if (vsi_nn_initOptions(&context->options) != VSI_SUCCESS) { vsi_nn_ReleaseContext(&context); return NULL; } - if (vsi_nn_initOptions(&context->options) != VSI_SUCCESS) + if (query_hardware_caps(context) != VSI_SUCCESS) { vsi_nn_ReleaseContext(&context); return NULL; } diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index cf44888..bbfdabc 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -37,7 +37,6 @@ #include "vsi_nn_internal_node.h" #include "vsi_nn_version.h" #include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_vdata.h" #include "utils/vsi_nn_map.h" #include "utils/vsi_nn_dtype_util.h" #include "vsi_nn_graph_optimization.h" @@ -955,7 +954,7 @@ static vsi_nn_tensor_id_t _add_tensor } else { - tensor = vsi_nn_CreateVDataTensor( graph, data, attr ); + VSILOGE("VDATA mode is no longer supported!"); } } else if( NULL != data ) @@ -1657,7 +1656,7 @@ void vsi_nn_DumpGraphToJson ) { #define _SHAPE_BUF_SIZE 64 - uint32_t i,j; + uint32_t i, j, data_input_count = 0; FILE *fp; vsi_nn_tensor_rel_t *tensor_ref, *tio; vsi_nn_tensor_rel_table_t *table; @@ -1686,6 +1685,15 @@ } fprintf(fp, "{\n"); + + /* dump meta data */ + fprintf(fp, "\t\"MetaData\":{\n"); + fprintf(fp, "\t\t\"Name\": \"Ovxlib_Debug_Graph\",\n"); + fprintf(fp, "\t\t\"AcuityVersion\": \"UNKNOWN\",\n"); + fprintf(fp, "\t\t\"Platform\": \"UNKNOWN\",\n"); + fprintf(fp, "\t\t\"Org_Platform\": 
\"UNKNOWN\"\n"); + fprintf(fp, "\t},\n"); + fprintf(fp, "\t\"Layers\":{\n"); for(i = 0; i < graph->node_num; i++) { @@ -1702,6 +1710,7 @@ void vsi_nn_DumpGraphToJson tio = &tensor_ref[node->input.tensors[j]]; if(NULL == vsi_nn_GetTensor(graph, node->input.tensors[j])) { + /* this path may cause netron display abnormally */ if(j == node->input.num - 1) { fprintf(fp, "\"not used\" "); @@ -1732,12 +1741,13 @@ void vsi_nn_DumpGraphToJson { if(j == node->input.num - 1) { - fprintf(fp, "\"datainput_%u:out0\" ", j); + fprintf(fp, "\"@data_input_uid_%u:out0\" ", graph->node_num + data_input_count + 1); } else { - fprintf(fp, "\"datainput_%u:out0\", ", j); + fprintf(fp, "\"@data_input_uid_%u:out0\", ", graph->node_num + data_input_count + 1); } + data_input_count += 1; } } } @@ -1797,13 +1807,44 @@ void vsi_nn_DumpGraphToJson } fprintf(fp, " ]\n\t\t}"); - if(i != graph->node_num - 1) + if(i != graph->node_num - 1 || data_input_count > 0) { fprintf(fp, ","); } fprintf(fp, "\n"); } } + + /* dump all norm_tensor and const tensor into json as input layer */ + for (i = 0; i < data_input_count; i++) + { + fprintf(fp, "\t\t\"data_input_uid_%u\":{\n\t\t\t\"op\": \"%s\",\n", + graph->node_num + i + 1, "DATA_INPUT"); + + /* dump inputs */ + fprintf(fp, "\t\t\t\"inputs\": [ "); + + /* dump input shape */ + fprintf(fp, "],\n\t\t\t\"inut_shape\": [ "); + fprintf(fp, "[%s ]", ""); + + /* dump output */ + fprintf(fp, " ],\n\t\t\t\"outputs\": [ "); + fprintf(fp, "\"out%u\" ", 0); + + //output shape + fprintf(fp, "],\n\t\t\t\"output_shape\": [ "); + fprintf(fp, "[%s ]", ""); + + fprintf(fp, " ]\n\t\t}"); + + if (i != data_input_count - 1) + { + fprintf(fp, ","); + } + fprintf(fp, "\n"); + } + fprintf(fp, "\t}\n}\n"); vsi_nn_ReleaseTensorRelevance(graph, tensor_ref); @@ -1839,6 +1880,8 @@ vsi_status vsi_nn_TrySetupCompleteSignalNode signal_tensor_attr.dim_num = 2; signal_tensor_attr.dtype.vx_type = VSI_NN_TYPE_UINT8; signal_tensor_attr.vtl = FALSE; + signal_tensor_attr.is_created_from_handle = TRUE; + signal_tensor_attr.is_handle_malloc_by_ovxlib = FALSE; /* Setup signal node */ signal_node = vsi_nn_CreateNode( graph, VSI_NN_OP_EXTRA_ENDING ); TEST_CHECK_PTR( signal_node, final ); diff --git a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c index 855189b..05b2d2f 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c +++ b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c @@ -340,6 +340,11 @@ static vsi_status _add_graph_dataconvert_for_int8 _add_dataconvert_node(graph, dataconvert_idx ++, VSI_NN_OPTIMIZE_FORWARD, input_nodes[i], nodes_count, id, output); } + if (input_nodes[i] != NULL) + { + free(input_nodes[i]); + input_nodes[i] = NULL; + } } if(input_nodes) @@ -491,13 +496,17 @@ vsi_status vsi_nn_CopyDataToRawTensor vxSwapTensorHandle( tensor, NULL, (void **)&ptr); if ( ptr == NULL ) { - VSILOGE("vxSwapTensorHandle fail."); + VSILOGE("Tensor handle is NULL."); return VSI_FAILURE; } memcpy( ptr, data, vsi_nn_GetTensorSize(attr.size, attr.dim_num, attr.dtype.vx_type)); +#ifdef VSI_INVALIDATE_HANDLE_SUPPORT + status = vxFlushHandle((vx_reference)tensor); +#else status = vxSwapTensorHandle( tensor, ptr, NULL ); status |= vxFlushHandle( (vx_reference)tensor ); +#endif } else { @@ -591,23 +600,22 @@ static vx_tensor _create_const_raw_tensor } else { - attr.is_handle_malloc_by_ovxlib = FALSE; + if (TRUE == attr.is_handle_malloc_by_ovxlib) + { + VSILOGE("Data allocated by OVXLIB should not be shared by other OVXLIB tensor."); + tensor = NULL; + goto 
final; + } if (!vsi_nn_IsBufferAligned(data, align_start_size)) { VSILOGE( "vsi_nn_IsBufferAligned is FALSE." ); - if( scales ) - { - free( scales ); - } - if (zeroPoints) - { - free( zeroPoints ); - } - return NULL; + tensor = NULL; + goto final; } } if( data ) { + vsi_status status = VSI_FAILURE; #ifdef VSI_40BIT_VA_SUPPORT { vx_size size[_cnt_of_array(attr.size)] = {0}; @@ -658,7 +666,12 @@ static vx_tensor _create_const_raw_tensor VSILOGE( "Create vx tensor fail." ); goto final; } - vxFlushHandle( (vx_reference)tensor ); + status = vxFlushHandle( (vx_reference)tensor ); + if (VSI_SUCCESS != status) + { + VSILOGE("Flush handle fail."); + goto final; + } } } } diff --git a/src/tim/vx/internal/src/vsi_nn_internal_node.c b/src/tim/vx/internal/src/vsi_nn_internal_node.c index d80c80d..24265a1 100644 --- a/src/tim/vx/internal/src/vsi_nn_internal_node.c +++ b/src/tim/vx/internal/src/vsi_nn_internal_node.c @@ -36,7 +36,6 @@ #include "vsi_nn_test.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_vdata.h" #include "utils/vsi_nn_map.h" /********************************************************** diff --git a/src/tim/vx/internal/src/vsi_nn_kernel_prv.h b/src/tim/vx/internal/src/vsi_nn_kernel_prv.h new file mode 100644 index 0000000..fa01a5e --- /dev/null +++ b/src/tim/vx/internal/src/vsi_nn_kernel_prv.h @@ -0,0 +1,62 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +/** @file */ +#ifndef _VSI_NN_KERNEL_PRV_H +#define _VSI_NN_KERNEL_PRV_H + +/*------------------------------------------- + Includes +-------------------------------------------*/ +#include "kernel/vsi_nn_kernel.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Set sp node name + * + * @param[in] node Node handle + * @param[in] kernel_name Kernel name. + * @return VSI_SUCCESS on success, or appropriate error code otherwise. 
+ */ +vsi_status vsi_nn_set_sp_kernel_name + ( + vsi_nn_kernel_node_t node, + char* kernel_name + ); + +vsi_bool vsi_nn_is_sp_supported_broadcast + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t** inputs, + uint32_t input_num, + vsi_nn_tensor_t* output + ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c index 0c870bc..b3e2ef1 100644 --- a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c +++ b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c @@ -200,6 +200,11 @@ static _node_template s_template[] = /* MOD */ NULL, /* LPPOOL */ NULL, /* PRE_PROCESS_YUV422 */ NULL, + /* GLOBALLPPOOL */ NULL, + /* AVG_POOL3D */ NULL, + /* MAXUNPOOL */ NULL, + /* REVERSESEQUENCE */ NULL, + /* LPNORM */ NULL, }; //_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c ); diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c index 10f25ac..63c80f1 100644 --- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -88,7 +88,8 @@ static void _create_multi_norm_tensors multi_input_tensors[1] = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &uv_input_attr, NULL); multi_input_tensors[2] = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &uv_input_attr, NULL); } - else if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12) + else if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV21) { uv_input_attr = *input_attr; uv_input_attr.size[0] = w; @@ -289,6 +290,21 @@ static void _set_preproc_node_input_attr input_attr->size[2] = 1; } } + if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422 || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422) + { + if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC) + { + input_attr->size[0] = 2*input_attr->size[1]; + input_attr->size[1] = input_attr->size[2]; + input_attr->size[2] = 1; + } + else + { + input_attr->size[0] = 2*input_attr->size[0]; + input_attr->size[2] = 1; + } + } } /*_set_preproc_node_input_attr() */ static void _set_preproc_node_output_attr @@ -407,7 +423,8 @@ static void _get_org_graph_inputs { i += 2 ; } - else if(nodes[0]->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12) + else if(nodes[0]->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || + nodes[0]->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_NV21 ) { i += 1; } @@ -506,7 +523,8 @@ vsi_status vsi_nn_add_single_preproc_node { node_input_num = 3; } - else if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12) + else if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV21) { node_input_num = 2; } @@ -552,6 +570,7 @@ vsi_status vsi_nn_add_single_preproc_node /* Create new norm and virtual tensors */ if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 || *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV21 || *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444 || *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP) { diff --git a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c index b3f8800..9466d3d 100644 --- a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c +++ b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c @@ -116,6 +116,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_input_for_nn_fc vsi_size_t* 
reshape_in_size = NULL; uint32_t* permute_in_perm = NULL; vsi_nn_internal_node_t* tmp_inode = NULL; + vsi_bool ret = FALSE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); @@ -133,7 +134,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_input_for_nn_fc tmp_inode->node->nn_param.reshape2.dim_num = 4; tmp_inode->inputs[0] = input; tmp_inode->outputs[0] = tensor1->t; - vsi_nn_internal_setup_node(self, tmp_inode); + ret = vsi_nn_internal_setup_node(self, tmp_inode); if( multi_batch ) { @@ -168,6 +169,10 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_input_for_nn_fc tensor1 = tensor0; } + if (!ret) + { + tensor1 = NULL; + } return tensor1; } @@ -189,6 +194,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_output_for_nn_fc uint32_t* permute_in_perm = NULL; vsi_nn_internal_node_t* tmp_inode = NULL; vsi_nn_tensor_t* tensor = input; + vsi_bool ret = FALSE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); @@ -240,7 +246,11 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_output_for_nn_fc tmp_inode->node->nn_param.reshape2.dim_num = 2; tmp_inode->inputs[0] = tensor; tmp_inode->outputs[0] = tensor2->t; - vsi_nn_internal_setup_node(self, tmp_inode); + ret = vsi_nn_internal_setup_node(self, tmp_inode); + if (!ret) + { + tensor2 = NULL; + } return tensor2; } @@ -332,6 +342,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tp_fc vsi_nn_internal_tensor_t* tensor1 = NULL; vsi_nn_internal_tensor_t* tensor2 = NULL; vsi_nn_internal_node_t* tmp_inode = NULL; + vsi_bool ret = FALSE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); tensor = bias; @@ -353,7 +364,11 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tp_fc tmp_inode->inputs[1] = weight; tmp_inode->inputs[2] = tensor; tmp_inode->outputs[0] = tensor2->t; - vsi_nn_internal_setup_node(self, tmp_inode); + ret = vsi_nn_internal_setup_node(self, tmp_inode); + if (!ret) + { + tensor2 = NULL; + } return tensor2; } @@ -376,6 +391,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc vsi_nn_internal_tensor_t* tensor2 = NULL; vsi_nn_internal_tensor_t* reshaped_weight_tensor = NULL; vsi_nn_internal_node_t* tmp_inode = NULL; + vsi_bool ret = FALSE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); tensor = bias; @@ -410,7 +426,11 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc tmp_inode->inputs[1] = reshaped_weight_tensor->t; tmp_inode->inputs[2] = tensor; tmp_inode->outputs[0] = tensor2->t; - vsi_nn_internal_setup_node(self, tmp_inode); + ret = vsi_nn_internal_setup_node(self, tmp_inode); + if (!ret) + { + tensor2 = NULL; + } return tensor2; } @@ -470,6 +490,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc_relu vsi_nn_internal_tensor_t* tensor2 = NULL; vsi_nn_internal_tensor_t* reshaped_weight_tensor = NULL; vsi_nn_internal_node_t* tmp_inode = NULL; + vsi_bool ret = FALSE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); tensor = bias; @@ -509,7 +530,11 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc_relu tmp_inode->inputs[1] = reshaped_weight_tensor->t; tmp_inode->inputs[2] = tensor; tmp_inode->outputs[0] = tensor2->t; - vsi_nn_internal_setup_node(self, tmp_inode); + ret = vsi_nn_internal_setup_node(self, tmp_inode); + if (!ret) + { + tensor2 = NULL; + } return tensor2; } @@ -526,6 +551,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tensor_add vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* tensor1 = NULL; vsi_nn_internal_node_t* tmp_inode = NULL; + vsi_bool ret = FALSE; memset(&attr, 0, 
sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); @@ -536,8 +562,11 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tensor_add tmp_inode->inputs[0] = input1; tmp_inode->inputs[1] = input2; tmp_inode->outputs[0] = tensor1->t; - vsi_nn_internal_setup_node(self, tmp_inode); - + ret = vsi_nn_internal_setup_node(self, tmp_inode); + if (!ret) + { + tensor1 = NULL; + } return tensor1; } @@ -578,6 +607,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_activation vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* tensor1 = NULL; vsi_nn_internal_node_t* tmp_inode = NULL; + vsi_bool ret = FALSE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); @@ -589,7 +619,11 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_activation tmp_inode->node->nn_param.tanh.scale_a = 1.0f; tmp_inode->node->nn_param.tanh.scale_b = 1.0f; tmp_inode->outputs[0] = tensor1->t; - vsi_nn_internal_setup_node(self, tmp_inode); + ret = vsi_nn_internal_setup_node(self, tmp_inode); + if (!ret) + { + tensor1 = NULL; + } return tensor1; } @@ -606,6 +640,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_transpose_time_major uint32_t* permute_in_perm = NULL; vsi_nn_internal_tensor_t* output_tensor = NULL; vsi_nn_internal_node_t* curr = NULL; + vsi_bool ret = FALSE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); @@ -635,7 +670,11 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_transpose_time_major { curr->outputs[0] = output; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); + if (!ret) + { + output_tensor = NULL; + } return output_tensor; } @@ -722,6 +761,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_split_output vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* output_tensor = NULL; vsi_size_t *reshape_split_size = NULL; + vsi_bool ret = FALSE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); /* reshape for split output */ @@ -738,7 +778,11 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_split_output curr->node->nn_param.reshape2.dim_num = 2; curr->inputs[0] = input; curr->outputs[0] = output_tensor->t; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); + if (!ret) + { + output_tensor = NULL; + } return output_tensor; } @@ -755,6 +799,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_cell_output vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* output_tensor = NULL; vsi_size_t* reshape_grucell_output_size = NULL; + vsi_bool ret = FALSE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); @@ -773,7 +818,11 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_cell_output curr->node->nn_param.reshape2.dim_num = 3; curr->inputs[0] = input; curr->outputs[0] = output_tensor->t; - vsi_nn_internal_setup_node( self, curr ); + ret = vsi_nn_internal_setup_node( self, curr ); + if (!ret) + { + output_tensor = NULL; + } return output_tensor; } @@ -791,6 +840,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_binary_operator vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* output_tensor = NULL; vsi_nn_internal_node_t* tmp_inode = NULL; + vsi_bool ret = FALSE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); @@ -803,7 +853,11 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_binary_operator tmp_inode->inputs[0] = operand1; tmp_inode->inputs[1] = operand2; tmp_inode->outputs[0] = output_tensor->t; - vsi_nn_internal_setup_node(self, tmp_inode); + ret = vsi_nn_internal_setup_node(self, 
tmp_inode); + if (!ret) + { + output_tensor = NULL; + } return output_tensor; } @@ -823,6 +877,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_concat_impl vsi_nn_internal_tensor_t* tmp_tensor = NULL; vsi_nn_internal_node_t* inode = NULL; int tensor_count = 1; + vsi_bool ret = FALSE; va_start(args, tensor); @@ -851,7 +906,11 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_concat_impl va_end(args); inode->outputs[0] = tmp_tensor->t; - vsi_nn_internal_setup_node(self, inode); + ret = vsi_nn_internal_setup_node(self, inode); + if (!ret) + { + tmp_tensor = NULL; + } return tmp_tensor; } @@ -920,6 +979,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_reshape vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_tensor_t* tensor0 = NULL; vsi_size_t* reshape_in_size = NULL; + vsi_bool ret = FALSE; curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_in_size = (vsi_size_t*)vsi_nn_internal_new_node_param(curr, dim_num * sizeof(vsi_size_t)); @@ -941,7 +1001,12 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_reshape tensor0 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); curr->outputs[0] = tensor0->t; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); + if (!ret) + { + tensor0 = NULL; + } + return tensor0; } @@ -959,6 +1024,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_permute vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_tensor_t* tensor0 = NULL; uint32_t i = 0, * permute_in_perm = NULL; + vsi_bool ret = FALSE; curr = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0); permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(curr, @@ -983,7 +1049,11 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_permute tensor0 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); curr->outputs[0] = tensor0->t; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); + if (!ret) + { + tensor0 = NULL; + } return tensor0; } @@ -999,6 +1069,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tensor_copy { vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_tensor_t* tensor0 = NULL; + vsi_bool ret = FALSE; curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0); curr->inputs[0] = input_tensor; @@ -1018,7 +1089,11 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tensor_copy tensor0 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); curr->outputs[0] = tensor0->t; } - vsi_nn_internal_setup_node(self, curr); + ret = vsi_nn_internal_setup_node(self, curr); + if (!ret) + { + tensor0 = NULL; + } return tensor0; } \ No newline at end of file diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index c931dd6..0710a62 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -31,7 +31,9 @@ #include "vsi_nn_graph.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" +#include "vsi_nn_tensor_util_prv.h" #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_test.h" #include "utils/vsi_nn_math.h" #include "utils/vsi_nn_util.h" @@ -91,7 +93,7 @@ static void print_tensor char *ext_str ) { -#define _SHAPE_BUF_SZ (64) +#define _SHAPE_BUF_SZ (128) #define _EXT_ATTR_BUF_SZ (64) #define _ATTR_BUF_SZ (64) int count; @@ -106,7 +108,7 @@ static void print_tensor } vsi_nn_ShapeToString( tensor->attr.size, tensor->attr.dim_num, shape, _SHAPE_BUF_SZ, TRUE ); - vsi_nn_FormatToString( tensor, format, _SHAPE_BUF_SZ ); + vsi_nn_FormatToString( tensor, format, _ATTR_BUF_SZ ); /* Process quantize parameters */ switch( 
tensor->attr.dtype.qnt_type ) @@ -145,22 +147,24 @@ static void print_tensor if(ext_str) { - VSILOGD("%s id[%4u] vtl[%d] const[%d] shape[%-18s] fmt[%s] qnt[%s]", + VSILOGD("%s id[%4u] vtl[%d] const[%d] shape[%-18s] is_scalar[%d] fmt[%s] qnt[%s]", ext_str, id, tensor->attr.vtl, tensor->attr.is_const, shape, + vsi_nn_GetTensorIsScalar(tensor), format, ext_attr); } else { - VSILOGD("id[%4u] vtl[%d] const[%d] shape[%-18s] fmt[%s] qnt[%s]", + VSILOGD("id[%4u] vtl[%d] const[%d] shape[%-18s] is_scalar[%d] fmt[%s] qnt[%s]", id, tensor->attr.vtl, tensor->attr.is_const, shape, + vsi_nn_GetTensorIsScalar(tensor), format, ext_attr); } @@ -424,13 +428,6 @@ static vsi_bool _init_tensor vxReleaseWeightsBiasesParameter( &tensor->wb ); } -#if VX_STREAM_PROCESSOR_SUPPORT - if ( TRUE == tensor->attr.is_dummy ) - { - tensor->t = vxCreateDummyTensor( graph->ctx->c, - (vsi_size_t)tensor->attr.dim_num, size_vxsize, (vsi_enum)tensor->attr.dtype.vx_type ); - } else -#endif if( TRUE == tensor->attr.is_created_from_handle ) { vx_tensor_addressing addr = NULL; @@ -453,28 +450,22 @@ static vsi_bool _init_tensor } else { - tensor->attr.is_handle_malloc_by_ovxlib = FALSE; + if (TRUE == tensor->attr.is_handle_malloc_by_ovxlib) + { + VSILOGE("Data allocated by OVXLIB should not be shared by other OVXLIB tensors."); + ret = FALSE; + goto final; + } if (!vsi_nn_IsBufferAligned(data, align_start_size)) { VSILOGE( "vsi_nn_IsBufferAligned is FALSE." ); - if( scales ) - { - free(scales); - } - if( zeroPoints ) - { - free(zeroPoints); - } - if(null_zp) - { - free(null_zp); - null_zp = NULL; - } - return FALSE; + ret = FALSE; + goto final; } } if( data ) { + vsi_status status = VSI_FAILURE; #ifdef VSI_40BIT_VA_SUPPORT { vx_size size_vxsize2[_cnt_of_array(tensor->attr.size)] = {0}; @@ -539,7 +530,16 @@ static vsi_bool _init_tensor ret = FALSE; goto final; } - vxFlushHandle( (vx_reference)tensor->t ); + status = vxFlushHandle( (vx_reference)tensor->t ); + if (VSI_SUCCESS != status) + { + VSILOGE("Flush handle fail."); + ret = FALSE; + goto final; + } +#ifdef VSI_INVALIDATE_HANDLE_SUPPORT + _set_tensor_handle((vsi_nn_tensor_prv_t*)tensor, data); +#endif } } } @@ -552,6 +552,18 @@ static vsi_bool _init_tensor { tensor->t = vxCreateVirtualTensor2( graph->g, &params, sizeof( vx_tensor_create_params_t ) ); + + if ((!vsi_nn_IsGraphFastMode(graph)) + && (tensor->t != NULL) + && (params.data_format == VX_TYPE_FLOAT32)) + { + + vx_enum precision = VX_TENSOR_PRECISION_HIGH; + vxSetTensorAttribute(tensor->t, + VX_TENSOR_PRECISION, + &precision, + sizeof(vx_enum)); + } } if ( NULL == tensor->t ) { @@ -568,7 +580,12 @@ static vsi_bool _init_tensor vsi_nn_FillTensorWithValue( graph, tensor, 0.0f ); if(tensor->attr.is_created_from_handle) { - vxFlushHandle( (vx_reference)tensor->t ); + vsi_status status = vxFlushHandle( (vx_reference)tensor->t ); + if (VSI_SUCCESS != status) + { + ret = FALSE; + goto final; + } } } } @@ -619,26 +636,26 @@ static vsi_nn_tensor_t * _create_tensor vsi_nn_tensor_attr_t * attr ) { - vsi_nn_tensor_t * tensor; + vsi_nn_tensor_prv_t * tensor; tensor = NULL; if( NULL == graph || NULL == graph->g || NULL == attr ) { - return tensor; + return NULL; } - tensor = (vsi_nn_tensor_t *)malloc( sizeof( vsi_nn_tensor_t ) ); + tensor = (vsi_nn_tensor_prv_t *)malloc( sizeof( vsi_nn_tensor_prv_t ) ); //vsi_nn_UpdateTensorDims( attr ); if( NULL != tensor ) { - memset( tensor, 0, sizeof( vsi_nn_tensor_t ) ); - memcpy( &tensor->attr, attr, sizeof( vsi_nn_tensor_attr_t ) ); - tensor->is_swapped = FALSE; + memset( tensor, 0, sizeof(
vsi_nn_tensor_prv_t ) ); + memcpy( &tensor->pot.attr, attr, sizeof( vsi_nn_tensor_attr_t ) ); + tensor->pot.is_swapped = FALSE; if( attr->dim_num != VSI_NN_DIM_AUTO ) { - _init_tensor( graph, tensor, data); - if( NULL == tensor->t ) + _init_tensor( graph, &tensor->pot, data); + if( NULL == tensor->pot.t ) { VSILOGE( "Create vx tensor fail." ); free( tensor ); @@ -646,7 +663,7 @@ static vsi_nn_tensor_t * _create_tensor } } } - return tensor; + return (vsi_nn_tensor_t*)tensor; } vsi_nn_tensor_t * vsi_nn_CreateTensor @@ -666,14 +683,34 @@ vsi_nn_tensor_t * vsi_nn_CreateTensorFromHandle vsi_nn_tensor_attr_t * attr ) { - attr->is_created_from_handle = TRUE; + vsi_nn_tensor_t* ptensor = NULL; #ifdef VX_CREATE_TENSOR_SUPPORT_PHYSICAL if(attr->vsi_memory_type == VSI_MEMORY_TYPE_NONE || attr->vsi_memory_type == 0) { attr->vsi_memory_type = VSI_MEMORY_TYPE_HOST; } #endif - return _create_tensor(graph, data, attr); + if (TRUE != attr->is_created_from_handle) + { + VSILOGE("Can only create a tensor from a handle when 'is_created_from_handle' is TRUE."); + ptensor = NULL; + goto final; + } + /* 'attr' must carry the correct is_handle_malloc_by_ovxlib flag to indicate whether 'data' was + allocated by OVXLIB, and 'data' allocated by OVXLIB must not be shared with other OVXLIB tensors. */ + if (NULL != data && TRUE == attr->is_handle_malloc_by_ovxlib) + { + VSILOGE("Handle allocated by OVXLIB should not be shared by other OVXLIB tensors."); + ptensor = NULL; + goto final; + } + else + { + ptensor = _create_tensor(graph, data, attr); + } + + final: + return ptensor; } /* vsi_nn_CreateTensorFromHandle() */
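For reference, a minimal caller-side sketch of how the refined vsi_nn_CreateTensorFromHandle() is expected to be used after this change. The shape, dtype, and helper name here are illustrative assumptions, not part of the patch; the two flag assignments are exactly what the new checks enforce.

#include <string.h>
#include "vsi_nn_pub.h"

/* Wrap a caller-owned, suitably aligned buffer in a tensor without a copy. */
static vsi_nn_tensor_t* wrap_user_buffer(vsi_nn_graph_t* graph, uint8_t* data)
{
    vsi_nn_tensor_attr_t attr;
    memset(&attr, 0, sizeof(attr));
    attr.dim_num = 2;                        /* illustrative shape */
    attr.size[0] = 224;
    attr.size[1] = 224;
    attr.dtype.vx_type = VSI_NN_TYPE_UINT8;
    attr.vtl = FALSE;
    /* Must now be set explicitly, otherwise the call fails with an error log. */
    attr.is_created_from_handle = TRUE;
    /* 'data' is owned by the caller; an OVXLIB-allocated buffer passed here
     * would be rejected to prevent double ownership. */
    attr.is_handle_malloc_by_ovxlib = FALSE;
    return vsi_nn_CreateTensorFromHandle(graph, data, &attr);
}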
vsi_nn_tensor_t * vsi_nn_CreateTensorWithDefault @@ -797,29 +834,42 @@ void vsi_nn_ReleaseTensor vsi_nn_tensor_t ** tensor ) { - vsi_nn_tensor_t * ptr; - ptr = (NULL != tensor) ? *tensor : NULL; + vsi_nn_tensor_prv_t * ptr; + ptr = (NULL != tensor) ? (vsi_nn_tensor_prv_t*)(*tensor) : NULL; if( NULL != ptr) { - uint8_t * handle = NULL; - if( NULL != ptr->t ) + if( NULL != ptr->pot.t ) { - if (ptr->attr.is_created_from_handle && - ptr->attr.is_handle_malloc_by_ovxlib) + uint8_t* handle = NULL; + if (ptr->pot.attr.is_created_from_handle && + ptr->pot.attr.is_handle_malloc_by_ovxlib) { - vxSwapTensorHandle( ptr->t, NULL, (void**)&handle); + vxSwapTensorHandle(ptr->pot.t, NULL, (void**)&handle); +#ifdef VSI_INVALIDATE_HANDLE_SUPPORT + if(handle != _get_tensor_handle(ptr)) + { + VSILOGE("Tensor handle may have been swapped unexpectedly!"); + } +#endif if ( handle == NULL ) { - VSILOGE("vxSwapTensorHandle fail."); + VSILOGE("Tensor handle is NULL."); return; } } - vxReleaseTensor( &ptr->t ); - if (handle) vsi_nn_FreeAlignedBuffer(handle); + vxReleaseTensor( &ptr->pot.t ); + if (handle) + { + vsi_nn_FreeAlignedBuffer(handle); +#ifdef VSI_INVALIDATE_HANDLE_SUPPORT + handle = NULL; + _set_tensor_handle(ptr, NULL); +#endif + } } - if (ptr->wb) { - vxReleaseWeightsBiasesParameter(&ptr->wb); + if (ptr->pot.wb) { + vxReleaseWeightsBiasesParameter(&ptr->pot.wb); } free( ptr ); @@ -961,10 +1011,15 @@ float * vsi_nn_ConvertTensorToFloat32Data if( tensor->attr.is_created_from_handle ) { +#ifdef VSI_INVALIDATE_HANDLE_SUPPORT + tensor_data = _get_tensor_handle((vsi_nn_tensor_prv_t*)tensor); + vxInvalidateHandleVSI((vx_reference)tensor->t); +#else vxSwapTensorHandle(tensor->t, NULL, (void**)&tensor_data); +#endif if ( tensor_data == NULL ) { - VSILOGE("vxSwapTensorHandle fail."); + VSILOGE("Tensor handle is NULL."); if( data ) { free( data ); @@ -1023,10 +1078,15 @@ uint8_t * vsi_nn_ConvertTensorToData if( data && tensor->attr.is_created_from_handle ) { uint8_t* tensor_data = NULL; - vxSwapTensorHandle( tensor->t, NULL, (void **)&tensor_data ); +#ifdef VSI_INVALIDATE_HANDLE_SUPPORT + tensor_data = _get_tensor_handle((vsi_nn_tensor_prv_t*)tensor); + vxInvalidateHandleVSI((vx_reference)tensor->t); +#else + vxSwapTensorHandle(tensor->t, NULL, (void**)&tensor_data); +#endif if ( tensor_data == NULL ) { - VSILOGE("vxSwapTensorHandle fail."); + VSILOGE("Tensor handle is NULL."); if( data ) { free( data ); @@ -1253,7 +1313,7 @@ void vsi_nn_SaveTensorToTextByFp32 ptr += stride; count += snprintf( (char *)&buf[count], _TENSOR_TMPBUF_SZ - count, - "%f%s", write_data, seperator ); + "%.16f%s", write_data, seperator ); if ( count > _TENSOR_TMPBUF_SZ ) { VSILOGW( "tensor buffer overflow!"
); @@ -1497,16 +1557,24 @@ vsi_status vsi_nn_CopyDataToTensor if( tensor->attr.is_created_from_handle ) { uint8_t* ptr = NULL; - vxSwapTensorHandle( tensor->t, NULL, (void **)&ptr); +#ifdef VSI_INVALIDATE_HANDLE_SUPPORT + ptr = _get_tensor_handle((vsi_nn_tensor_prv_t*)tensor); +#else + vxSwapTensorHandle(tensor->t, NULL, (void**)&ptr); +#endif if ( ptr == NULL ) { - VSILOGE("vxSwapTensorHandle fail."); + VSILOGE("Tensor handle is NULL."); return VSI_FAILURE; } memcpy( ptr, data, vsi_nn_GetTensorSize(tensor->attr.size, tensor->attr.dim_num, tensor->attr.dtype.vx_type)); - status = vxSwapTensorHandle( tensor->t, ptr, NULL ); - status |= vxFlushHandle( (vx_reference)tensor->t ); +#ifdef VSI_INVALIDATE_HANDLE_SUPPORT + status = vxFlushHandle((vx_reference)tensor->t); +#else + status = vxSwapTensorHandle(tensor->t, ptr, NULL); + status |= vxFlushHandle((vx_reference)tensor->t); +#endif } else { @@ -1550,6 +1618,25 @@ vsi_status vsi_nn_FlushHandle } } /* vsi_nn_FlushHandle() */ +vsi_status vsi_nn_InvalidateHandle +( + const vsi_nn_tensor_t* tensor +) +{ + if (NULL == tensor || NULL == tensor->t) + { + return VSI_FAILURE; + } + else + { +#ifdef VSI_INVALIDATE_HANDLE_SUPPORT + return vxInvalidateHandleVSI((vx_reference)tensor->t); +#else + return VSI_SUCCESS; +#endif + } +} /* vsi_nn_InvalidateHandle() */ + vsi_status vsi_nn_GetTensorHandle ( vsi_nn_tensor_t * tensor, @@ -1562,10 +1649,39 @@ vsi_status vsi_nn_GetTensorHandle } else { +#ifdef VSI_INVALIDATE_HANDLE_SUPPORT + if (NULL != _get_tensor_handle((vsi_nn_tensor_prv_t*)tensor)) + { + *ptr = _get_tensor_handle((vsi_nn_tensor_prv_t*)tensor); + return VSI_SUCCESS; + } + else + { + return VSI_FAILURE; + } +#else return vxSwapTensorHandle(tensor->t, NULL, ptr); +#endif } } /* vsi_nn_GetTensorHandle() */ +vsi_status vsi_nn_SetTensorIsScalar +( + vsi_nn_tensor_t* tensor, + int8_t is_scalar +) +{ + return _set_tensor_is_scalar((vsi_nn_tensor_prv_t*)tensor, is_scalar); +} + +int8_t vsi_nn_GetTensorIsScalar +( + vsi_nn_tensor_t* tensor +) +{ + return _get_tensor_is_scalar((vsi_nn_tensor_prv_t*)tensor); +} + vsi_status vsi_nn_CopyRawDataToTensor ( vsi_nn_graph_t* graph, @@ -2240,6 +2356,18 @@ vsi_status vsi_nn_SwapTensorHandle status = vxSwapTensor( tensor0->t, tensor1->t ); if( VX_SUCCESS == status ) { +#ifdef VSI_INVALIDATE_HANDLE_SUPPORT + uint8_t* temp_handle = NULL; + vsi_bool temp_is_handle_malloc_by_ovxlib = TRUE; + + temp_handle = _get_tensor_handle((vsi_nn_tensor_prv_t*)tensor0); + _set_tensor_handle((vsi_nn_tensor_prv_t*)tensor0, _get_tensor_handle((vsi_nn_tensor_prv_t*)tensor1)); + _set_tensor_handle((vsi_nn_tensor_prv_t*)tensor1, temp_handle); + + temp_is_handle_malloc_by_ovxlib = tensor0->attr.is_handle_malloc_by_ovxlib; + tensor0->attr.is_handle_malloc_by_ovxlib = tensor1->attr.is_handle_malloc_by_ovxlib; + tensor1->attr.is_handle_malloc_by_ovxlib = temp_is_handle_malloc_by_ovxlib; +#endif tensor0->is_swapped = TRUE; tensor1->is_swapped = TRUE; } @@ -2671,17 +2799,26 @@ final: vsi_status vsi_nn_SwapHandle ( - vsi_nn_tensor_t * tensor, - void * new_ptr, - void ** old_ptr + vsi_nn_tensor_t* tensor, + void* new_ptr, + vsi_bool is_new_ptr_malloc_by_ovxlib, + void** old_ptr ) { - if(!tensor) + vsi_status status = VSI_FAILURE; + if (!tensor) { return VSI_FAILURE; } - vxSwapTensorHandle(tensor->t, new_ptr, old_ptr); - return VSI_SUCCESS; + status = vxSwapTensorHandle(tensor->t, new_ptr, old_ptr); +#ifdef VSI_INVALIDATE_HANDLE_SUPPORT + if (VSI_SUCCESS == status) + { + _set_tensor_handle((vsi_nn_tensor_prv_t*)tensor, new_ptr); + tensor->attr.is_handle_malloc_by_ovxlib = is_new_ptr_malloc_by_ovxlib; + } +#endif + return status; } /* vsi_nn_SwapHandle() */
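A hedged sketch of the resulting zero-copy handle protocol, tying together the APIs touched above: flush after the CPU writes through the handle, invalidate before reading back, and pass the new ownership flag when swapping in a different buffer. The function and buffer names are assumptions for illustration; the tensor is assumed to have been created from a handle.

#include <string.h>
#include "vsi_nn_pub.h"

static void handle_roundtrip(vsi_nn_tensor_t* tensor, uint8_t* new_buf, size_t bytes)
{
    void* old_ptr = NULL;
    uint8_t* cur = NULL;

    /* Hand over a caller-owned buffer; FALSE records that OVXLIB did not
     * allocate it, matching the ownership bookkeeping added above. */
    vsi_nn_SwapHandle(tensor, new_buf, FALSE, &old_ptr);

    /* CPU writes through the handle, then flushes so the device sees them. */
    memset(new_buf, 0, bytes);
    vsi_nn_FlushHandle(tensor);

    /* Before a CPU read-back, drop any stale cached view of the handle. */
    vsi_nn_InvalidateHandle(tensor);
    vsi_nn_GetTensorHandle(tensor, (void**)&cur);
}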
vsi_bool vsi_nn_ConvertTensor @@ -2759,13 +2896,19 @@ vsi_nn_tensor_t * vsi_nn_dropout_tensor vsi_size_t size = 0; vsi_size_t i = 0; float* data = NULL; + vsi_nn_tensor_attr_t attr; if (NULL == input || NULL == graph) { return NULL; } - output = vsi_nn_CreateTensor(graph, &input->attr); + memset(&attr, 0, sizeof(attr)); + memcpy(&attr, &input->attr, sizeof(attr)); + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + + output = vsi_nn_CreateTensor(graph, &attr); if ( !output ) { VSILOGE("create tensor failed."); @@ -2785,10 +2928,272 @@ vsi_nn_tensor_t * vsi_nn_dropout_tensor data[i] = data[i] * rate; } - vsi_nn_CopyRawDataToTensor( graph, (uint8_t *)data, &input->attr.dtype, output ); + vsi_nn_CopyRawDataToTensor( graph, (uint8_t *)data, &attr.dtype, output ); final: vsi_nn_safe_free(data); return output; } + +uint8_t* _get_tensor_handle + ( + vsi_nn_tensor_prv_t* tensor + ) +{ + uint8_t* handle = NULL; + if (NULL == tensor) + { + goto final; + } + handle = tensor->handle; + +final: + return handle; +} + +vsi_status _set_tensor_handle + ( + vsi_nn_tensor_prv_t* tensor, + uint8_t* handle + ) +{ + vsi_status status = VSI_SUCCESS; + if (NULL == tensor) + { + status = VSI_FAILURE; + goto final; + } + tensor->handle = handle; + +final: + return status; +} + +int8_t _get_tensor_is_scalar +( + vsi_nn_tensor_prv_t* tensor +) +{ + int8_t is_scalar = FALSE; + if (NULL == tensor) + { + VSILOGE("To get is_scalar, tensor pointer SHOULD NOT be NULL."); + goto final; + } + is_scalar = tensor->is_scalar; + + final: + return is_scalar; +} + +vsi_status _set_tensor_is_scalar +( + vsi_nn_tensor_prv_t* tensor, + int8_t is_scalar +) +{ + vsi_status status = VSI_SUCCESS; + if (NULL == tensor) + { + status = VSI_FAILURE; + goto final; + } + tensor->is_scalar = is_scalar; + + final: + return status; +}
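A short sketch of why the embed-the-public-struct-first ("pot") layout used by these accessors is safe, assuming every tensor is allocated as vsi_nn_tensor_prv_t (which _create_tensor() above guarantees): C permits converting a pointer to a struct to a pointer to its first member and back, so the cast between the public and private views is well-defined. Both helper functions below are illustrative, not part of the patch.

static vsi_nn_tensor_t* as_public(vsi_nn_tensor_prv_t* prv)
{
    /* Same address as (vsi_nn_tensor_t*)prv, since 'pot' is the first member. */
    return &prv->pot;
}

static vsi_nn_tensor_prv_t* as_private(vsi_nn_tensor_t* pub)
{
    /* Only valid for tensors that were really allocated as the private type. */
    return (vsi_nn_tensor_prv_t*)pub;
}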
+static vsi_bool _init_dummy_tensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * tensor + ) +{ + vsi_bool ret; + vx_tensor_create_params_t params; + float * scales = NULL; + int32_t * zeroPoints = NULL; + int32_t * null_zp = NULL; + vx_size size_vxsize[VSI_NN_MAX_DIM_NUM] = {0}; + vx_uint32 size_u32[VSI_NN_MAX_DIM_NUM] = {0}; + size_t i = 0; + ret = TRUE; + + memset( &params, 0, sizeof( vx_tensor_create_params_t ) ); + params.num_of_dims = tensor->attr.dim_num; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_vxsize[i] = -1 == tensor->attr.size[i] ? -1 : (vx_size)tensor->attr.size[i]; + } + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_u32[i] = -1 == tensor->attr.size[i] ? -1 : (vx_uint32)tensor->attr.size[i]; + } +#ifdef VSI_40BIT_VA_SUPPORT + params.sizes = size_vxsize; + (void)size_u32; +#else + params.sizes = size_u32; + (void)size_vxsize; +#endif + params.data_format = (vsi_enum)tensor->attr.dtype.vx_type; + switch( tensor->attr.dtype.qnt_type ) + { + case VSI_NN_QNT_TYPE_DFP: + params.quant_format = (vsi_enum)VX_QUANT_DYNAMIC_FIXED_POINT; + params.quant_data.dfp.fixed_point_pos = (uint8_t)tensor->attr.dtype.fl; + break; + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE; + params.quant_data.affine.scale = tensor->attr.dtype.scale; + params.quant_data.affine.zeroPoint = (int32_t)tensor->attr.dtype.zero_point; + break; + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: +#ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT + #ifdef VX_QUANT_AFFINE_SCALE_PER_CHANNEL + params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE_PER_CHANNEL; + #else + params.quant_format = (vsi_enum)VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC; + #endif + // This is a workaround: the driver doesn't support const scales + scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.scale_dim); + memcpy(scales, tensor->attr.dtype.scales, tensor->attr.dtype.scale_dim * sizeof(float)); + params.quant_data.affinePerChannel.channelDim = tensor->attr.dtype.channel_dim; + params.quant_data.affinePerChannel.scaleCount = tensor->attr.dtype.scale_dim; + params.quant_data.affinePerChannel.scales = scales; + params.quant_data.affinePerChannel.zeroPoint = NULL; + params.quant_data.affinePerChannel.zeroPointCount = 0; + { + // The low-level driver only supports asymmetric quantization, and the application + // doesn't provide zero-point information for a symmetric quantized tensor, so fake + // zero-filled zero-point data to meet the low-level requirement + null_zp = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.scale_dim); + memset(null_zp, 0, sizeof(int32_t) * tensor->attr.dtype.scale_dim); + params.quant_data.affinePerChannel.zeroPoint = null_zp; + params.quant_data.affinePerChannel.zeroPointCount= tensor->attr.dtype.scale_dim; + } + break; +#else + VSILOGE( "can't support qnt_type VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC."
); +#endif + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC: +#ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT + #ifdef VX_QUANT_AFFINE_SCALE_PER_CHANNEL + params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE_PER_CHANNEL; + #else + params.quant_format = (vsi_enum)VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC; + #endif + // This is a workaround: the driver doesn't support const scales + scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.scale_dim); + memcpy(scales, + tensor->attr.dtype.scales, + tensor->attr.dtype.scale_dim * sizeof(float)); + zeroPoints = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.zero_points_dim); + memcpy(zeroPoints, + tensor->attr.dtype.zero_points, + tensor->attr.dtype.zero_points_dim * sizeof(int32_t)); + params.quant_data.affinePerChannel.channelDim = + tensor->attr.dtype.channel_dim; + params.quant_data.affinePerChannel.scaleCount = + tensor->attr.dtype.scale_dim; + params.quant_data.affinePerChannel.scales = scales; + params.quant_data.affinePerChannel.zeroPoint = zeroPoints; + params.quant_data.affinePerChannel.zeroPointCount = tensor->attr.dtype.zero_points_dim; + break; +#else + VSILOGE( + "can't support qnt_type " + "VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC."); +#endif + default: + break; + } + + if( NULL != tensor->t ) + { + vxReleaseTensor( &tensor->t ); + } + if( NULL != tensor->wb ) + { + vxReleaseWeightsBiasesParameter( &tensor->wb ); + } + +#if (VX_STREAM_PROCESSOR_SUPPORT) + tensor->t = vxCreateDummyTensor( graph->ctx->c, + (vsi_size_t)tensor->attr.dim_num, size_vxsize, (vsi_enum)tensor->attr.dtype.vx_type ); +#else + tensor->t = NULL; +#endif + if ( NULL == tensor->t ) + { + VSILOGE( "Create vx tensor fail." ); + ret = FALSE; + goto final; + } + +final: + if( scales ) + { + free(scales); + } + if (zeroPoints) + { + free(zeroPoints); + } + if(null_zp) + { + free(null_zp); + null_zp = NULL; + } + return ret; +} /* _init_dummy_tensor() */ + +static vsi_nn_tensor_t * _create_dummy_tensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_attr_t * attr + ) +{ + vsi_nn_tensor_prv_t * tensor; + + tensor = NULL; + if( NULL == graph || NULL == graph->g || NULL == attr ) + { + return NULL; + } + + tensor = (vsi_nn_tensor_prv_t *)malloc( sizeof( vsi_nn_tensor_prv_t ) ); + //vsi_nn_UpdateTensorDims( attr ); + + if ( NULL != tensor ) + { + memset( tensor, 0, sizeof( vsi_nn_tensor_prv_t ) ); + memcpy( &tensor->pot.attr, attr, sizeof( vsi_nn_tensor_attr_t ) ); + tensor->pot.is_swapped = FALSE; + if( attr->dim_num != VSI_NN_DIM_AUTO ) + { + _init_dummy_tensor( graph, &tensor->pot); + if( NULL == tensor->pot.t ) + { + VSILOGE( "Create vx tensor fail."
); + free( tensor ); + tensor = NULL; + } + } + } + + return (vsi_nn_tensor_t*)tensor; +} + +vsi_nn_tensor_t * vsi_nn_create_dummy_tensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_attr_t * attr + ) +{ + attr->is_created_from_handle = FALSE; + return _create_dummy_tensor(graph, attr); +} /* vsi_nn_create_dummy_tensor() */ diff --git a/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h b/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h new file mode 100644 index 0000000..d46138f --- /dev/null +++ b/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h @@ -0,0 +1,104 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +/** @file */ +#ifndef _VSI_NN_TENSOR_UTIL_PRV_H +#define _VSI_NN_TENSOR_UTIL_PRV_H + +/*------------------------------------------- + Includes +-------------------------------------------*/ +#include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define vsi_safe_release_node(_n) if(_n){vxReleaseNode( ((vx_node*)&_n) ); _n = NULL;} + +#define VSI_NN_SUPPORT_LSTM_GRU_SP_IMPL (1) +/** + * Get tensor handle + * + * @param[in] tensor, a pointer to a vsi_nn_tensor_prv_t. + * + * @return handle of tensor on success, or NULL otherwise. + */ +uint8_t* _get_tensor_handle + ( + vsi_nn_tensor_prv_t* tensor + ); + +/** + * Set tensor handle + * + * @param[in] tensor, a pointer to a vsi_nn_tensor_prv_t. + * @param[in] handle, the handle to be set on the tensor. + * + * @return VSI_SUCCESS on success, or VSI_FAILURE otherwise. + */ +vsi_status _set_tensor_handle + ( + vsi_nn_tensor_prv_t* tensor, + uint8_t* handle + ); + +int8_t _get_tensor_is_scalar + ( + vsi_nn_tensor_prv_t* tensor + ); + +vsi_status _set_tensor_is_scalar + ( + vsi_nn_tensor_prv_t* tensor, + int8_t is_scalar + ); + +/** + * Create a new dummy tensor + * Create a new dummy tensor with given attributes. + * + * @param[in] graph Graph handle + * @param[in] attr Tensor attributes + * + * @return Tensor handle on success, or NULL otherwise.
+ */ +vsi_nn_tensor_t * vsi_nn_create_dummy_tensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_attr_t * attr + ); + +vsi_bool vsi_nn_is_stream_process_supported_types + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t** inputs, + size_t input_num + ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/src/vsi_nn_types_prv.h b/src/tim/vx/internal/src/vsi_nn_types_prv.h new file mode 100644 index 0000000..a2c2b56 --- /dev/null +++ b/src/tim/vx/internal/src/vsi_nn_types_prv.h @@ -0,0 +1,77 @@ +/**************************************************************************** +* +* Copyright (c) 2022 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +/** @file */ +#ifndef _VSI_NN_TYPES_PRV_H_ +#define _VSI_NN_TYPES_PRV_H_ + +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_tensor.h" + +#if defined(__cplusplus) +extern "C"{ +#endif + +/** + * Internal Graph structure, internal use only. + */ +typedef struct _vsi_nn_graph_prv +{ + /** Public Ovxlib Graph(pog)*/ + vsi_nn_graph_t pog; + + // Add graph internal attribute here... +} vsi_nn_graph_prv_t; + +/** Internal Node structure, internal use only. */ +typedef struct _vsi_nn_node_prv +{ + /** Public Ovxlib Node(pon)*/ + vsi_nn_node_t pon; + + // Add node internal attribute here... +} vsi_nn_node_prv_t; + +/** + * Internal Tensor structure, internal use only. + */ +typedef struct _vsi_nn_tensor_prv +{ + /** Public Ovxlib Tensor(pot)*/ + vsi_nn_tensor_t pot; + + /** Tensor handle*/ + uint8_t* handle; + + /** is scalar*/ + int8_t is_scalar; + + // Add tensor internal attribute here...
+} vsi_nn_tensor_prv_t; + +#if defined(__cplusplus) +} +#endif + +#endif diff --git a/src/tim/vx/internal/tim_internal.cmake b/src/tim/vx/internal/tim_internal.cmake index 6f0809c..2fcfaea 100644 --- a/src/tim/vx/internal/tim_internal.cmake +++ b/src/tim/vx/internal/tim_internal.cmake @@ -17,7 +17,7 @@ if(USE_VXC_BINARY) if(NOT GPU_CONFIG_FILE) message(FATAL_ERROR "Need set GPU_CONFIG_FILE for vxc binary") endif() - + execute_process(COMMAND bash ${CMAKE_CURRENT_SOURCE_DIR}/vx/internal/ovxlib_bin_build.sh ${VIV_SDK_PATH} ${VCCOMPILER_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/vx/internal/ ${GPU_CONFIG_FILE}) @@ -28,7 +28,6 @@ endif() aux_source_directory(./vx/internal/src INTERNAL_SRC) aux_source_directory(./vx/internal/src/kernel INTERNAL_KERNEL) aux_source_directory(./vx/internal/src/kernel/cl INTERNAL_KERNEL_CL) -aux_source_directory(./vx/internal/src/kernel/cpu INTERNAL_KERNEL_CPU) aux_source_directory(./vx/internal/src/kernel/evis INTERNAL_KERNEL_EVIS) aux_source_directory(./vx/internal/src/kernel/vx INTERNAL_KERNEL_VX) aux_source_directory(./vx/internal/src/ops INTERNAL_OPS)