Update prebuilt && internal for 23Q2 release (#617)

* Update prebuilt-sdk to 6.4.15 release

Type: Code Improvement
Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>

* Update internal to 1.1.84 rel

Update internal to SHA 1e591108dddcbf6dd88d5eef97a7d8b3ffc19ce3

Type: Code Improvement
Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>

---------

Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>
Chen Feiyue 2023-07-08 23:38:17 +08:00 committed by GitHub
parent 02d6d72946
commit 32c5a61601
475 changed files with 26346 additions and 4350 deletions

View File

@@ -1 +1 @@
-6.4.14_CL650117A_D650117_A648302_R647402_T648811_O646970
+6.4.15_CL690884A_D690855_A690484_R690194_T690259_O688896

View File

@@ -1340,6 +1340,21 @@ VX_API_ENTRY vx_status VX_API_CALL vxAssignNodeCallback(vx_node node, vx_nodecom
*/
VX_API_ENTRY vx_nodecomplete_f VX_API_CALL vxRetrieveNodeCallback(vx_node node);
/*! \brief Assigns a callback to a node.
* If a callback already exists in this node, this function must return an error
* and the user may clear the callback by passing a NULL pointer as the callback.
* \param [in] node The reference to the node.
* \param [in] callback The callback to associate with completion of this
* specific node.
* \warning This must be used with <b><i>extreme</i></b> caution as it can \e ruin
* optimizations in the power/performance efficiency of a graph.
* \return A <tt>\ref vx_status_e</tt> enumeration.
* \retval VX_SUCCESS Callback assigned; any other value indicates failure.
* \retval VX_ERROR_INVALID_REFERENCE node is not a valid <tt>\ref vx_node</tt> reference.
* \ingroup group_node_callback
*/
VX_API_ENTRY vx_status VX_API_CALL vxAssignNodeQueryCallback(vx_node node, vx_nodequery_f callback);
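A minimal usage sketch of the new entry point (illustrative only, not part of the commit: my_query_cb stands for a user-supplied callback of type vx_nodequery_f):

/* Sketch: my_query_cb is a hypothetical callback of type vx_nodequery_f. */
extern vx_nodequery_f my_query_cb;
static vx_status attach_and_clear(vx_node node)
{
    vx_status status = vxAssignNodeQueryCallback(node, my_query_cb);
    if (status != VX_SUCCESS)
        return status; /* a callback already existed, or the node is invalid */
    return vxAssignNodeQueryCallback(node, NULL); /* NULL clears the callback */
}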
/*! \brief Sets the node target to the provided value. A success invalidates the graph
* that the node belongs to (<tt>\ref vxVerifyGraph</tt> must be called before the next execution)
* \param [in] node The reference to the <tt>\ref vx_node</tt> object.

View File

@@ -503,6 +503,40 @@ enum vx_kernel_e {
VX_KERNEL_NN_BATCH_GEMM_RELU_POOLING_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x33,
VX_KERNEL_NN_FUSED_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x34,
VX_KERNEL_NN_CONVOLUTION_RELU_POOLING_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x35,
VX_KERNEL_NN_LAYER_NORMALIZATION_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x36,
VX_KERNEL_NN_INSTANCE_NORMALIZATION_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x37,
VX_KERNEL_NN_GROUP_NORMALIZATION_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x38,
VX_KERNEL_NN_LOGICAL_OPS_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x39,
VX_KERNEL_NN_LOGICAL_NOT_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x40,
VX_KERNEL_NN_RELATIONAL_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x41,
VX_KERNEL_NN_TENSOR_REDUCE_MAX = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x42,
VX_KERNEL_NN_MAXIMUM_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x43,
VX_KERNEL_NN_MINIMUM_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x44,
VX_KERNEL_NN_TENSOR_SELECT_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x45,
VX_KERNEL_NN_REDUCE_SUM_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x46,
VX_KERNEL_NN_GRU_CELL_ACTIVATION_Z_H_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x47,
VX_KERNEL_NN_GRU_CELL_H_TIMES_ACTIVATION_R_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x48,
VX_KERNEL_NN_GRU_CELL_RESET_AFTER_ACTIVATION_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x49,
VX_KERNEL_NN_LSTM_ACTIVATION_SP_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x50,
VX_KERNEL_MAX_1_2, /*!< \internal Used for VX1.2 bounds checking in the conformance test. */
};

View File

@@ -214,7 +214,7 @@ VX_STREAM_PROCESSOR_SUPPORT is used to declare that vsi openvx driver can suppor
1: support
*/
#ifndef VX_STREAM_PROCESSOR_SUPPORT
-#define VX_STREAM_PROCESSOR_SUPPORT 0
+#define VX_STREAM_PROCESSOR_SUPPORT 1
#endif
/*
@@ -258,5 +258,144 @@ VX_STREAM_PROCESSOR_SUPPORT is used to declare that vsi openvx driver can suppor
#define VX_ACTIVATION_EXT2_SUPPORT 1
#endif
/*
VX_TENSORVIEW_ON_ANY_DIM is used to declare that ovxlib can optimize all concat nodes (on any dimension, not only channel) to tensor views when possible.
[value]
0: disable
1: enable
*/
#ifndef VX_TENSORVIEW_ON_ANY_DIM
#define VX_TENSORVIEW_ON_ANY_DIM 0
#endif
/*
VX_DEPTH2SPACE_CRD_MODE_SUPPORT is used to declare that DEPTH2SPACE can support CRD mode.
[value]
0: not support
1: support
*/
#ifndef VX_DEPTH2SPACE_CRD_MODE_SUPPORT
#define VX_DEPTH2SPACE_CRD_MODE_SUPPORT 1
#endif
/*
VX_LAYER_NORMALIZATION_VX_SUPPORT is used to declare that the driver supports the layer normalization layer.
[value]
0: not support
1: support
*/
#ifndef VX_LAYER_NORMALIZATION_VX_SUPPORT
#define VX_LAYER_NORMALIZATION_VX_SUPPORT 1
#endif
/*
VX_INSTANCE_NORMALIZATION_VX_SUPPORT is used to declare that the driver supports the instance normalization layer.
[value]
0: not support
1: support
*/
#ifndef VX_INSTANCE_NORMALIZATION_VX_SUPPORT
#define VX_INSTANCE_NORMALIZATION_VX_SUPPORT 1
#endif
/*
VX_GROUP_NORMALIZATION_VX_SUPPORT is used to declare that the driver supports the group normalization layer.
[value]
0: not support
1: support
*/
#ifndef VX_GROUP_NORMALIZATION_VX_SUPPORT
#define VX_GROUP_NORMALIZATION_VX_SUPPORT 1
#endif
/*
VX_LOGICAL_VX_SUPPORT is used to declare that the driver supports the logical-ops layers.
[value]
0: not support
1: support
*/
#ifndef VX_LOGICAL_VX_SUPPORT
#define VX_LOGICAL_VX_SUPPORT 1
#endif
/*
VX_RELATIONAL_OPS_VX_SUPPORT is used to declare that the driver supports the relational-ops layers.
[value]
0: not support
1: support
*/
#ifndef VX_RELATIONAL_OPS_VX_SUPPORT
#define VX_RELATIONAL_OPS_VX_SUPPORT 1
#endif
/*
VX_REDUCE_MAX_VX_SUPPORT is used to declare that the driver supports the reduce max layer.
[value]
0: not support
1: support
*/
#ifndef VX_REDUCE_MAX_VX_SUPPORT
#define VX_REDUCE_MAX_VX_SUPPORT 1
#endif
/*
VX_REDUCE_MEAN_VX_SUPPORT is used to declare that the driver supports the reduce mean layer.
[value]
0: not support
1: support
*/
#ifndef VX_REDUCE_MEAN_VX_SUPPORT
#define VX_REDUCE_MEAN_VX_SUPPORT 1
#endif
/*
VX_REDUCE_SUM_VX_SUPPORT is used to declare that the driver supports the reduce sum layer.
[value]
0: not support
1: support
*/
#ifndef VX_REDUCE_SUM_VX_SUPPORT
#define VX_REDUCE_SUM_VX_SUPPORT 1
#endif
/*
VX_MAX_MIN_IMUM_VX_SUPPORT is used to declare that the driver supports the maximum and minimum layers.
[value]
0: not support
1: support
*/
#ifndef VX_MAX_MIN_IMUM_VX_SUPPORT
#define VX_MAX_MIN_IMUM_VX_SUPPORT 1
#endif
/*
VX_TENSOR_SELECT_VX_SUPPORT is used to declare that the driver supports the tensor select layer.
[value]
0: not support
1: support
*/
#ifndef VX_TENSOR_SELECT_VX_SUPPORT
#define VX_TENSOR_SELECT_VX_SUPPORT 1
#endif
/*
VX_GRU_CELL_VX_SUPPORT is used to declare that the driver supports the gru cell layer.
[value]
0: not support
1: support
*/
#ifndef VX_GRU_CELL_VX_SUPPORT
#define VX_GRU_CELL_VX_SUPPORT 1
#endif
/*
VX_LSTM_ACTIVATION_SUPPORT is used to declare that the driver supports the lstm activation layer.
[value]
0: not support
1: support
*/
#ifndef VX_LSTM_ACTIVATION_SUPPORT
#define VX_LSTM_ACTIVATION_SUPPORT 1
#endif
#endif /* __VX_KHR_COMPATIBLE_H__ */

View File

@@ -395,6 +395,17 @@ enum vx_tensor_lifetime_type_e
VX_TENSOR_LIFE_TIME_DYNAMIC,
};
/*! \brief Specifies the depth-to-space mode
* \ingroup group_cnn
*/
enum vx_nn_depth_to_space_mode_e
{
/*! \brief DCR(default) for depth-column-row order re-arrangement */
VX_NN_DEPTH_TO_SPACE_DCR = 0x0,
/*! \brief CRD for column-row-depth order re-arrangement */
VX_NN_DEPTH_TO_SPACE_CRD,
};
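A sketch of the difference between the two modes, assuming the usual ONNX DepthToSpace semantics (which this diff does not spell out): for block size block and C_out output channels, output channel c at block offset (by, bx) reads from the input channel computed below.

/* Sketch only; DCR/CRD semantics assumed to follow ONNX DepthToSpace. */
static int src_channel_dcr(int c, int by, int bx, int c_out, int block)
{
    return (by * block + bx) * c_out + c;       /* depth-column-row */
}
static int src_channel_crd(int c, int by, int bx, int block)
{
    return c * block * block + by * block + bx; /* column-row-depth */
}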
typedef struct _vx_nn_convolution_3d_params_t
{
vx_int32 padding_w_left; /*!< \brief Number of elements added at each side in the left of w dimension of the input. */
@@ -972,6 +983,16 @@ typedef struct _vx_nn_mean_params_t
vx_int32 keep_dims; /*!< \brief Keep dims, if positive, retains reduced dims with length 1 */
} vx_nn_mean_params_t;
/*! \brief Input parameter for reducesum layer
* \ingroup group_cnn
*\version 0.5
*/
typedef struct _vx_nn_sum_params_t
{
vx_tensor axis; /*!< \brief 1D axis tensor of reduce dims */
vx_int32 keep_dims; /*!< \brief Keep dims, if positive, retains reduced dims with length 1 */
} vx_nn_sum_params_t;
/*! \brief Input parameter for tensor squeeze layer
* \ingroup group_cnn
*\version 0.5
@@ -1254,6 +1275,12 @@ typedef struct _vsi_nn_reorg_params_ext2_t
vx_int32 *axis;
} vx_nn_reorg_params_ext2_t;
typedef struct _vx_nn_reorg_params_ext3_t
{
vx_nn_reorg_params_ext2_t base; /*!< \brief vx_nn_reorg_params <tt>\ref vx_nn_reorg_params_t</tt> */
vx_enum mode; /*!< \brief [Optional] Only for DEPTH2SPACE */
} vx_nn_reorg_params_ext3_t;
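A hypothetical fill-in sketch for the new ext3 params (the node-creation call that consumes this struct is unchanged and not shown in this hunk; memset needs <string.h>):

/* Sketch: select CRD re-arrangement for a DEPTH to SPACE reorg. */
static void fill_reorg_params(vx_nn_reorg_params_ext3_t * p)
{
    memset(p, 0, sizeof(*p));
    /* ... populate p->base exactly as for vx_nn_reorg_params_ext2_t ... */
    p->mode = VX_NN_DEPTH_TO_SPACE_CRD;
}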
/*! \brief [Graph] Creates a Reorgnization Layer Node, Enhancement of vxReorgLayer, Support both DEPTH to SPACE and SPACE to DEPTH.
* \param [in] graph The reference to the parent graph.
* \param [in] input The input tensor data to reorg.
@@ -1911,6 +1938,21 @@ VX_API_ENTRY vx_node VX_API_CALL vxRPNLayer(
vx_tensor score_output
);
/*! \brief Input parameters for a lstm activation operation.
* \ingroup group_cnn
* \version 0.3
*/
typedef struct _vx_nn_lstm_activation_params_t
{
vx_int32 is_ln;
vx_int32 is_cifg;
vx_int32 is_proj;
vx_int32 is_hybrid;
vx_int32 is_peephole;
vx_int32 recurrent_activation;
vx_float32 forget_bias;
} vx_nn_lstm_activation_params_t;
/*! \brief Input parameters for a lstm operation.
* \ingroup group_cnn
* \version 0.3
@@ -2115,6 +2157,28 @@ VX_API_ENTRY vx_node VX_API_CALL vxTensorMeanNode(
vx_size size_of_mean_param,
vx_tensor outputs);
/*! \brief [Graph] Creates sum layer node.
* \details
* Computes the sum of elements across dimensions of a tensor.
*
* \param [in] graph The handle to the graph.
* \param [in] input An n-D tensor, specifying the input.
* \param [in] sum_params Parameters <tt>\ref vx_nn_sum_params_t </tt>.
* \param [in] size_of_sum_param [static] The size of the vx_nn_sum_params_t.
* \param [out] output An n-D tensor of the same type as input.
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
* \ingroup group_tensor
* \version 0.5
*/
VX_API_ENTRY vx_node VX_API_CALL vxReduceSumNode(
vx_graph graph,
vx_tensor inputs,
const vx_nn_sum_params_t *sum_params,
vx_size size_of_sum_param,
vx_tensor outputs);
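A minimal creation sketch (assumes the graph and tensors already exist; error handling elided):

/* Sketch: axis_tensor is a 1-D tensor listing the dims to reduce. */
static vx_node make_reduce_sum(vx_graph graph, vx_tensor input,
                               vx_tensor axis_tensor, vx_tensor output)
{
    vx_nn_sum_params_t sum_params;
    sum_params.axis = axis_tensor;
    sum_params.keep_dims = 1; /* keep reduced dims with length 1 */
    return vxReduceSumNode(graph, input, &sum_params,
                           sizeof(sum_params), output);
}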
/*! \brief [Graph] Creates squeeze layer node.
* \details
* Remove dimensions of size 1 from the input tensor.
@@ -2287,6 +2351,282 @@ VX_API_ENTRY vx_node VX_API_CALL vxConv3dLayer(vx_graph graph, vx_tensor inputs,
*/
VX_API_ENTRY vx_node VX_API_CALL vxDeconv3dLayer(vx_graph graph, vx_tensor inputs, vx_tensor weights, vx_tensor biases, const vx_nn_deconvolution_3d_params_t *convolution_params, vx_size size_of_deconv_params, vx_tensor outputs);
/*! \brief [Graph] Creates a layer normalization Node.
* \details Normalize the activations of the previous layer at each batch, i.e. applies a transformation that maintains the mean activation close to 0 and the activation standard deviation close to 1.
* \param [in] graph The handle to the graph.
* \param [in] eps [static] Float 32. Small value added to the variance estimate to avoid division by zero (default is 1e-5).
* \param [in] axis [static] The axis along which to normalize.
* \param [in] input_list [static] The input tensor data.
* \param [in] input_count [static] The input tensor number.
* \param [out] output [static] The output tensor data.
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
* \ingroup group_cnn
*/
VX_API_ENTRY vx_node VX_API_CALL vxLayerNormalizationLayer(
vx_graph graph,
vx_float32 eps,
vx_int32 axis,
vx_tensor* input_list,
vx_uint32 input_count,
vx_tensor output
);
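A usage sketch; note that the composition of input_list is an assumption here ({input, gamma, beta} is common for layer norm), since the doc comment above only gives the count:

static vx_node make_layer_norm(vx_graph graph, vx_tensor input,
                               vx_tensor gamma, vx_tensor beta, vx_tensor output)
{
    vx_tensor input_list[3] = { input, gamma, beta }; /* layout assumed */
    return vxLayerNormalizationLayer(graph, 1e-5f /* eps */, 0 /* axis */,
                                     input_list, 3, output);
}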
/*! \brief [Graph] Creates an instance normalization layer Node.
* \details Normalize the activations of the previous layer at each batch, i.e. applies a transformation that maintains the mean activation close to 0 and the activation standard deviation close to 1.
* \param [in] graph The handle to the graph.
* \param [in] eps [static] Float 32. Small value added to the variance estimate to avoid division by zero (default is 1e-5).
* \param [in] input_list [static] The input tensor data.
* \param [in] input_count [static] The input tensor number.
* \param [out] output [static] The output tensor data.
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
* \ingroup group_cnn
*/
VX_API_ENTRY vx_node VX_API_CALL vxInstanceNormalizationLayer(
vx_graph graph,
vx_float32 eps,
vx_tensor* input_list,
vx_uint32 input_count,
vx_tensor output
);
/*! \brief [Graph] Creates a group normalization layer Node.
* \details Normalize the activations of the previous layer at each batch, i.e. applies a transformation that maintains the mean activation close to 0 and the activation standard deviation close to 1.
* \param [in] graph The handle to the graph.
* \param [in] eps [static] Float 32. Small value added to the variance estimate to avoid division by zero (default is 1e-5).
* \param [in] group_num [static] Int 32. Number of groups for GN
* \param [in] input_list [static] The input tensor data.
* \param [in] input_count [static] The input tensor number.
* \param [out] output [static] The output tensor data.
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
* \ingroup group_cnn
*/
VX_API_ENTRY vx_node VX_API_CALL vxGroupNormalizationLayer(
vx_graph graph,
vx_float32 eps,
vx_int32 group_num,
vx_tensor* input_list,
vx_uint32 input_count,
vx_tensor output
);
/*! \brief [Graph] Creates a layer logical ops Node.
* \details Return the truth value of x AND, OR, or XOR y, element-wise.
* \param [in] graph The handle to the graph.
* \param [in] ops_type [static] Int 32. Operation Type
* \param [in] input_list [static] The input tensor data.
* \param [in] input_count [static] The input tensor number.
* \param [out] output [static] The output tensor data.
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
* \ingroup group_cnn
*/
VX_API_ENTRY vx_node VX_API_CALL vxLogicalOpsLayer(
vx_graph graph,
vx_int32 ops_type,
vx_tensor* input_list,
vx_uint32 input_count,
vx_tensor output
);
/*! \brief [Graph] Creates a layer logical not Node.
* \details Return the truth value of NOT x, element-wise.
* \param [in] graph The handle to the graph.
* \param [in] input [static] The input tensor data.
* \param [out] output [static] The output tensor data.
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
* \ingroup group_cnn
*/
VX_API_ENTRY vx_node VX_API_CALL vxLogicalNotLayer(
vx_graph graph,
vx_tensor input,
vx_tensor output
);
/*! \brief [Graph] Creates a relational layer Node.
* \param [in] graph The handle to the graph.
* \param [in] ops_type [static] Int 32. Operation Type
* \param [in] input_list [static] The input tensor data.
* \param [in] input_count [static] The input tensor number.
* \param [out] output [static] The output tensor data.
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
* \ingroup group_cnn
*/
VX_API_ENTRY vx_node VX_API_CALL vxRelationalLayer(
vx_graph graph,
vx_int32 ops_type,
vx_tensor* input_list,
vx_uint32 input_count,
vx_tensor output
);
/*! \brief [Graph] Computes the max of elements across dimensions of input tensor.
* \param [in] graph The handle to the graph.
* \param [in] inputs input tensor data.
* \param [in] axis [static] used to determine the max across which dimension (dimension 0 means width, etc). If not given, compute the max across all dimensions.
* \param [in] keep_dims [static] whether to keep the dimension count.
* \param [out] outputs output tensor data.
* \ingroup group_tensor
* \return <tt> vx_node</tt>.
* \retval 0 Node could not be created.
* \retval * Node handle.
* \version 0.3
*/
VX_API_ENTRY vx_node VX_API_CALL vxTensorReduceMaxNode(
vx_graph graph,
vx_tensor inputs,
vx_tensor axis,
vx_bool keep_dims,
vx_tensor outputs);
/*! \brief [Graph] Creates a minimum layer Node.
* \param [in] graph The handle to the graph.
* \param [in] input_list [static] The input tensor data.
* \param [in] input_count [static] The input tensor number.
* \param [out] output [static] The output tensor data.
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
* \ingroup group_cnn
*/
VX_API_ENTRY vx_node VX_API_CALL vxMinimumLayer(
vx_graph graph,
vx_tensor* input_list,
vx_uint32 input_count,
vx_tensor output
);
/*! \brief [Graph] Creates a maximum layer Node.
* \param [in] graph The handle to the graph.
* \param [in] input_list [static] The input tensor data.
* \param [in] input_count [static] The input tensor number.
* \param [out] output [static] The output tensor data.
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
* \ingroup group_cnn
*/
VX_API_ENTRY vx_node VX_API_CALL vxMaximumLayer(
vx_graph graph,
vx_tensor* input_list,
vx_uint32 input_count,
vx_tensor output
);
/*! \brief [Graph] Creates a tensor select layer Node.
* \param [in] graph The handle to the graph.
* \param [in] input_list [static] The input tensor data.
* \param [in] input_count [static] The input tensor number.
* \param [out] output [static] The output tensor data.
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
* \ingroup group_cnn
*/
VX_API_ENTRY vx_node VX_API_CALL vxTensorSelectLayer(
vx_graph graph,
vx_tensor* input_list,
vx_uint32 input_count,
vx_tensor output
);
/*! \brief [Graph] Creates a layer gru cell activation z h Node.
* \param [in] graph The handle to the graph.
* \param [in] input_list [static] The input tensor data.
* \param [in] input_count [static] The input tensor number.
* \param [in] recurrent_activation [static] recurrent activation type.
* \param [in] activation [static] activation type.
* \param [out] output_list [static] The output tensor data.
* \param [out] output_count [static] The output tensor number.
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
* \ingroup group_cnn
*/
VX_API_ENTRY vx_node VX_API_CALL vxGruCellActivationZHLayer(
vx_graph graph,
vx_tensor* input_list,
vx_uint32 input_count,
vx_int32 recurrent_activation,
vx_int32 activation,
vx_tensor* output_list,
vx_uint32 output_count
);
/*! \brief [Graph] Creates a layer gru cell h times activation r Node.
* \param [in] graph The handle to the graph.
* \param [in] input_list [static] The input tensor data.
* \param [in] input_count [static] The input tensor number.
* \param [in] recurrent_activation [static] recurrent activation type.
* \param [out] output_list [static] The output tensor data.
* \param [out] output_count [static] The output tensor number.
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
* \ingroup group_cnn
*/
VX_API_ENTRY vx_node VX_API_CALL vxGruCellHTimeActivationRLayer(
vx_graph graph,
vx_tensor* input_list,
vx_uint32 input_count,
vx_int32 recurrent_activation,
vx_tensor* output_list,
vx_uint32 output_count
);
/*! \brief [Graph] Creates a layer gru cell reset after activation Node.
* \param [in] graph The handle to the graph.
* \param [in] input_list [static] The input tensor data.
* \param [in] input_count [static] The input tensor number.
* \param [in] recurrent_activation [static] recurrent activation type.
* \param [in] activation [static] activation type.
* \param [out] output_list [static] The output tensor data.
* \param [out] output_count [static] The output tensor number.
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
* \ingroup group_cnn
*/
VX_API_ENTRY vx_node VX_API_CALL vxGruCellResetAfterActivationLayer(
vx_graph graph,
vx_tensor* input_list,
vx_uint32 input_count,
vx_int32 recurrent_activation,
vx_int32 activation,
vx_tensor* output_list,
vx_uint32 output_count
);
/*! \brief [Graph] Creates a layer lstm activation Node.
* \param [in] graph The handle to the graph.
* \param [in] input_list [static] The input tensor data.
* \param [in] input_count [static] The input tensor number.
* \param [in] lstm_activation_param <tt>\ref vx_nn_lstm_activation_params_t </tt>.
* \param [out] output_list [static] The output tensor data.
* \param [out] output_count [static] The output tensor number.
* \return <tt> vx_node</tt>.
* \returns A node reference <tt>\ref vx_node</tt>. Any possible errors preventing a
* successful creation should be checked using <tt>\ref vxGetStatus</tt>.
* \ingroup group_cnn
*/
VX_API_ENTRY vx_node VX_API_CALL vxLSTMActivationLayer(
vx_graph graph,
vx_tensor* input_list,
vx_uint32 input_count,
const vx_nn_lstm_activation_params_t * lstm_activation_param,
vx_tensor* output_list,
vx_uint32 output_count
);
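A fill-in sketch for the activation params (the flags select the LSTM variant per the field names; the activation enum values are driver-defined, so 0 below is only a placeholder):

static vx_node make_lstm_activation(vx_graph graph,
                                    vx_tensor * in_list, vx_uint32 in_count,
                                    vx_tensor * out_list, vx_uint32 out_count)
{
    vx_nn_lstm_activation_params_t p = { 0 };
    p.is_cifg = 1;              /* coupled input-forget-gate variant */
    p.forget_bias = 1.0f;
    p.recurrent_activation = 0; /* placeholder: driver-defined enum */
    return vxLSTMActivationLayer(graph, in_list, in_count, &p,
                                 out_list, out_count);
}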
#ifdef __cplusplus
}
#endif

View File

@@ -242,6 +242,48 @@ typedef struct _vx_nn_convolution_relu_pooling_params_ext7_t
vx_bool isSub;
} vx_nn_convolution_relu_pooling_params_ext7_t, * vx_nn_convolution_relu_pooling_params_ext7;
typedef struct _vx_nn_fused_sp_params_t
{
vx_enum multi_sp_kernel_type;
/*!<for mul>*/
vx_scalar mul_scale;
/*!<for sp>*/
union
{
struct
{
vx_scalar linear_a, linear_b;
} linear;
struct
{
vx_scalar tanh_a, tanh_b;
float a_v, b_v;
} tanh_linear;
struct
{
vx_scalar hsigmoid_a, hsigmoid_b;
} hsigmoid;
struct
{
vx_scalar clip_a, clip_b;
} clip;
struct
{
vx_scalar scalar_a, scalar_b, scalar_c, scalar_d;
} params;
} scalar_params;
/*!<for other kernel>*/
} vx_nn_fused_sp_params_t, * vx_nn_fused_sp_params;
typedef struct _vx_nn_convolution_relu_pooling_params_sp_ext_t
{
vx_nn_convolution_relu_pooling_params_ext4_t ext4; /*!< \brief convolution relu pooling params <tt>\ref vx_nn_convolution_relu_pooling_params_ext_t</tt> */
vx_object_array inputs_list;
vx_object_array outputs_list;
vx_nn_fused_sp_params_t sp_param;
} vx_nn_convolution_relu_pooling_params_sp_ext_t, * vx_nn_convolution_relu_pooling_params_sp_ext;
/*! \brief [Graph] Creates a Convolutional Network Convolution and Activation(Relu) and Pooling Layer Node, this fucntion match kronos NN Extension 1.2 verion.
* \details This function implement Convolutional Network Convolution and Activation(Relu) and Pooling layer.
* For fixed-point data types, a fixed point calculation is performed with round and saturate according to the number of accumulator bits. The number of the accumulator bits are implementation defined,
@@ -1129,6 +1171,48 @@ VX_API_ENTRY vx_node VX_API_CALL vxBatchGemmReluPoolingLayer(vx_graph graph,
const vx_nn_gemm_relu_pooling_params merge_param,
vx_tensor output);
/*! \brief Create a fuse stream process node.
* \param [in] graph The handle to the graph.
* \param [in] input_list input tensor list.
* \param [in] input_count input tensor number.
* \param [in] output_list output tensor list.
* \param [in] output_count output tensor number.
* \param [in] params the parameters for multi streamprocessor merging.
* \return <tt>\ref vx_node</tt>.
* \retval vx_node A node reference. Any possible errors preventing a successful creation
* should be checked using <tt>\ref vxGetStatus</tt>
* \ingroup group_vision_function_sp
*/
VX_API_ENTRY vx_node VX_API_CALL vxFusedSpNode(
vx_graph graph,
vx_tensor* input_list,
vx_uint32 input_count,
vx_tensor* output_list,
vx_uint32 output_count,
const vx_nn_fused_sp_params_t * params
);
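A creation sketch with illustrative values (the stream-processor kernel-type enum is driver-defined and not listed in this diff):

static vx_node make_fused_sp(vx_graph graph,
                             vx_tensor * in_list, vx_uint32 in_count,
                             vx_tensor * out_list, vx_uint32 out_count,
                             vx_scalar scale_scalar)
{
    vx_nn_fused_sp_params_t sp_params = { 0 };
    sp_params.multi_sp_kernel_type = 0; /* placeholder: driver-defined enum */
    sp_params.mul_scale = scale_scalar; /* vx_scalar created elsewhere */
    return vxFusedSpNode(graph, in_list, in_count,
                         out_list, out_count, &sp_params);
}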
/*! \brief Create a conv fuse stream process node.
* \param [in] graph The handle to the graph.
* \param [in] inputs input tensor.
* \param [in] weights_biases [static] Point to WeightBiasesParameter data, vx_weights_biases_parameter is an opaque reference.
* \param [in] convolution_relu_pooling_params [static] Pointer to parameters of type <tt>\ref vx_nn_convolution_relu_pooling_params_t</tt>
* \param [in] size_of_convolution_relu_pooling_params [static] Size in bytes of convolution_relu_pooling_params.
* \param [in] outputs output tensor.
* \return <tt>\ref vx_node</tt>.
* \retval vx_node A node reference. Any possible errors preventing a successful creation
* should be checked using <tt>\ref vxGetStatus</tt>
* \ingroup group_vision_function_sp
*/
VX_API_ENTRY vx_node VX_API_CALL vxConvSpNode(
vx_graph graph,
vx_tensor inputs,
vx_weights_biases_parameter weights_biases,
const vx_nn_convolution_relu_pooling_params_t * convolution_relu_pooling_params,
vx_size size_of_convolution_relu_pooling_params,
vx_tensor outputs
);
#ifdef __cplusplus
}
#endif

View File

@@ -345,16 +345,6 @@ VX_API_ENTRY vx_spinst VX_API_CALL vxCreateSPINST(
vx_context context
);
-/*! \brief Creates an internal reference to a spinst data.
- * \param [in] context The reference to the implementation context.
- * \return A spinst data reference.
- * \Any possible errors preventing a successful creation should be checked using <tt>\ref vxGetStatus</tt>.
- * \ingroup group_object_spinst
- */
-VX_API_ENTRY vx_spinst VX_API_CALL vxCreateSPINSTInternal(
-    vx_context context
-);
/*! \brief Releases a reference to a external spinst object.
* The object may not be garbage collected until its total reference count is zero.
* \param [in] spinst_obj The pointer to the spinst data to release.
@@ -368,19 +358,6 @@ VX_API_ENTRY vx_status VX_API_CALL vxReleaseSPINST(
vx_spinst *spinst_obj
);
-/*! \brief Releases a reference to a internal spinst object.
- * The object may not be garbage collected until its total reference count is zero.
- * \param [in] spinst_obj The pointer to the spinst data to release.
- * \post After returning from this function the reference is zeroed.
- * \return A <tt>\ref vx_status_e</tt> enumeration.
- * \retval VX_SUCCESS No errors; all other values indicate failure
- * \retval * An error occurred. See <tt>\ref vx_status_e</tt>.
- * \ingroup group_object_spinst
- */
-VX_API_ENTRY vx_status VX_API_CALL vxReleaseSPINSTInternal(
-    vx_spinst *spinst_obj
-);
/*! \brief Add a instruction to spinst object.
* \param [in] spinst_obj The reference to the spinst object.
* \param [in] inst_unit_array The units of one instruction. Use a <tt>\ref vx_spinst_unit_param</tt>.

View File

@@ -477,6 +477,8 @@ enum vx_type_e {
VX_TYPE_SPINST = 0x81B,/*!< \brief A <tt>\ref vx_spinst</tt>. */
VX_TYPE_INT4 = 0x81C,/*!< \brief A <tt>\ref signed 4bits tensor.</tt>. */
VX_TYPE_UINT4 = 0x81D,/*!< \brief A <tt>\ref unsigned 4bits tensor.</tt>. */
VX_TYPE_FLOAT8_E4M3 = 0x81E,/*!< \brief A <tt>\ref vx_float8_e4m3</tt>. */
VX_TYPE_FLOAT8_E5M2 = 0x81F,/*!< \brief A <tt>\ref vx_float8_e5m2</tt>. */
};
/*! \brief The enumeration of all status codes.
@@ -803,6 +805,8 @@ enum vx_convert_policy_e {
VX_CONVERT_POLICY_WRAP = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CONVERT_POLICY) + 0x0,
/*! \brief Results are saturated to the bit depth of the output operand. */
VX_CONVERT_POLICY_SATURATE = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_CONVERT_POLICY) + 0x1,
/*! \brief Results preserve infinity and NaN values. */
VX_CONVERT_POLICY_INF = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_CONVERT_POLICY) + 0x0,
};
/*! \brief Based on the VX_DF_IMAGE definition.

View File

@@ -6,3 +6,6 @@ DEF_NODE_TYPE(custom_ainr_denoise_postprocess)
DEF_NODE_TYPE(custom_warp_affine)
DEF_NODE_TYPE(custom_warp_perspective)
DEF_NODE_TYPE(custom_sample)
DEF_NODE_TYPE(custom_tiny_yolov4_postprocess)
DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_confidence)
DEF_NODE_TYPE(custom_tiny_yolov4_postprocess_box)

View File

@@ -6,3 +6,6 @@ DEF_OP(CUSTOM_AINR_DENOISE_POSTPROCESS)
DEF_OP(CUSTOM_WARP_AFFINE)
DEF_OP(CUSTOM_WARP_PERSPECTIVE)
DEF_OP(CUSTOM_SAMPLE)
DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS)
DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE)
DEF_OP(CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX)

View File

@@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_H
#define _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_custom_tiny_yolov4_postprocess_param
{
struct _custom_tiny_yolov4_postprocess_local_data_t* local;
// Add parameters here
} vsi_nn_custom_tiny_yolov4_postprocess_param;
_compiler_assert(offsetof(vsi_nn_custom_tiny_yolov4_postprocess_param, local) == 0, \
vsi_nn_custom_tiny_yolov4_postprocess_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,49 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX_H
#define _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_custom_tiny_yolov4_postprocess_box_param
{
struct _custom_tiny_yolov4_postprocess_box_local_data_t* local;
// Add parameters here
float bias_0;
float bias_1;
} vsi_nn_custom_tiny_yolov4_postprocess_box_param;
_compiler_assert(offsetof(vsi_nn_custom_tiny_yolov4_postprocess_box_param, local) == 0, \
vsi_nn_custom_tiny_yolov4_postprocess_box_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE_H
#define _VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_custom_tiny_yolov4_postprocess_confidence_param
{
struct _custom_tiny_yolov4_postprocess_confidence_local_data_t* local;
// Add parameters here
} vsi_nn_custom_tiny_yolov4_postprocess_confidence_param;
_compiler_assert(offsetof(vsi_nn_custom_tiny_yolov4_postprocess_confidence_param, local) == 0, \
vsi_nn_custom_tiny_yolov4_postprocess_confidence_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -38,6 +38,7 @@ typedef struct _vsi_nn_custom_warp_affine_param
const float *matrix;
vsi_enum type;
int32_t size[2];
vsi_enum rgb_type;
} vsi_nn_custom_warp_affine_param;
_compiler_assert(offsetof(vsi_nn_custom_warp_affine_param, local) == 0, \
vsi_nn_custom_warp_affine_h );

View File

@@ -31,5 +31,8 @@
#include "custom/ops/vsi_nn_op_custom_warp_affine.h"
#include "custom/ops/vsi_nn_op_custom_warp_perspective.h"
#include "custom/ops/vsi_nn_op_custom_sample.h"
#include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess.h"
#include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_confidence.h"
#include "custom/ops/vsi_nn_op_custom_tiny_yolov4_postprocess_box.h"
#endif

View File

@@ -193,3 +193,4 @@ DEF_OP(REVERSESEQUENCE)
DEF_OP(INVERSE_SIGMOID)
DEF_OP(GRID_SAMPLE)
DEF_OP(LPNORM)
DEF_OP(RESIZE_3D)

src/tim/vx/internal/include/internal/internal_ops.def Executable file → Normal file
View File

@@ -20,4 +20,3 @@ DEF_OP(SPACE2DEPTH_INTERNAL)
DEF_OP(GRUCELL_H_TIMES_ACTIVATION_R)
DEF_OP(GRUCELL_ACTIVATION_Z_H)
DEF_OP(REDUCE_MEAN_INTERNAL)
-DEF_OP(BILINEAR_GRID_SAMPLE)

View File

@@ -79,6 +79,8 @@ typedef enum
BOOL8,
I4,
U4,
FP8_E4M3,
FP8_E5M2,
} VSI_PUBLIC_TYPE vsi_nn_kernel_dtype_e;
typedef enum
@@ -89,6 +91,8 @@ typedef enum
VSI_NN_KERNEL_QUANT_ASYMM_PERCHANNEL,
VSI_NN_KERNEL_QUANT_SYMM,
VSI_NN_KERNEL_QUANT_SYMM_PERCHANNEL,
VSI_NN_KERNEL_QUANT_FLOAT8,
VSI_NN_KERNEL_QUANT_FLOAT8_PERCHANNEL,
VSI_NN_KERNEL_QUANT_TYPE_NUM
} vsi_nn_kernel_quant_type_e;
@@ -522,6 +526,10 @@ static VSI_INLINE_API vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype
return BF16;
case VSI_NN_TYPE_FLOAT32:
return F32;
case VSI_NN_TYPE_FLOAT8_E4M3:
return FP8_E4M3;
case VSI_NN_TYPE_FLOAT8_E5M2:
return FP8_E5M2;
default:
VSILOGE("error data type %d", dtype);
break;
@@ -579,6 +587,8 @@ static VSI_INLINE_API size_t vsi_nn_kernel_dtype_get_bytes
case I8:
case U8:
case BOOL8:
case FP8_E4M3:
case FP8_E5M2:
return sizeof(int8_t);
case I16:
case U16:
@@ -611,6 +621,8 @@ static VSI_INLINE_API vsi_size_t vsi_nn_kernel_dtype_get_bits
case I8:
case U8:
case BOOL8:
case FP8_E4M3:
case FP8_E5M2:
return 8;
case I16:
case U16:
@@ -879,7 +891,7 @@ static VSI_INLINE_API void vsi_nn_kernel_tensor_attr_get_stride
shape = attr->shape->data;
type_bits = vsi_nn_kernel_dtype_get_bits( attr->dtype );
-if ( type_bits < BITS_PER_BYTE )
+if ( type_bits < BITS_PER_BYTE && type_bits != 0)
{
vsi_size_t i;

View File

@@ -91,4 +91,21 @@ vsi_bool vsi_nn_kernel_optimize_scatter_elements_shape
vsi_size_t* out_shape_x, uint32_t* out_rank_x, int32_t* out_axis, vsi_size_t max_size
);
vsi_bool vsi_nn_kernel_optimize_matrixmul_broadcast_shape
(
const vsi_size_t * shape_x,
const vsi_size_t * shape_y,
const vsi_size_t * shape_output,
vsi_size_t rank_x,
vsi_size_t rank_y,
vsi_size_t rank_out,
vsi_size_t* out_shape_x,
vsi_size_t* out_shape_y,
vsi_size_t* out_shape_output,
uint32_t* new_rank,
uint32_t* cross_flg,
uint32_t* size_axis_inner_outer,
uint32_t* strides_axis_inner_outer
);
#endif

View File

@@ -82,6 +82,12 @@ typedef struct _vsi_nn_pre_process_param
vsi_nn_pre_process_type_e type;
struct
{
float mean[3];
float scale[3];
} norm2;
vsi_nn_pre_process_lcl_data *local;
} vsi_nn_pre_process_param;

View File

@@ -65,6 +65,10 @@ typedef struct _vsi_nn_pre_process_bgra_param
vsi_bool reverse_channel;
float r_scale;
float g_scale;
float b_scale;
/* pre process rgb layer local data structure */
vsi_nn_pre_process_bgra_lcl_data local;
} vsi_nn_pre_process_bgra_param;

View File

@@ -70,6 +70,10 @@ typedef struct _vsi_nn_pre_process_nv12_param
vsi_nn_pre_process_nv12_lcl_data* local;
vsi_nn_nv_type nv_type;
float r_scale;
float g_scale;
float b_scale;
} vsi_nn_pre_process_nv12_param;
#ifdef __cplusplus

View File

@@ -76,6 +76,9 @@ typedef struct _vsi_nn_pre_process_rgb_param
vsi_bool reverse_channel;
float r_scale;
float g_scale;
float b_scale;
/* pre process rgb layer local data structure */
vsi_nn_pre_process_rgb_lcl_data local;
} vsi_nn_pre_process_rgb_param;

View File

@@ -53,6 +53,15 @@ typedef struct _vsi_nn_pre_process_rgb888_planar_param
float g_mean;
float b_mean;
float scale;
vsi_bool reverse_channel;
vsi_bool enable_rgb88_planar_nhwc;
float r_scale;
float g_scale;
float b_scale;
} vsi_nn_pre_process_rgb888_planar_param;
_compiler_assert(offsetof(vsi_nn_pre_process_rgb888_planar_param, local) == 0, \
vsi_nn_pre_process_rgb888_planar_h );

View File

@@ -66,6 +66,11 @@ typedef struct _vsi_nn_pre_process_yuv420_param
float rgb_scale;
vsi_bool reverse_channel;
float r_scale;
float g_scale;
float b_scale;
/* local data must be the first. */
vsi_nn_pre_process_yuv420_lcl_data local;
} vsi_nn_pre_process_yuv420_param;

View File

@@ -71,6 +71,10 @@ typedef struct _vsi_nn_pre_process_yuv422_param
float rgb_scale;
vsi_bool reverse_channel;
float r_scale;
float g_scale;
float b_scale;
} vsi_nn_pre_process_yuv422_param;

#ifdef __cplusplus
View File

@@ -66,6 +66,10 @@ typedef struct _vsi_nn_pre_process_yuv444_param
float rgb_scale;
vsi_bool reverse_channel;
float r_scale;
float g_scale;
float b_scale;
/* local data must be the first. */
vsi_nn_pre_process_yuv444_lcl_data* local;
} vsi_nn_pre_process_yuv444_param;

View File

@@ -22,8 +22,8 @@
*
*****************************************************************************/
-#ifndef _VSI_NN_OP_BILINEAR_GRID_SAMPLE_H
-#define _VSI_NN_OP_BILINEAR_GRID_SAMPLE_H
+#ifndef _VSI_NN_OP_RESIZE_3D_H
+#define _VSI_NN_OP_RESIZE_3D_H
#include "vsi_nn_types.h"
@@ -31,17 +31,19 @@
extern "C" {
#endif
-typedef struct _vsi_nn_bilinear_grid_sample_param
-{
-struct _bilinear_grid_sample_local_data_t* local;
-vsi_bool align_corners;
-vsi_nn_pad_mode_e padding_mode;
-int32_t const_val;
-} vsi_nn_bilinear_grid_sample_param;
-_compiler_assert(offsetof(vsi_nn_bilinear_grid_sample_param, local) == 0, \
-vsi_nn_bilinear_grid_sample_h );
+typedef struct _vsi_nn_resize_3d_local_data {
+vsi_bool use_internal_node;
+} vsi_nn_resize_3d_local_data;
+typedef struct _vsi_nn_resize_3d_param
+{
+vsi_nn_resize_3d_local_data* lcl_data;
+vsi_enum type;
+float factor;
+int32_t size[3];
+vsi_bool align_corners;
+vsi_bool half_pixel_centers;
+} vsi_nn_resize_3d_param;
#ifdef __cplusplus
}

View File

@@ -33,6 +33,7 @@ extern "C" {
typedef struct _vsi_nn_topk_param
{
uint32_t k;
int32_t axis;
} vsi_nn_topk_param;
#ifdef __cplusplus

View File

@@ -52,7 +52,9 @@ enum {
D_BF16 = VSI_NN_TYPE_BFLOAT16,
D_BOOL8 = VSI_NN_TYPE_BOOL8,
D_I4 = VSI_NN_TYPE_INT4,
-D_U4 = VSI_NN_TYPE_UINT4
+D_U4 = VSI_NN_TYPE_UINT4,
D_F8_E4M3 = VSI_NN_TYPE_FLOAT8_E4M3,
D_F8_E5M2 = VSI_NN_TYPE_FLOAT8_E5M2
};
/* short alias for qtype */
@@ -63,6 +65,8 @@ enum {
Q_ASYM = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC << Q_SHIFT,
Q_SYM_PC = VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC << Q_SHIFT,
Q_SYM = VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC << Q_SHIFT,
Q_FP8 = VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 << Q_SHIFT,
Q_FP8_PC = VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 << Q_SHIFT,
};
typedef struct {

View File

@@ -27,6 +27,7 @@
#include "vsi_nn_types.h"
#include "vsi_nn_math.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_log.h"
#ifdef __cplusplus
extern "C" {
@@ -78,6 +79,8 @@ static VSI_INLINE_API vsi_bool type_is_signed
case VSI_NN_TYPE_FLOAT32:
case VSI_NN_TYPE_FLOAT64:
case VSI_NN_TYPE_BFLOAT16:
case VSI_NN_TYPE_FLOAT8_E4M3:
case VSI_NN_TYPE_FLOAT8_E5M2:
ret = TRUE;
break;
default:
@@ -93,9 +96,14 @@ static VSI_INLINE_API uint32_t type_get_bytes
{
switch( type )
{
case VSI_NN_TYPE_INT4:
case VSI_NN_TYPE_UINT4:
return 0;
case VSI_NN_TYPE_INT8:
case VSI_NN_TYPE_UINT8:
case VSI_NN_TYPE_BOOL8:
case VSI_NN_TYPE_FLOAT8_E4M3:
case VSI_NN_TYPE_FLOAT8_E5M2:
return 1;
case VSI_NN_TYPE_INT16:
case VSI_NN_TYPE_UINT16:
@@ -111,7 +119,8 @@ static VSI_INLINE_API uint32_t type_get_bytes
case VSI_NN_TYPE_FLOAT64:
return 8;
default:
-return 0;
+VSILOGE("unsupported type: %d", type);
+return 1;
}
} /* type_get_bytes() */
@@ -128,6 +137,8 @@ static VSI_INLINE_API uint32_t type_get_bits
case VSI_NN_TYPE_INT8:
case VSI_NN_TYPE_UINT8:
case VSI_NN_TYPE_BOOL8:
case VSI_NN_TYPE_FLOAT8_E4M3:
case VSI_NN_TYPE_FLOAT8_E5M2:
return 8;
case VSI_NN_TYPE_INT16:
case VSI_NN_TYPE_UINT16:
@@ -143,7 +154,8 @@ static VSI_INLINE_API uint32_t type_get_bits
case VSI_NN_TYPE_FLOAT64:
return 64;
default:
-return 0;
+VSILOGE("unsupported type: %d", type);
+return 1;
}
} /* type_get_bits() */
@@ -236,6 +248,7 @@ static VSI_INLINE_API float affine_to_fp32
)
{
float data;
VSI_UNREFERENCED(type);
data = ( (float)val - zero_point ) * scale;
return data;
} /* affine_to_fp32() */
@@ -279,6 +292,7 @@ static VSI_INLINE_API float dfp_to_fp32
)
{
float result;
VSI_UNREFERENCED(type);
if( fl > 0 )
{
result = (float)val * ( 1.0f / ( (float) ( (int64_t)1 << fl ) ) );
@@ -440,6 +454,139 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne
return out;
} /* fp32_to_bfp16_rtne */
#define FLOAT_BIAS_EXPONENT 127
#define FLOAT_EXPONENT_SIZE 8
#define FLOAT_MANTISSA_SIZE 23
#define FLOAT8_E4M3_BIAS_EXPONENT 7
#define FLOAT8_E4M3_EXPONENT_SIZE 4
#define FLOAT8_E4M3_MANTISSA_SIZE 3
#define FLOAT8_E5M2_BIAS_EXPONENT 15
#define FLOAT8_E5M2_EXPONENT_SIZE 5
#define FLOAT8_E5M2_MANTISSA_SIZE 2
static VSI_INLINE_API uint8_t fp32_to_fp8_e4m3(float in, const float scale) {
float fp8_f32 = in / scale;
int32_t fp8_i32 = *((int32_t*)&fp8_f32);
//int32_t mask = (int32_t)(pow(2, 32) - 1 - (pow(2, 23 - 3) - 1));
int32_t eps = 1 << (23 - 3 - 1);
fp8_i32 += eps;
//fp8_i32 &= mask;
{
int sign = (fp8_i32 >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1;
int exp = (fp8_i32 >> FLOAT_MANTISSA_SIZE) & 0xff;
int expShiftValue = FLOAT8_E4M3_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT;
int mantissa = (fp8_i32 >> (FLOAT_MANTISSA_SIZE - FLOAT8_E4M3_MANTISSA_SIZE)) & 0x7;
exp = (exp + expShiftValue) & 0xF;
return (uint8_t)(sign << 7 | exp << 3 | mantissa);
}
} /* fp32_to_fp8_e4m3() */
static VSI_INLINE_API uint8_t fp32_to_fp8_e5m2(float in, const float scale) {
float fp8_f32 = in / scale;
int32_t fp8_i32 = *((int32_t*)&fp8_f32);
//int32_t mask = (int32_t)(pow(2, 32) - 1 - (pow(2, 23 - 2) - 1));
int32_t eps = 1 << (23 - 2 - 1);
fp8_i32 += eps;
//fp8_i32 &= mask;
{
int sign = (fp8_i32 >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1;
int exp = (fp8_i32 >> FLOAT_MANTISSA_SIZE) & 0xff;
int expShiftValue = FLOAT8_E5M2_BIAS_EXPONENT - FLOAT_BIAS_EXPONENT;
int mantissa = (fp8_i32 >> (FLOAT_MANTISSA_SIZE - FLOAT8_E5M2_MANTISSA_SIZE)) & 0x3;
exp = (exp + expShiftValue) & 0x1F;
return (uint8_t)(sign << 7 | exp << 2 | mantissa);
}
} /* fp32_to_fp8_e5m2() */
static VSI_INLINE_API float fp8_e4m3_to_fp32(uint8_t in, const float scale) {
float val_fp32;
uint32_t signOut = 0;
uint32_t exponentOut = 0;
uint32_t mantissaOut = 0;
uint32_t out_u = 0;
uint32_t signIn;
uint32_t exponentIn;
uint32_t mantissaIn;
int expShiftValue = FLOAT_BIAS_EXPONENT - FLOAT8_E4M3_BIAS_EXPONENT;
signIn = (in >> (FLOAT8_E4M3_EXPONENT_SIZE + FLOAT8_E4M3_MANTISSA_SIZE)) & 0x1;
exponentIn = (in >> FLOAT8_E4M3_MANTISSA_SIZE) & 0xF;
mantissaIn = in & 0x7;
signOut = signIn;
if (exponentIn == 0 && mantissaIn == 0)
{
goto final;
}
if (exponentIn == 0xf && mantissaIn == 0x7)
{
exponentOut = 0xff;
mantissaOut = 0x400000;
goto final;
}
exponentOut = (exponentIn + expShiftValue) & 0xff;
mantissaOut = (mantissaIn << (FLOAT_MANTISSA_SIZE - FLOAT8_E4M3_MANTISSA_SIZE)) & 0x7fffff;
final:
out_u = signOut << 31 | exponentOut << 23 | mantissaOut;
val_fp32 = *((float*)&out_u);
return val_fp32 * scale;
} /* fp8_e4m3_to_fp32() */
static VSI_INLINE_API float fp8_e5m2_to_fp32(int8_t in, const float scale) {
float val_fp32;
uint32_t signOut = 0;
uint32_t exponentOut = 0;
uint32_t mantissaOut = 0;
uint32_t out_u = 0;
uint32_t signIn;
uint32_t exponentIn;
uint32_t mantissaIn;
int expShiftValue = FLOAT_BIAS_EXPONENT - FLOAT8_E5M2_BIAS_EXPONENT;
signIn = (in >> 7) & 0x1;
exponentIn = (in >> 2) & 0x1F;
mantissaIn = in & 0x3;
signOut = signIn;
if (exponentIn == 0 && mantissaIn == 0)
{
goto final;
}
if (exponentIn == 0x1f && mantissaIn == 0x3)
{
exponentOut = 0xff;
mantissaOut = 0x400000;
goto final;
}
exponentOut = (exponentIn + expShiftValue) & 0xff;
mantissaOut = (mantissaIn << (FLOAT_MANTISSA_SIZE - FLOAT8_E5M2_MANTISSA_SIZE)) & 0x7fffff;
final:
out_u = signOut << 31 | exponentOut << 23 | mantissaOut;
val_fp32 = *((float*)&out_u);
return val_fp32 * scale;
} /* fp8_e5m2_to_fp32() */
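A worked round trip through the e4m3 helpers above (values checked by hand): with scale = 1.0f, 1.5f is 1.1b x 2^0, which encodes as sign 0, biased exponent 7, mantissa 100b, i.e. byte 0x3C, and decodes back exactly:

static void fp8_e4m3_round_trip_example(void)
{
    float scale = 1.0f;
    uint8_t q = fp32_to_fp8_e4m3(1.5f, scale); /* q == 0x3C */
    float back = fp8_e4m3_to_fp32(q, scale);   /* back == 1.5f */
    (void)back;
}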
static VSI_INLINE_API vsi_status dtype_to_float32
(
uint8_t *src,
@@ -458,6 +605,12 @@ static VSI_INLINE_API vsi_status dtype_to_float32
case VSI_NN_TYPE_BFLOAT16:
*dst = bfp16_to_fp32( *(int16_t *)src );
break;
case VSI_NN_TYPE_FLOAT8_E4M3:
*dst = fp8_e4m3_to_fp32(*(int8_t*)src, src_dtype->scale);
break;
case VSI_NN_TYPE_FLOAT8_E5M2:
*dst = fp8_e5m2_to_fp32(*(int8_t *)src, src_dtype->scale);
break;
case VSI_NN_TYPE_INT4:
case VSI_NN_TYPE_UINT4:
case VSI_NN_TYPE_INT8:
@@ -511,6 +664,12 @@ static VSI_INLINE_API vsi_status float32_to_dtype
case VSI_NN_TYPE_BFLOAT16:
*(int16_t *)dst = fp32_to_bfp16_rtne( src );
break;
case VSI_NN_TYPE_FLOAT8_E4M3:
*(int8_t *)dst = fp32_to_fp8_e4m3(src, dst_dtype->scale);
break;
case VSI_NN_TYPE_FLOAT8_E5M2:
*(int8_t *)dst = fp32_to_fp8_e5m2(src, dst_dtype->scale);
break;
case VSI_NN_TYPE_INT4:
case VSI_NN_TYPE_UINT4:
case VSI_NN_TYPE_INT8:

View File

@@ -30,7 +30,7 @@
extern "C"{
#endif
-#define vsi_nn_LinkListInitRoot(n) do{n = NULL;} while (0);
+#define vsi_nn_LinkListInitRoot(n) {n = NULL;}
typedef struct _vsi_nn_link_list
{

View File

@@ -53,12 +53,13 @@ extern "C" {
#define DEFINE_ARRAY_TYPE( NAME, TYPE ) \
typedef struct { \
size_t size; \
-TYPE data[0]; \
+TYPE *data; \
} vsi_##NAME##_array_t; \
static VSI_INLINE_API vsi_##NAME##_array_t * vsi_##NAME##_array_create( size_t size ) { \
-vsi_##NAME##_array_t * array = (vsi_##NAME##_array_t *)malloc( \
-sizeof(vsi_##NAME##_array_t) + sizeof(TYPE) * size ); \
+vsi_##NAME##_array_t * array = NULL; \
+array = (vsi_##NAME##_array_t *)malloc( sizeof(vsi_##NAME##_array_t) + sizeof(TYPE) * size ); \
if (array == NULL) return NULL; \
array->data = (TYPE *)(((TYPE**)(&(array->data))) + 1); \
array->size = size; \
return array; \
} \
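The data[0] flexible-array member becomes a real pointer that the create function re-aims at the payload just past the struct header, so a single malloc/free still covers both. A usage sketch, assuming a hypothetical instantiation DEFINE_ARRAY_TYPE( int, int32_t ):

static void array_usage_sketch(void)
{
    vsi_int_array_t * arr = vsi_int_array_create( 4 );
    if( arr )
    {
        arr->data[0] = 42; /* data points into the same allocation */
        free( arr );       /* one free releases header and payload */
    }
}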

View File

@@ -50,14 +50,23 @@ extern "C" {
free( _PTR ); _PTR = NULL; }
#define vsi_safe_release_tensor(_t) if(_t){vsi_nn_ReleaseTensor(&(_t)); _t = NULL;}
#if (defined(_WIN32) || defined(__WIN32__) || defined(WIN32))
#define END_OF_VARIADIC_ARGUMENTS ((size_t)0xbadcaffebadcaffe) #if defined(_WIN64)
#define END_OF_VARIADIC_ARGUMENTS ((size_t)0xbadcaffebadcaffe)
#else
#define END_OF_VARIADIC_ARGUMENTS ((size_t)0xbadcaffe)
#endif
#else
#define END_OF_VARIADIC_ARGUMENTS ((size_t)0xbadcaffebadcaffe)
#endif
#define FOREACH_ARGS(_args, _next, _arg_type) \ #define FOREACH_ARGS(_args, _next, _arg_type) \
while(((_arg_type)((size_t)END_OF_VARIADIC_ARGUMENTS)) != (_next = va_arg(_args, _arg_type))) while(((_arg_type)((size_t)END_OF_VARIADIC_ARGUMENTS)) != (_next = va_arg(_args, _arg_type)))
#define BITS_PER_BYTE 8 #define BITS_PER_BYTE 8
#define VSI_UNREFERENCED( param ) ( ( void ) ( param ) )
#define VSI_NN_STRINGIZE(X) VSI_NN_DO_STRINGIZE(X) #define VSI_NN_STRINGIZE(X) VSI_NN_DO_STRINGIZE(X)
#define VSI_NN_DO_STRINGIZE(X) #X #define VSI_NN_DO_STRINGIZE(X) #X
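The Win32 special case presumably exists because size_t is 32 bits there, so the 64-bit sentinel constant would be truncated; defining a 32-bit marker keeps the FOREACH_ARGS comparison well-defined. A sketch of how the sentinel terminates a variadic list (hypothetical helper; requires <stdarg.h>):
    static int _count_tensors( vsi_nn_tensor_t * first, ... )
    {
        va_list args;
        vsi_nn_tensor_t * next = NULL;
        int n = (first != NULL);
        va_start( args, first );
        FOREACH_ARGS( args, next, vsi_nn_tensor_t * )
        {
            n++;   /* runs once per argument until the marker is read */
        }
        va_end( args );
        return n;
    }
    /* call site: _count_tensors( t0, t1, (vsi_nn_tensor_t *)END_OF_VARIADIC_ARGUMENTS ); */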

View File

@@ -78,6 +78,7 @@ typedef struct _vsi_nn_runtime_option_t
     int32_t enable_asymi8_to_u8;
     int32_t enable_dataconvert_optimize;
     int32_t enable_stream_processor;
+    int32_t enable_rgb88_planar_nhwc;
 } vsi_nn_runtime_option_t;
 /**

View File

@@ -31,33 +31,42 @@
 #define VSI_ASSERT( cond ) assert(cond)
 #define VSI_CHECK_PTR( pointer, msg, retval ) \
-    do { \
+    { \
         if( pointer == NULL ) { \
             VSILOGD("%s",msg); \
             VSI_ASSERT(FALSE); \
         } \
-    } while(0)
+    }
-#define CHECK_STATUS_FAIL_GOTO( stat, lbl ) do {\
+#define CHECK_STATUS_FAIL_GOTO( stat, lbl ) {\
     if( VSI_SUCCESS != stat ) {\
         VSILOGE("CHECK STATUS(%d:%s)", (stat), vsi_nn_DescribeStatus(stat));\
         goto lbl;\
     }\
-    } while(0)
+    }
-#define CHECK_STATUS( stat ) do {\
+#define CHECK_STATUS( stat ) {\
     if( VSI_SUCCESS != stat ) {\
         VSILOGE("CHECK STATUS(%d:%s)", (stat), vsi_nn_DescribeStatus(stat));\
     }\
-    } while(0)
+    }
 #define CHECK_PTR_FAIL_GOTO( pointer, msg, lbl ) \
-    do { \
+    { \
         if( pointer == NULL ) { \
             VSILOGD("CHECK POINTER %s", msg); \
             goto lbl; \
         } \
-    } while(0)
+    }
+#define CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE( pointer, node, msg, lbl ) \
+    { \
+        if( pointer == NULL ) { \
+            vsi_nn_internal_release_node(&node);\
+            VSILOGD("CHECK POINTER %s", msg); \
+            goto lbl; \
+        } \
+    }
 #endif
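A side note on the switch from the do { ... } while(0) idiom to bare braces (general C behavior, not something the diff states): a brace-block macro followed by the caller's semicolon leaves an empty statement behind, so an unbraced if/else around such a macro no longer compiles:
    if (cond)
        CHECK_STATUS( stat );  /* expands to { ... } ; - the ';' is an extra statement */
    else                       /* error: 'else' without a matching 'if' */
        fallback();
Call sites that wrap these macros in if/else therefore need explicit braces.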

View File

@@ -1,26 +1,3 @@
-/****************************************************************************
-*
-*    Copyright (c) 2019 Vivante Corporation
-*
-*    Permission is hereby granted, free of charge, to any person obtaining a
-*    copy of this software and associated documentation files (the "Software"),
-*    to deal in the Software without restriction, including without limitation
-*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
-*    and/or sell copies of the Software, and to permit persons to whom the
-*    Software is furnished to do so, subject to the following conditions:
-*
-*    The above copyright notice and this permission notice shall be included in
-*    all copies or substantial portions of the Software.
-*
-*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-*    DEALINGS IN THE SOFTWARE.
-*
-*****************************************************************************/
 /*****Auto generated header file, Please DO NOT modify manually!*****/
 #ifndef _VSI_NN_FEATURE_CONFIG_H
 #define _VSI_NN_FEATURE_CONFIG_H
@@ -42,5 +19,6 @@
 #if defined(VX_TENSORVIEW_ON_ANY_DIM) && VX_TENSORVIEW_ON_ANY_DIM
 #define VSI_CONCAT_ENHANCE_SUPPORT
 #endif
+#define VSI_CREATE_TENSOR_FROM_VIEW_SUPPORT
 #endif

View File

@@ -361,6 +361,27 @@ OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensorFromHandle
     uint8_t * data
     );
+/**
+ * Add a new tensor from view
+ * Create a new tensor from a view and add it to graph.
+ *
+ * @param[in] graph Graph handle.
+ * @param[in] id Required, the id of the parent tensor on which to create the view.
+ * @param[in] start The start coordinates for each dim, 0-based non-negative integers.
+ *            NULL means copy from the idx 0 of each dim.
+ * @param[in] end The end coordinates for each dim, 0-based non-negative integers.
+ *            NULL means copy to the end of each dim. For a given idx, end[idx]
+ *            should be greater than start[idx].
+ * @return The new tensor id on success, or VSI_NN_TENSOR_ID_NA otherwise.
+ */
+OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensorFromView
+    (
+    vsi_nn_graph_t* graph,
+    vsi_nn_tensor_id_t id,
+    vsi_size_t* start,
+    vsi_size_t* end
+    );
 /**
  * Attach tensor to graph
  * Attach an exist tensor to graph.
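A minimal usage sketch (assumed context: `graph` already holds a 4x4x4 parent tensor registered as `parent_id`; the only stated contract on the extents is end[idx] > start[idx]):
    vsi_size_t start[3] = { 0, 0, 1 };   /* begin of the view in each dim */
    vsi_size_t end[3]   = { 4, 4, 3 };   /* view covers channels 1..2 of the parent */
    vsi_nn_tensor_id_t view_id = vsi_nn_AddTensorFromView( graph, parent_id, start, end );
    if( VSI_NN_TENSOR_ID_NA == view_id )
    {
        VSILOGE( "Create tensor from view fail" );
    }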

View File

@@ -206,8 +206,8 @@
 #include "ops/vsi_nn_op_maxunpool.h"
 #include "ops/vsi_nn_op_reversesequence.h"
 #include "ops/vsi_nn_op_grid_sample.h"
-#include "ops/vsi_nn_op_bilinear_grid_sample.h"
 #include "ops/vsi_nn_op_lpnorm.h"
+#include "ops/vsi_nn_op_resize_3d.h"
 /* custom node head define define */
 #include "custom/vsi_nn_custom_node_type.h"
 #include "ops/vsi_nn_op_inverse_sigmoid.h"
@@ -402,8 +402,8 @@ typedef union _vsi_nn_nn_param
     vsi_nn_reversesequence_param reversesequence;
     vsi_nn_inverse_sigmoid_param inverse_sigmoid;
     vsi_nn_grid_sample_param gridsample;
-    vsi_nn_bilinear_grid_sample_param bilinear_grid_sample;
     vsi_nn_lpnorm_param lpnorm;
+    vsi_nn_resize_3d_param resize_3d;
     void* client_param;
     /* custom node data struct define */

View File

@@ -48,6 +48,7 @@ typedef enum
     VSI_NN_PREPROCESS_IMAGE_RESIZE_BILINEAR,
     VSI_NN_PREPROCESS_IMAGE_RESIZE_NEAREST,
     VSI_NN_PREPROCESS_DTYPE_CONVERT,
+    VSI_NN_PREPROCESS_MEANS_AND_SCALES,
 } vsi_nn_preprocess_type_e;
 /**
@@ -150,8 +151,25 @@ typedef struct
     float scale;
 }vsi_nn_process_mean_and_scale_t;
+/**
+ * Process means and scales parameter structure
+ */
+typedef struct
+{
+    /** Mean value for each channel */
+    float* channel_mean;
+    /** Channel length */
+    int32_t channel_len;
+    /** Scale values */
+    float* scale;
+    /** Scale length */
+    int32_t scale_len;
+}vsi_nn_process_means_and_scales_t;
 typedef vsi_nn_process_mean_and_scale_t vsi_nn_preprocess_mean_and_scale_t;
+typedef vsi_nn_process_means_and_scales_t vsi_nn_preprocess_means_and_scales_t;
 typedef vsi_nn_process_mean_and_scale_t vsi_nn_postprocess_mean_and_scale_t;
+typedef vsi_nn_process_means_and_scales_t vsi_nn_postprocess_means_and_scales_t;
 /**
  * Process permute parameter structure

View File

@@ -154,7 +154,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_transpose_time_major
     vsi_bool use_virtual_tensor
     );
-void vsi_nn_rnn_split_input_tensor
+vsi_status vsi_nn_rnn_split_input_tensor
     (
     vsi_nn_node_t * self,
     vsi_nn_tensor_t * input,
@@ -163,7 +163,7 @@ void vsi_nn_rnn_split_input_tensor
     vsi_bool use_virtual_tensor
     );
-void vsi_nn_rnn_data_check_aligned
+vsi_status vsi_nn_rnn_data_check_aligned
     (
     vsi_nn_node_t * self,
     vsi_nn_tensor_t ** input,

View File

@@ -82,6 +82,10 @@ typedef enum
     VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC = 0x4,
     /** affine perchannel asymmetric */
     VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC = 0x5,
+    /** float8 */
+    VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 = 0x6,
+    /** perchannel float8 */
+    VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 = 0x7,
     /** undefined type */
     VSI_NN_QNT_TYPE_NA = 0xff,
 } vsi_nn_qnt_type_e;

View File

@@ -734,13 +734,15 @@ vsi_status vsi_nn_copy_tensor_veiw_patch
 /**
  * OVXLIB internal tensor util api
  * A wrapper api for OpenVX vxCopyTensorPatch
- * Allows the application to copy whole tensor patch from/into an tensor object.
+ * Allows the application to copy a partial or whole tensor patch from/into a tensor object.
  *
  * @param[in] tensor OpenVX Tensor handle.
  * @param[in] attr OVXLIB Tensor attr.
 * @param[in] user_ptr The address of the memory location where to store the requested data.
 * @param[in] usage This declares the effect of the copy with regard to the tensor object
 *            support VX_READ_ONLY or VX_WRITE_ONLY
+ * @param[in] start The start coordinates for each dim. NULL means copy from the idx 0 of each dim.
+ * @param[in] end The end coordinates for each dim. NULL means copy to the end of each dim.
  * @return VSI_SUCCESS on success, or error code otherwise.
  */
 vsi_status vsi_nn_copy_tensor_patch
@@ -748,7 +750,9 @@ vsi_status vsi_nn_copy_tensor_patch
     vx_tensor tensor,
     vsi_nn_tensor_attr_t *attr,
     void * user_ptr,
-    vsi_enum usage
+    vsi_enum usage,
+    vsi_size_t* start,
+    vsi_size_t* end
     );
 /**
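With the extended signature a partial read becomes possible; a sketch under assumed shapes (a 4x4 FP32 tensor, reading the first two rows):
    vsi_size_t start[2] = { 0, 0 };
    vsi_size_t end[2]   = { 4, 2 };
    float patch[4 * 2];
    status = vsi_nn_copy_tensor_patch( tensor, &attr, patch, VX_READ_ONLY, start, end );
    CHECK_STATUS( status );
    /* passing NULL for both start and end keeps the old whole-tensor behavior */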

View File

@@ -31,26 +31,26 @@
 extern "C"{
 #endif
-#define TEST_CHECK_TENSOR_ID( id, lbl ) do {\
+#define TEST_CHECK_TENSOR_ID( id, lbl ) {\
     if( VSI_NN_TENSOR_ID_NA == id ) {\
         VSILOGE("CHECK TENSOR ID %d", __LINE__);\
         goto lbl;\
     }\
-    } while(0)
+    }
-#define TEST_CHECK_PTR( ptr, lbl ) do {\
+#define TEST_CHECK_PTR( ptr, lbl ) {\
     if( NULL == ptr ) {\
         VSILOGE("CHECK PTR %d", __LINE__);\
         goto lbl;\
     }\
-    } while(0)
+    }
-#define TEST_CHECK_STATUS( stat, lbl ) do {\
+#define TEST_CHECK_STATUS( stat, lbl ) {\
     if( VSI_SUCCESS != stat ) {\
         VSILOGE("CHECK STATUS(%d:%s)", (stat), vsi_nn_DescribeStatus(stat));\
         goto lbl;\
     }\
-    } while(0)
+    }
 #if defined(__cplusplus)
 }

View File

@@ -191,6 +191,16 @@ typedef enum
     VSI_NN_TYPE_BFLOAT16 = VX_TYPE_BFLOAT16,
 #else
     VSI_NN_TYPE_BFLOAT16 = 0x81A,
 #endif
+#ifdef VSI_NN_TYPE_FLOAT8_E4M3_SUPPORT
+    VSI_NN_TYPE_FLOAT8_E4M3 = VX_TYPE_FLOAT8_E4M3,
+#else
+    VSI_NN_TYPE_FLOAT8_E4M3 = 0X81E,
+#endif
+#ifdef VSI_NN_TYPE_FLOAT8_E5M2_SUPPORT
+    VSI_NN_TYPE_FLOAT8_E5M2 = VX_TYPE_FLOAT8_E5M2,
+#else
+    VSI_NN_TYPE_FLOAT8_E5M2 = 0X81F,
+#endif
     VSI_NN_TYPE_VDATA = VX_TYPE_USER_STRUCT_START + 0x1,
@@ -268,6 +278,11 @@ typedef enum _vsi_nn_roi_align_type_e
     VSI_NN_ROI_ALIGN
 } vsi_nn_roi_align_type_e;
+typedef enum _vsi_nn_custom_warp_affine_type_e {
+    VSI_NN_WARP_AFFINE_TYPE_NONE = 0,
+    VSI_NN_WARP_AFFINE_TYPE_RGB
+} vsi_nn_custom_warp_affine_type_e;
 /** Deprecated */
 typedef uint32_t vsi_nn_size_t;

View File

@@ -33,7 +33,7 @@ extern "C"{
 #define VSI_NN_VERSION_MAJOR 1
 #define VSI_NN_VERSION_MINOR 1
-#define VSI_NN_VERSION_PATCH 74
+#define VSI_NN_VERSION_PATCH 84
 #define VSI_NN_VERSION \
     (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
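For this release the packed value works out to 1 * 10000 + 1 * 100 + 84 = 10184.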

View File

@@ -0,0 +1,578 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "vsi_nn_internal_node.h"
#include "utils/vsi_nn_constraint_check.h"
typedef struct _custom_tiny_yolov4_postprocess_local_data_t {
vx_int32 begin_dims[6][VSI_NN_MAX_DIM_NUM];
vx_int32 end_dims[6][VSI_NN_MAX_DIM_NUM];
vx_int32 stride_dims[VSI_NN_MAX_DIM_NUM];
} custom_tiny_yolov4_postprocess_local_data_t;
/*
Declare number of input and output.
*/
#define _INPUT_NUM (4)
#define _OUTPUT_NUM (2)
static vsi_nn_internal_tensor_t *_create_internal_tensor
(
vsi_nn_node_t * self,
vsi_nn_tensor_t * input
)
{
vsi_nn_tensor_attr_t attr;
vsi_nn_internal_tensor_t * tensor = NULL;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
memcpy( &attr.dtype, &input->attr.dtype, sizeof( attr.dtype ) );
attr.dim_num = VSI_NN_DIM_AUTO;
attr.vtl = TRUE;
attr.is_const = FALSE;
tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
return tensor;
} /* _create_internal_tensor() */
static vsi_nn_internal_tensor_t *_create_sigmoid_internal_tensor
(
vsi_nn_node_t * self,
vsi_nn_tensor_t * input
)
{
vsi_nn_tensor_attr_t attr;
vsi_nn_internal_tensor_t * tensor = NULL;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
memcpy( &attr.dtype, &input->attr.dtype, sizeof( attr.dtype ) );
if (attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ||
attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC)
{
attr.dtype.scale = 0.00390625;
attr.dtype.zero_point = 0;
}
attr.dim_num = VSI_NN_DIM_AUTO;
attr.vtl = TRUE;
attr.is_const = FALSE;
tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
return tensor;
} /* _create_sigmoid_internal_tensor() */
static vsi_nn_internal_tensor_t *_create_output_internal_tensor
(
vsi_nn_node_t * self,
vsi_nn_tensor_t * output
)
{
vsi_nn_tensor_attr_t attr;
vsi_nn_internal_tensor_t * tensor = NULL;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
memcpy( &attr.dtype, &output->attr.dtype, sizeof( attr.dtype ) );
attr.dim_num = VSI_NN_DIM_AUTO;
attr.vtl = TRUE;
attr.is_const = FALSE;
tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
return tensor;
} /* _create_output_internal_tensor() */
static vsi_nn_internal_tensor_t *_create_strided_slice_op
(
vsi_nn_node_t * self,
vsi_nn_tensor_t * input,
int32_t begin_mask,
int32_t end_mask,
int32_t index
)
{
vsi_nn_custom_tiny_yolov4_postprocess_param * p = NULL;
vsi_nn_internal_tensor_t * tensor = NULL;
vsi_nn_internal_node_t* curr = NULL;
p = (vsi_nn_custom_tiny_yolov4_postprocess_param *)&(self->nn_param.custom_tiny_yolov4_postprocess);
tensor = _create_internal_tensor(self, input);
CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 0, 0 );
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->node->nn_param.strided_slice.begin_dims = p->local->begin_dims[index];
curr->node->nn_param.strided_slice.begin_dims_num = input->attr.dim_num;
curr->node->nn_param.strided_slice.end_dims = p->local->end_dims[index];
curr->node->nn_param.strided_slice.end_dims_num = input->attr.dim_num;
curr->node->nn_param.strided_slice.stride_dims = p->local->stride_dims;
curr->node->nn_param.strided_slice.stride_dims_num = input->attr.dim_num;
curr->node->nn_param.strided_slice.begin_mask = begin_mask;
curr->node->nn_param.strided_slice.end_mask = end_mask;
curr->node->nn_param.strided_slice.shrink_axis_mask = 0;
curr->node->nn_param.strided_slice.new_axis_mask = 0;
curr->inputs[0] = input;
curr->outputs[0] = tensor->t;
vsi_nn_internal_setup_node( self, curr );
final:
return tensor;
} /* _create_strided_slice_op() */
static vsi_nn_internal_tensor_t *_create_sigmoid_op
(
vsi_nn_node_t * self,
vsi_nn_tensor_t * input
)
{
vsi_nn_internal_tensor_t * tensor = NULL;
vsi_nn_internal_node_t* curr = NULL;
tensor = _create_sigmoid_internal_tensor(self, input);
CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SIGMOID, 0, 0 );
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = input;
curr->outputs[0] = tensor->t;
vsi_nn_internal_setup_node( self, curr );
final:
return tensor;
} /* _create_sigmoid_op() */
static vsi_nn_internal_tensor_t *_create_confidence_op
(
vsi_nn_node_t * self,
vsi_nn_tensor_t * input,
vsi_nn_tensor_t * output
)
{
vsi_nn_internal_tensor_t * tensor = NULL;
vsi_nn_internal_node_t* curr = NULL;
tensor = _create_output_internal_tensor(self, output);
CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE, 0, 0 );
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = input;
curr->outputs[0] = tensor->t;
vsi_nn_internal_setup_node( self, curr );
final:
return tensor;
} /* _create_confidence_op() */
static vsi_nn_internal_tensor_t *_create_box_op
(
vsi_nn_node_t * self,
vsi_nn_tensor_t * input0,
vsi_nn_tensor_t * input1,
vsi_nn_tensor_t * output,
float bias0,
float bias1
)
{
vsi_nn_internal_tensor_t * tensor = NULL;
vsi_nn_internal_node_t* curr = NULL;
tensor = _create_output_internal_tensor(self, output);
CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX, 0, 0 );
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = input0;
curr->inputs[1] = input1;
curr->outputs[0] = tensor->t;
curr->node->nn_param.custom_tiny_yolov4_postprocess_box.bias_0 = bias0;
curr->node->nn_param.custom_tiny_yolov4_postprocess_box.bias_1 = bias1;
vsi_nn_internal_setup_node( self, curr );
final:
return tensor;
} /* _create_box_op() */
static vsi_nn_internal_tensor_t *_create_reshape_op
(
vsi_nn_node_t * self,
vsi_nn_tensor_t * input,
vsi_nn_tensor_t * output,
vsi_size_t width
)
{
vsi_nn_internal_tensor_t * tensor = NULL;
vsi_nn_internal_node_t* curr = NULL;
vsi_size_t shape_1[] = { 1, (vsi_size_t)-1, 1 };
shape_1[0] = width;
tensor = _create_output_internal_tensor(self, output);
CHECK_PTR_FAIL_GOTO( tensor, "Create internal tensor fail.", final );
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 );
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = input;
curr->outputs[0] = tensor->t;
curr->node->nn_param.reshape2.size = shape_1;
curr->node->nn_param.reshape2.dim_num = 3;
vsi_nn_internal_setup_node( self, curr );
final:
return tensor;
} /* _create_reshape_op() */
static vsi_bool _create_concat_op
(
vsi_nn_node_t * self,
vsi_nn_tensor_t * input0,
vsi_nn_tensor_t * input1,
vsi_nn_tensor_t * input2,
vsi_nn_tensor_t * input3,
vsi_nn_tensor_t * input4,
vsi_nn_tensor_t * input5,
vsi_nn_tensor_t * output
)
{
vsi_nn_internal_node_t* curr = NULL;
vsi_bool ret = FALSE;
curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, 6, 1 );
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
curr->inputs[0] = input0;
curr->inputs[1] = input1;
curr->inputs[2] = input2;
curr->inputs[3] = input3;
curr->inputs[4] = input4;
curr->inputs[5] = input5;
curr->outputs[0] = output;
curr->node->nn_param.concat.axis = 1;
ret = vsi_nn_internal_setup_node( self, curr );
final:
return ret;
} /* _create_concat_op() */
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
BEGIN_IO_TYPE_DECL(CUSTOM_TINY_YOLOV4_POSTPROCESS, 4, 2)
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM)
END_IO_TYPE_DECL(CUSTOM_TINY_YOLOV4_POSTPROCESS)
if (!VALIDATE_OP_IO_TYPES(CUSTOM_TINY_YOLOV4_POSTPROCESS, self, inputs,
self->input.num, outputs, self->output.num))
{
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
destroy_op_io_types_desc(desc);
return FALSE;
}
return TRUE;
} /* op_check() */
static vsi_status op_optimize
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
vsi_nn_opt_direction_e direction
)
{
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
}
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_bool ret = FALSE;
vsi_nn_internal_tensor_t * tensor0[12] = {NULL};
vsi_nn_internal_tensor_t * tensor1[12] = {NULL};
int32_t index_0 = 1;
int32_t index_1 = 0;
int32_t index_2 = 3;
int32_t index_3 = 2;
vsi_nn_internal_init_node_wksp( self );
/**confidence**/
/**input 0 chunk 0**/
/*
sub0:26x26x255 --> 26x26x81, begin: [0, 0, 4, 0] end: [0, 0, 85, 0] stride: [1, 1, 1, 1]
sub1[26, 26, 80] = sigmoid(sub0)[26, 26, 0:0] * sigmoid(sub0)[26, 26, 1:81]
sub2[80, 26, 26] = transpose(sub1)
sub3[80, 676] = reshape(sub2)
*/
tensor0[0] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 0);
CHECK_PTR_FAIL_GOTO( tensor0[0], "Create internal tensor fail.", final );
tensor0[1] = _create_sigmoid_op(self, tensor0[0]->t);
CHECK_PTR_FAIL_GOTO( tensor0[1], "Create internal tensor fail.", final );
tensor0[2] = _create_confidence_op(self, tensor0[1]->t, outputs[0]);
CHECK_PTR_FAIL_GOTO( tensor0[2], "Create internal tensor fail.", final );
tensor0[3] = _create_reshape_op(self, tensor0[2]->t, outputs[0], 80);
CHECK_PTR_FAIL_GOTO( tensor0[3], "Create internal tensor fail.", final );
/**chunk 1**/
/*
26x26x255 --> 26x26x81, begin: [0, 0, 89, 0] end: [0, 0, 170, 0] stride: [1, 1, 1, 1]
*/
tensor0[4] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 1);
CHECK_PTR_FAIL_GOTO( tensor0[4], "Create internal tensor fail.", final );
tensor0[5] = _create_sigmoid_op(self, tensor0[4]->t);
CHECK_PTR_FAIL_GOTO( tensor0[5], "Create internal tensor fail.", final );
tensor0[6] = _create_confidence_op(self, tensor0[5]->t, outputs[0]);
CHECK_PTR_FAIL_GOTO( tensor0[6], "Create internal tensor fail.", final );
tensor0[7] = _create_reshape_op(self, tensor0[6]->t, outputs[0], 80);
CHECK_PTR_FAIL_GOTO( tensor0[7], "Create internal tensor fail.", final );
/**chunk 2**/
/*
26x26x255 --> 26x26x81, begin: [0, 0, 174, 0] end: [0, 0, 255, 0] stride: [1, 1, 1, 1]
*/
tensor0[8] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 2);
CHECK_PTR_FAIL_GOTO( tensor0[8], "Create internal tensor fail.", final );
tensor0[9] = _create_sigmoid_op(self, tensor0[8]->t);
CHECK_PTR_FAIL_GOTO( tensor0[9], "Create internal tensor fail.", final );
tensor0[10] = _create_confidence_op(self, tensor0[9]->t, outputs[0]);
CHECK_PTR_FAIL_GOTO( tensor0[10], "Create internal tensor fail.", final );
tensor0[11] = _create_reshape_op(self, tensor0[10]->t, outputs[0], 80);
CHECK_PTR_FAIL_GOTO( tensor0[11], "Create internal tensor fail.", final );
/**input 1 chunk 0**/
/*
sub0:13x13x255 --> 13x13x81, begin: [0, 0, 4, 0] end: [0, 0, 85, 0] stride: [1, 1, 1, 1]
sub1[13, 13, 80] = sigmoid(sub0)[13, 13, 0:0] * sigmoid(sub0)[13, 13, 1:81]
sub2[80, 13, 13] = transpose(sub1)
sub3[80, 169] = reshape(sub2)
*/
tensor1[0] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 0);
CHECK_PTR_FAIL_GOTO( tensor1[0], "Create internal tensor fail.", final );
tensor1[1] = _create_sigmoid_op(self, tensor1[0]->t);
CHECK_PTR_FAIL_GOTO( tensor1[1], "Create internal tensor fail.", final );
tensor1[2] = _create_confidence_op(self, tensor1[1]->t, outputs[0]);
CHECK_PTR_FAIL_GOTO( tensor1[2], "Create internal tensor fail.", final );
tensor1[3] = _create_reshape_op(self, tensor1[2]->t, outputs[0], 80);
CHECK_PTR_FAIL_GOTO( tensor1[3], "Create internal tensor fail.", final );
/**chunk 1**/
/*
13x13x255 --> 13x13x81, begin: [0, 0, 89, 0] end: [0, 0, 170, 0] stride: [1, 1, 1, 1]
*/
tensor1[4] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 1);
CHECK_PTR_FAIL_GOTO( tensor1[4], "Create internal tensor fail.", final );
tensor1[5] = _create_sigmoid_op(self, tensor1[4]->t);
CHECK_PTR_FAIL_GOTO( tensor1[5], "Create internal tensor fail.", final );
tensor1[6] = _create_confidence_op(self, tensor1[5]->t, outputs[0]);
CHECK_PTR_FAIL_GOTO( tensor1[6], "Create internal tensor fail.", final );
tensor1[7] = _create_reshape_op(self, tensor1[6]->t, outputs[0], 80);
CHECK_PTR_FAIL_GOTO( tensor1[7], "Create internal tensor fail.", final );
/**chunk 2**/
/*
13x13x255 --> 13x13x81, begin: [0, 0, 174, 0] end: [0, 0, 255, 0] stride: [1, 1, 1, 1]
*/
tensor1[8] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 2);
CHECK_PTR_FAIL_GOTO( tensor1[8], "Create internal tensor fail.", final );
tensor1[9] = _create_sigmoid_op(self, tensor1[8]->t);
CHECK_PTR_FAIL_GOTO( tensor1[9], "Create internal tensor fail.", final );
tensor1[10] = _create_confidence_op(self, tensor1[9]->t, outputs[0]);
CHECK_PTR_FAIL_GOTO( tensor1[10], "Create internal tensor fail.", final );
tensor1[11] = _create_reshape_op(self, tensor1[10]->t, outputs[0], 80);
CHECK_PTR_FAIL_GOTO( tensor1[11], "Create internal tensor fail.", final );
ret = _create_concat_op(self, tensor0[3]->t, tensor0[7]->t, tensor0[11]->t,
tensor1[3]->t, tensor1[7]->t, tensor1[11]->t, outputs[0]);
if (ret == FALSE)
{
VSILOGE("Create concat operation fail");
goto final;
}
ret = FALSE;
/**box**/
/*
26x26x255 --> 26x26x4, begin: [0, 0, 0, 0] end: [0, 0, 4, 0] stride: [1, 1, 1, 1]
*/
tensor0[0] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 3);
CHECK_PTR_FAIL_GOTO( tensor0[0], "Create internal tensor fail.", final );
tensor0[1] = _create_box_op(self, tensor0[0]->t, inputs[index_2], outputs[1], 23, 27);
CHECK_PTR_FAIL_GOTO( tensor0[1], "Create internal tensor fail.", final );
tensor0[2] = _create_reshape_op(self, tensor0[1]->t, outputs[1], 4);
CHECK_PTR_FAIL_GOTO( tensor0[2], "Create internal tensor fail.", final );
/*
26x26x255 --> 26x26x4, begin: [0, 0, 85, 0] end: [0, 0, 89, 0] stride: [1, 1, 1, 1]
*/
tensor0[3] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 4);
CHECK_PTR_FAIL_GOTO( tensor0[3], "Create internal tensor fail.", final );
tensor0[4] = _create_box_op(self, tensor0[3]->t, inputs[index_2], outputs[1], 37, 58);
CHECK_PTR_FAIL_GOTO( tensor0[4], "Create internal tensor fail.", final );
tensor0[5] = _create_reshape_op(self, tensor0[4]->t, outputs[1], 4);
CHECK_PTR_FAIL_GOTO( tensor0[5], "Create internal tensor fail.", final );
/*
26x26x255 --> 26x26x4, begin: [0, 0, 170, 0] end: [0, 0, 174, 0] stride: [1, 1, 1, 1]
*/
tensor0[6] = _create_strided_slice_op(self, inputs[index_0], 11, 11, 5);
CHECK_PTR_FAIL_GOTO( tensor0[6], "Create internal tensor fail.", final );
tensor0[7] = _create_box_op(self, tensor0[6]->t, inputs[index_2], outputs[1], 81, 82);
CHECK_PTR_FAIL_GOTO( tensor0[7], "Create internal tensor fail.", final );
tensor0[8] = _create_reshape_op(self, tensor0[7]->t, outputs[1], 4);
CHECK_PTR_FAIL_GOTO( tensor0[8], "Create internal tensor fail.", final );
/*
13x13x255 --> 13x13x4, begin: [0, 0, 0, 0] end: [0, 0, 4, 0] stride: [1, 1, 1, 1]
*/
tensor1[0] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 3);
CHECK_PTR_FAIL_GOTO( tensor1[0], "Create internal tensor fail.", final );
tensor1[1] = _create_box_op(self, tensor1[0]->t, inputs[index_3], outputs[1], 81, 82);
CHECK_PTR_FAIL_GOTO( tensor1[1], "Create internal tensor fail.", final );
tensor1[2] = _create_reshape_op(self, tensor1[1]->t, outputs[1], 4);
CHECK_PTR_FAIL_GOTO( tensor1[2], "Create internal tensor fail.", final );
/*
13x13x255 --> 13x13x4, begin: [0, 0, 85, 0] end: [0, 0, 89, 0] stride: [1, 1, 1, 1]
*/
tensor1[3] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 4);
CHECK_PTR_FAIL_GOTO( tensor1[3], "Create internal tensor fail.", final );
tensor1[4] = _create_box_op(self, tensor1[3]->t, inputs[index_3], outputs[1], 135, 169);
CHECK_PTR_FAIL_GOTO( tensor1[4], "Create internal tensor fail.", final );
tensor1[5] = _create_reshape_op(self, tensor1[4]->t, outputs[1], 4);
CHECK_PTR_FAIL_GOTO( tensor1[5], "Create internal tensor fail.", final );
/*
13x13x255 --> 13x13x4, begin: [0, 0, 170, 0] end: [0, 0, 174, 0] stride: [1, 1, 1, 1]
*/
tensor1[6] = _create_strided_slice_op(self, inputs[index_1], 11, 11, 5);
CHECK_PTR_FAIL_GOTO( tensor1[6], "Create internal tensor fail.", final );
tensor1[7] = _create_box_op(self, tensor1[6]->t, inputs[index_3], outputs[1], 344, 319);
CHECK_PTR_FAIL_GOTO( tensor1[7], "Create internal tensor fail.", final );
tensor1[8] = _create_reshape_op(self, tensor1[7]->t, outputs[1], 4);
CHECK_PTR_FAIL_GOTO( tensor1[8], "Create internal tensor fail.", final );
ret = _create_concat_op(self, tensor0[2]->t, tensor0[5]->t, tensor0[8]->t,
tensor1[2]->t, tensor1[5]->t, tensor1[8]->t, outputs[1]);
if (ret == FALSE)
{
VSILOGE("Create concat operation fail");
goto final;
}
final:
return ret;
} /* op_setup() */
static vsi_status op_init
(
vsi_nn_node_t* self
)
{
int32_t i = 0;
vsi_nn_custom_tiny_yolov4_postprocess_param *p = &self->nn_param.custom_tiny_yolov4_postprocess;
p->local = \
(custom_tiny_yolov4_postprocess_local_data_t*)malloc(sizeof(custom_tiny_yolov4_postprocess_local_data_t));
CHECK_PTR_FAIL_GOTO(p->local, "create buffer fail", final);
memset(p->local, 0, sizeof(custom_tiny_yolov4_postprocess_local_data_t));
for ( i = 0; i < VSI_NN_MAX_DIM_NUM; i++ )
{
p->local->stride_dims[i] = 1;
}
p->local->begin_dims[0][2] = 4;
p->local->end_dims[0][2] = 85;
p->local->begin_dims[1][2] = 89;
p->local->end_dims[1][2] = 170;
p->local->begin_dims[2][2] = 174;
p->local->end_dims[2][2] = 255;
p->local->begin_dims[3][2] = 0;
p->local->end_dims[3][2] = 4;
p->local->begin_dims[4][2] = 85;
p->local->end_dims[4][2] = 89;
p->local->begin_dims[5][2] = 170;
p->local->end_dims[5][2] = 174;
final:
return VSI_SUCCESS;
} /* op_init() */
static vsi_status op_deinit
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
status = vsi_nn_op_common_deinit(self);
vsi_nn_safe_free(self->nn_param.custom_tiny_yolov4_postprocess.local);
vsi_nn_internal_deinit_node_wksp( self );
return status;
} /* op_deinit() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ CUSTOM_TINY_YOLOV4_POSTPROCESS,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ op_optimize,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS

View File

@@ -35,9 +35,9 @@
 #include "utils/vsi_nn_util.h"
 #include "kernel/vsi_nn_kernel.h"
-typedef struct _bilinear_grid_sample_local_data_t {
+typedef struct _custom_tiny_yolov4_postprocess_box_local_data_t {
     int32_t placeholder;
-} bilinear_grid_sample_local_data_t;
+} custom_tiny_yolov4_postprocess_box_local_data_t;
 /*
 Declare number of input and output.
@@ -53,27 +53,25 @@ static vsi_status op_compute
     )
 {
     vsi_status status = VSI_FAILURE;
-    vsi_nn_kernel_param_t* param = NULL;
-    int32_t align_corners = self->nn_param.bilinear_grid_sample.align_corners;
-    vsi_nn_kernel_node_t n;
+    vsi_nn_kernel_param_t * param = NULL;
+    float bias_0 = self->nn_param.custom_tiny_yolov4_postprocess_box.bias_0;
+    float bias_1 = self->nn_param.custom_tiny_yolov4_postprocess_box.bias_1;
     param = vsi_nn_kernel_param_create();
-    vsi_nn_kernel_param_add_int32(param, "align_corners", align_corners);
-    n = vsi_nn_kernel_selector(
-        self->graph, "bilinear_grid_sample", inputs, 2, outputs, 1, param);
-    if (n == NULL) {
-        vsi_nn_kernel_param_release(&param);
-        status = VSI_FAILURE;
-        return status;
-    }
-    self->n = (vx_node)n;
-    vsi_nn_kernel_param_release(&param);
-    if (self->n) {
+    vsi_nn_kernel_param_add_float32( param, "bias_0", bias_0 );
+    vsi_nn_kernel_param_add_float32( param, "bias_1", bias_1 );
+    self->n = vsi_nn_kernel_selector( self->graph, "tiny_yolov4_postprocess_box",
+        inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param );
+    if ( self->n )
+    {
         status = VSI_SUCCESS;
     }
+    vsi_nn_kernel_param_release( &param );
     return status;
 } /* op_compute() */
@@ -85,6 +83,9 @@ static vsi_bool op_check
     )
 {
     /*TODO: Check tensor shapes. */
+    VSI_UNREFERENCED(self);
+    VSI_UNREFERENCED(inputs);
+    VSI_UNREFERENCED(outputs);
     return TRUE;
 } /* op_check() */
@@ -95,61 +96,36 @@ static vsi_bool op_setup
     vsi_nn_tensor_t ** outputs
     )
 {
-    if (NULL == self) {
-        return FALSE;
-    }
-    if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) {
-        outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
-        outputs[0]->attr.size[0] = inputs[1]->attr.size[1];
-        outputs[0]->attr.size[1] = inputs[1]->attr.size[2];
-        outputs[0]->attr.size[2] = inputs[0]->attr.size[2];
-        if (4 == inputs[0]->attr.dim_num) {
-            outputs[0]->attr.size[3] = inputs[0]->attr.size[3];
-        }
-    }
-    return TRUE;
+    uint32_t rank = inputs[0]->attr.dim_num;
+    vsi_bool ret = TRUE;
+    VSI_UNREFERENCED(self);
+    if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+    {
+        outputs[0]->attr.dim_num = rank;
+        outputs[0]->attr.size[0] = inputs[0]->attr.size[2];
+        outputs[0]->attr.size[1] = inputs[0]->attr.size[0];
+        outputs[0]->attr.size[2] = inputs[0]->attr.size[1];
+        if (rank > 3)
+        {
+            memcpy( &outputs[0]->attr.size[3], &inputs[0]->attr.size[3], (rank - 3) * sizeof(vsi_size_t) );
+        }
+    }
+    return ret;
 } /* op_setup() */
-static vsi_status op_init
-    (
-    vsi_nn_node_t* self
-    )
-{
-    /* TODO
-    //self->nn_param.bilinear_grid_sample.local = \
-    //    (bilinear_grid_sample_local_data_t*)malloc(sizeof(bilinear_grid_sample_local_data_t));
-    */
-    return VSI_SUCCESS;
-} /* op_init() */
-static vsi_status op_deinit
-    (
-    vsi_nn_node_t* self
-    )
-{
-    vsi_status status = VSI_SUCCESS;
-    status = vsi_nn_op_common_deinit(self);
-    /* TODO
-    //vsi_nn_safe_free(self->nn_param.bilinear_grid_sample.local);
-    */
-    return status;
-} /* op_deinit() */
 __BEGIN_DECLS
 /* Registrar */
 DEF_OP_REG
     (
-    /* op_name    */ BILINEAR_GRID_SAMPLE,
-    /* init       */ op_init,
+    /* op_name    */ CUSTOM_TINY_YOLOV4_POSTPROCESS_BOX,
+    /* init       */ NULL,
     /* compute    */ op_compute,
-    /* deinit     */ op_deinit,
+    /* deinit     */ vsi_nn_op_common_deinit,
     /* check      */ op_check,
     /* setup      */ op_setup,
     /* optimize   */ NULL,

View File

@@ -0,0 +1,127 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
typedef struct _tiny_yolov4_postprocess_confidence_local_data_t {
int32_t placeholder;
} tiny_yolov4_postprocess_confidence_local_data_t;
/*
Declare number of input and output.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_FAILURE;
self->n = vsi_nn_kernel_selector( self->graph, "tiny_yolov4_postprocess_confidence",
inputs, 1, outputs, 1, NULL );
if ( self->n )
{
status = VSI_SUCCESS;
}
return status;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
/*TODO: Check tensor shapes. */
VSI_UNREFERENCED(self);
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
uint32_t rank = inputs[0]->attr.dim_num;
vsi_bool ret = TRUE;
VSI_UNREFERENCED(self);
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = rank;
outputs[0]->attr.size[0] = inputs[0]->attr.size[2] - 1;
outputs[0]->attr.size[1] = inputs[0]->attr.size[0];
outputs[0]->attr.size[2] = inputs[0]->attr.size[1];
if (rank > 3)
{
memcpy( &outputs[0]->attr.size[3], &inputs[0]->attr.size[3], (rank - 3) * sizeof(vsi_size_t) );
}
}
return ret;
} /* op_setup() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ CUSTOM_TINY_YOLOV4_POSTPROCESS_CONFIDENCE,
/* init */ NULL,
/* compute */ op_compute,
/* deinit */ vsi_nn_op_common_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS

View File

@@ -54,20 +54,26 @@ DEF_KERNEL_EXECUTOR(_softmax_compute)
     size_t param_size
     )
 {
-    vsi_status status = VX_SUCCESS;
+    vsi_status status = VSI_FAILURE;
     float *buffer[_CPU_IO_NUM] = {NULL};
     vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = {NULL};
     vsi_nn_kernel_tensor_attr_t *attr[_CPU_IO_NUM] = {NULL};
     uint32_t i = 0, out_elements = 0;
     int32_t axis;
+    VSI_UNREFERENCED(node);
+    VSI_UNREFERENCED(param_size);
     tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; // input0
     tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; // input1
     tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; // output
     attr[0] = vsi_nn_kernel_tensor_attr_create(tensors[0]);
+    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
     attr[1] = vsi_nn_kernel_tensor_attr_create(tensors[1]);
+    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
     attr[2] = vsi_nn_kernel_tensor_attr_create(tensors[2]);
+    CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
     status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis);
     CHECK_STATUS_FAIL_GOTO(status, final );
@@ -133,6 +139,8 @@ static vsi_status _query_kernel
     vsi_nn_kernel_t* kernel
     )
 {
+    VSI_UNREFERENCED(inputs);
+    VSI_UNREFERENCED(outputs);
     memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
     return VSI_SUCCESS;
 }
@@ -153,6 +161,9 @@ static vsi_nn_kernel_node_t _setup
     vsi_nn_kernel_node_t node = NULL;
     int32_t axis = 0;
+    VSI_UNREFERENCED(input_num);
+    VSI_UNREFERENCED(output_num);
     axis = vsi_nn_kernel_param_get_int32(params, "axis");
     status = _query_kernel(inputs, outputs, kernel);
     if(status != VSI_SUCCESS)

View File

@@ -54,7 +54,7 @@ DEF_KERNEL_EXECUTOR(_softmax_exec)
     size_t param_size
     )
 {
-    vsi_status status = VX_SUCCESS;
+    vsi_status status = VSI_FAILURE;
     float* buffer[_CPU_IO_NUM] = { NULL };
     vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
     vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL };
@@ -64,11 +64,16 @@ DEF_KERNEL_EXECUTOR(_softmax_exec)
     float fMax = 0.0;
     float fProbSum = 0.0f;
+    VSI_UNREFERENCED(node);
+    VSI_UNREFERENCED(param_size);
     tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
     tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
     attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
+    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
     attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
+    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
     status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &sf_axis);
     CHECK_STATUS_FAIL_GOTO(status, final );
@@ -141,6 +146,8 @@ static vsi_status _query_kernel
     vsi_nn_kernel_t* kernel
     )
 {
+    VSI_UNREFERENCED(inputs);
+    VSI_UNREFERENCED(outputs);
     memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
     return VSI_SUCCESS;
 }
@@ -161,6 +168,9 @@ static vsi_nn_kernel_node_t _setup
     vsi_nn_kernel_node_t node = NULL;
     int32_t axis = 0;
+    VSI_UNREFERENCED(input_num);
+    VSI_UNREFERENCED(output_num);
     axis = vsi_nn_kernel_param_get_int32(params, "axis");
     status = _query_kernel( inputs, outputs, kernel );

View File

@@ -62,6 +62,7 @@ static vx_param_description_t _custom_warp_affine_kernel_param_def[] =
     {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
     {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
     {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
     // Add kernel parameters here
 };
 #define _CUSTOM_WARP_AFFINE_PARAM_NUM  _cnt_of_array( _custom_warp_affine_kernel_param_def )
@@ -97,7 +98,7 @@ static vsi_bool _read_pixel
     if (out_of_bounds)
     {
-        *pixel = 205.0f;
+        *pixel = 0.0f;
         return TRUE;
     }
@@ -125,6 +126,7 @@ DEF_KERNEL_EXECUTOR(_compute)
     vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
     vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL };
     int32_t type = 0;
+    int32_t rgb_type = 0;
     float matrix[6] = {0};
     vsi_size_t i = 0;
     vsi_size_t b = 0;
@@ -135,11 +137,16 @@ DEF_KERNEL_EXECUTOR(_compute)
     vsi_size_t height = 0;
     vsi_size_t outer_size = 1;
+    VSI_UNREFERENCED(node);
+    VSI_UNREFERENCED(param_size);
     tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
     tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
     attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
+    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
     attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
+    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
     out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
@@ -153,6 +160,7 @@ DEF_KERNEL_EXECUTOR(_compute)
     status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_TYPE],
             &type);
+    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &rgb_type);
     CHECK_STATUS_FAIL_GOTO(status, final );
     for (i = 0; i < 6; i++)
     {
@@ -172,34 +180,95 @@ DEF_KERNEL_EXECUTOR(_compute)
     {
         float *src_base = buffer[0] + b * attr[0]->shape->data[0] * attr[0]->shape->data[1];
         float *dst_base = buffer[1] + b * width * height;
-        for (y = 0; y < height; y++)
-        {
-            for (x = 0; x < width; x++)
-            {
-                float xf = 0;
-                float yf = 0;
-                float dst = 0;
-                _transform_affine(x, y, matrix, &xf, &yf);
-                if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR)
-                {
-                    _read_pixel(src_base, attr[0], xf, yf, &dst);
-                    dst_base[y * width + x] = dst;
-                }
-                else
-                {
-                    float tl = 0, tr = 0, bl = 0, br = 0;
-                    float ar = xf - floorf(xf);
-                    float ab = yf - floorf(yf);
-                    float al = 1.0f - ar;
-                    float at = 1.0f - ab;
-                    _read_pixel(src_base, attr[0], floorf(xf), floorf(yf), &tl);
-                    _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf), &tr);
-                    _read_pixel(src_base, attr[0], floorf(xf), floorf(yf) + 1, &bl);
-                    _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf) + 1, &br);
-                    dst_base[y * width + x] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab;
-                }
-            }
-        }
+        if ( rgb_type == VSI_NN_WARP_AFFINE_TYPE_RGB )
+        {
+            width = width / 3;
+            for (y = 0; y < height; y++)
+            {
+                for (x = 0; x < width; x++)
+                {
+                    float xf = 0;
+                    float yf = 0;
+                    float dst = 0;
+                    _transform_affine(x, y, matrix, &xf, &yf);
+                    if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR)
+                    {
+                        _read_pixel(src_base, attr[0], 3 * floorf(xf), floorf(yf), &dst);
+                        dst_base[y * 3 * width + 3 * x] = dst;
+                        _read_pixel(src_base, attr[0], 3 * floorf(xf) + 1, floorf(yf), &dst);
+                        dst_base[y * 3 * width + 3 * x + 1] = dst;
+                        _read_pixel(src_base, attr[0], 3 * floorf(xf) + 2, floorf(yf), &dst);
+                        dst_base[y * 3 * width + 3 * x + 2] = dst;
+                    }
+                    else
+                    {
+                        float tl = 0, tr = 0, bl = 0, br = 0;
+                        float ar = xf - floorf(xf);
+                        float ab = yf - floorf(yf);
+                        float al = 1.0f - ar;
+                        float at = 1.0f - ab;
+                        _read_pixel(src_base, attr[0], 3 * floorf(xf), floorf(yf), &tl);
+                        _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1), floorf(yf), &tr);
+                        _read_pixel(src_base, attr[0], 3 * floorf(xf), floorf(yf) + 1, &bl);
+                        _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1), floorf(yf) + 1, &br);
+                        dst_base[y * 3 * width + 3 * x] =
+                            tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab;
+                        _read_pixel(src_base, attr[0], 3 * floorf(xf) + 1, floorf(yf), &tl);
+                        _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1) + 1, floorf(yf), &tr);
+                        _read_pixel(src_base, attr[0], 3 * floorf(xf) + 1, floorf(yf) + 1, &bl);
+                        _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1) + 1, floorf(yf) + 1, &br);
+                        dst_base[y * 3 * width + 3 * x + 1] =
+                            tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab;
+                        _read_pixel(src_base, attr[0], 3 * floorf(xf) + 2, floorf(yf), &tl);
+                        _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1) + 2, floorf(yf), &tr);
+                        _read_pixel(src_base, attr[0], 3 * floorf(xf) + 2, floorf(yf) + 1, &bl);
+                        _read_pixel(src_base, attr[0], 3 * (floorf(xf) + 1) + 2, floorf(yf) + 1, &br);
+                        dst_base[y * 3 * width + 3 * x + 2] =
+                            tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab;
+                    }
+                }
+            }
+        }
+        else
+        {
+            for (y = 0; y < height; y++)
+            {
+                for (x = 0; x < width; x++)
+                {
+                    float xf = 0;
+                    float yf = 0;
+                    float dst = 0;
+                    _transform_affine(x, y, matrix, &xf, &yf);
+                    if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR)
+                    {
+                        _read_pixel(src_base, attr[0], xf, yf, &dst);
+                        dst_base[y * width + x] = dst;
+                    }
+                    else
+                    {
+                        float tl = 0, tr = 0, bl = 0, br = 0;
+                        float ar = xf - floorf(xf);
+                        float ab = yf - floorf(yf);
+                        float al = 1.0f - ar;
+                        float at = 1.0f - ab;
+                        _read_pixel(src_base, attr[0], floorf(xf), floorf(yf), &tl);
+                        _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf), &tr);
+                        _read_pixel(src_base, attr[0], floorf(xf), floorf(yf) + 1, &bl);
+                        _read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf) + 1, &br);
+                        dst_base[y * width + x] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab;
+                    }
+                }
+            }
+        }
     }
@@ -233,6 +302,8 @@ static vsi_status _query_kernel
     )
 {
     vsi_status status = VSI_FAILURE;
+    VSI_UNREFERENCED(inputs);
+    VSI_UNREFERENCED(outputs);
     snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
     kernel->info.function    = _compute;
     kernel->info.parameters  = _custom_warp_affine_kernel_param_def;
@@ -260,6 +331,7 @@ static vsi_nn_kernel_node_t _setup
     size_t i = 0;
     size_t buffer_size = 0;
     int32_t type = vsi_nn_kernel_param_get_int32( params, "type");
+    int32_t rgb_type = vsi_nn_kernel_param_get_int32( params, "rgb_type");
     float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size );
     status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
@@ -278,6 +350,8 @@ static vsi_nn_kernel_node_t _setup
             node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create(
                     graph, F32, &buffer[i] );
         }
+        node_params[9] = vsi_nn_kernel_scalar_create(
+                graph, I32, &rgb_type );
         /* Pass parameters to node. */
         status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM );
@@ -286,6 +360,7 @@ static vsi_nn_kernel_node_t _setup
         {
             vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] );
         }
+        vsi_nn_kernel_scalar_release( &node_params[9] );
     }
 }
 return node;

View File

@@ -95,7 +95,7 @@ static vsi_bool _read_pixel
     )
 {
     vsi_size_t width = attr->shape->data[0];
-    vsi_size_t height = attr->shape->data[1];
+    vsi_size_t height = attr->shape->size > 1 ? attr->shape->data[1] : 1;
     vsi_bool out_of_bounds = (x < 0 || y < 0 || x >= width || y >= height);
     vsi_size_t bx = 0, by = 0;
@@ -139,11 +139,16 @@ DEF_KERNEL_EXECUTOR(_compute)
     vsi_size_t height = 0;
     vsi_size_t outer_size = 1;
+    VSI_UNREFERENCED(node);
+    VSI_UNREFERENCED(param_size);
     tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
     tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
     attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
+    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
     attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
+    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
     out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
@@ -237,6 +242,8 @@ static vsi_status _query_kernel
     )
 {
     vsi_status status = VSI_FAILURE;
+    VSI_UNREFERENCED(inputs);
+    VSI_UNREFERENCED(outputs);
     snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
     kernel->info.function    = _compute;
     kernel->info.parameters  = _custom_warp_perspective_kernel_param_def;

View File

@@ -73,6 +73,8 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer)
         {0, 0, 0},  // local_size: local group size in thread
         {0, 0, 0}}; // global_size: image size in thread
+    VSI_UNREFERENCED(param_size);
     attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
     if (!attr)
     {
@@ -144,6 +146,8 @@ static vsi_status _query_kernel
     vsi_nn_kernel_t* kernel
     )
 {
+    VSI_UNREFERENCED(inputs);
+    VSI_UNREFERENCED(outputs);
     memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
     vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
@@ -170,6 +174,9 @@ static vsi_nn_kernel_node_t _setup
     vsi_nn_kernel_node_t node = NULL;
     int32_t axis = 0;
+    VSI_UNREFERENCED(input_num);
+    VSI_UNREFERENCED(output_num);
     axis = vsi_nn_kernel_param_get_int32(params, "axis");
     status = _query_kernel( inputs, outputs, kernel );

View File

@ -0,0 +1,357 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum
{
INTERNAL_KERNEL_TINY_YOLOV4_POSTPROCESS_BOX,
} _internal_kernel_e;
#define _SOURCE "tiny_yolov4_postprocess_box"
#define _KERNEL_NAME CVIVANTE_NAMESPACE("evis.tiny_yolov4_postprocess_box_U8_U8toU8")
// Add kernel hashtable here
#define TINY_YOLOV4_POSTPROCESS_BOX_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
(( IN0_DTYPE ) | ( IN1_DTYPE << 8 ) | ( OUT_DTYPE << 16 ))
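/* Each dtype enum lands in its own byte of the 32-bit key, so a kernel-map
 * lookup reduces to one integer compare. Worked example, assuming the
 * hypothetical enum value U8 == 2 (see vsi_nn_kernel_dtype_e for the real
 * values): TINY_YOLOV4_POSTPROCESS_BOX_HASH_KEY( U8, U8, U8 )
 *   == 0x02 | (0x02 << 8) | (0x02 << 16) == 0x00020202. */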
#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ TINY_YOLOV4_POSTPROCESS_BOX_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), \
_KERNEL_NAME, _SOURCE }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _tiny_yolov4_postprocess_box_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( U8, U8, U8 ),
};
/*
* Kernel params
*/
static vx_param_description_t _tiny_yolov4_postprocess_box_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _TINY_YOLOV4_POSTPROCESS_BOX_PARAM_NUM _cnt_of_array( _tiny_yolov4_postprocess_box_kernel_param_def )
#define SCALAR_BIAS_0_VALUE (3)
#define SCALAR_BIAS_1_VALUE (4)
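/* Parameter layout mirrors _tiny_yolov4_postprocess_box_kernel_param_def:
 * indices 0..2 are the two input tensors and the output tensor, and
 * indices 3..4 carry the bias_0 / bias_1 anchor scalars created in _setup(). */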
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_tiny_yolov4_postprocess_box_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
float CONST2 = 16.0f;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
// Add initializer
gpu_param.dim = 2;
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_size[0] = gpu_align_p2(
(attr[0]->shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 8);
gpu_param.global_size[1] = 1;
if (attr[0]->shape->data[0] == 13 * 13)
{
CONST2 = 32.0f;
}
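/* CONST2 is presumably the feature-map stride: with a 416x416 input the
 * 26x26 head has stride 416/26 = 16 and the 13x13 head 416/13 = 32. */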
if (attr[0]->dtype == U8 && attr[1]->dtype == U8 && attr[2]->dtype == U8)
{
float input0_scale = attr[0]->scale;
float input0_tail = 0 - (float)attr[0]->zero_point * input0_scale;
float input1_scale = attr[1]->scale;
float input1_tail = 0 - (float)attr[1]->zero_point * input1_scale;
float output_scale = 1.0f / attr[2]->scale;
float output_zp = (float)attr[2]->zero_point;
gpu_dp_inst_t uniExtract8Data_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniDatatoFloat32_0_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniDatatoFloat32_1_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniDataTranspose_0_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x0c080400, 0x0d090501, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniDataTranspose_1_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x0e0a0602, 0x0f0b0703, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniDatatoFloat32_0_4x4", &uniDatatoFloat32_0_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniDatatoFloat32_1_4x4", &uniDatatoFloat32_1_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Data_2x8", &uniExtract8Data_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniDataTranspose_0_2x8", &uniDataTranspose_0_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniDataTranspose_1_2x8", &uniDataTranspose_1_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "input0_scale", &input0_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "input0_tail", &input0_tail);
status |= vsi_nn_kernel_gpu_add_param( node, "input1_scale", &input1_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "input1_tail", &input1_tail);
status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp);
status |= vsi_nn_kernel_gpu_add_param( node, "CONST2", &CONST2);
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
}
if (attr[2])
{
vsi_nn_kernel_tensor_attr_release( &attr[2] );
}
return status;
} /* _tiny_yolov4_postprocess_box_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in0_dtype;
vsi_nn_kernel_dtype_e in1_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _tiny_yolov4_postprocess_box_kernel_map;
size_t kernel_map_size = _cnt_of_array( _tiny_yolov4_postprocess_box_kernel_map );
vx_param_description_t * param_def = _tiny_yolov4_postprocess_box_kernel_param_def;
vx_kernel_initialize_f initializer = _tiny_yolov4_postprocess_box_initializer;
uint32_t key;
uint32_t i;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = TINY_YOLOV4_POSTPROCESS_BOX_HASH_KEY( in0_dtype, in1_dtype, out_dtype );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _tiny_yolov4_postprocess_box_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_TINY_YOLOV4_POSTPROCESS_BOX_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
vsi_size_t shape[3][VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
float bias_0 = vsi_nn_kernel_param_get_float32( params, "bias_0" );
float bias_1 = vsi_nn_kernel_param_get_float32( params, "bias_1" );
memcpy(shape[0], inputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
shape[0][0] = shape[0][0] * shape[0][1];
shape[0][1] = shape[0][2];
shape[0][2] = 1;
memcpy(shape[1], inputs[1]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
shape[1][0] = shape[1][0] * shape[1][1];
shape[1][1] = shape[1][2];
shape[1][2] = 1;
memcpy(shape[2], outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
shape[2][0] = shape[2][0];
shape[2][1] = shape[2][2] * shape[2][1];
shape[2][2] = 1;
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shape[0], inputs[0]->attr.dim_num );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
inputs[1], shape[1], inputs[1]->attr.dim_num );
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[0], shape[2], outputs[0]->attr.dim_num );
if ( !vsi_nn_kernel_gpu_check_shape(
reshape_tensors[0]->attr.size, reshape_tensors[0]->attr.dim_num ) )
{
vsi_safe_release_tensor( reshape_tensors[0] );
vsi_safe_release_tensor( reshape_tensors[1] );
vsi_safe_release_tensor( reshape_tensors[2] );
return NULL;
}
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _TINY_YOLOV4_POSTPROCESS_BOX_PARAM_NUM,
reshape_tensors, input_num, &reshape_tensors[2], output_num );
/* Pass parameters to node. */
node_params[SCALAR_BIAS_0_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &bias_0 );
node_params[SCALAR_BIAS_1_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &bias_1 );
status = vsi_nn_kernel_node_pass_param( node, node_params, _TINY_YOLOV4_POSTPROCESS_BOX_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_BIAS_0_VALUE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_BIAS_1_VALUE] );
}
}
vsi_safe_release_tensor( reshape_tensors[0] );
vsi_safe_release_tensor( reshape_tensors[1] );
vsi_safe_release_tensor( reshape_tensors[2] );
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( tiny_yolov4_postprocess_box, _setup )
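A minimal sketch of how an op's compute hook might hand work to this backend,
patterned on the custom_warp_affine op_compute later in this commit. The
"bias_0"/"bias_1" keys match _setup() above; vsi_nn_kernel_param_add_float32
and the exact vsi_nn_kernel_selector signature are assumed by symmetry with
the calls visible in this diff, so treat this as illustrative wiring, not the
op's actual code:

vsi_nn_kernel_param_t * param = vsi_nn_kernel_param_create();
/* hypothetical anchor sizes for one yolo head */
vsi_nn_kernel_param_add_float32( param, "bias_0", 23.0f );
vsi_nn_kernel_param_add_float32( param, "bias_1", 27.0f );
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
    "tiny_yolov4_postprocess_box",
    inputs, 2, outputs, 1, param );
vsi_nn_kernel_param_release( &param );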

View File

@ -0,0 +1,320 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum
{
INTERNAL_KERNEL_TINY_YOLOV4_POSTPROCESS_CONFIDENCE,
} _internal_kernel_e;
#define _SOURCE "tiny_yolov4_postprocess_confidence"
#define _KERNEL_NAME CVIVANTE_NAMESPACE("evis.tiny_yolov4_postprocess_conf_U8toU8")
// Add kernel hashtable here
#define _CONFIDENCE_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
(( IN_DTYPE << 8 ) | ( OUT_DTYPE ))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ _CONFIDENCE_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
_KERNEL_NAME, _SOURCE }
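/* Note the byte order differs from the box kernel's key (OUT_DTYPE sits in
 * the low byte here); a key only has to be unique within this file's map. */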
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _tiny_yolov4_postprocess_confidence_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( U8, U8 ),
};
/*
* Kernel params
*/
static vx_param_description_t _tiny_yolov4_postprocess_confidence_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM \
_cnt_of_array( _tiny_yolov4_postprocess_confidence_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_tiny_yolov4_postprocess_confidence_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
gpu_param.dim = 2;
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 4;
gpu_param.global_size[0] = gpu_align_p2(
(attr[0]->shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (
(attr[1]->shape->data[0] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
if (attr[0]->dtype == U8 && attr[1]->dtype == U8)
{
float output_scale = attr[0]->scale * attr[0]->scale / attr[1]->scale;
int output_zp = attr[1]->zero_point;
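/* The kernel multiplies two values read from the same U8 input
 * (uniU8TimesU8 below), so the raw product carries scale * scale;
 * dividing by the output scale folds the whole requantization into a
 * single fixed-point multiplier. */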
uint16_t M0 = 0;
int32_t postShift = 0;
int32_t i = 0;
gpu_dp_inst_t uniU8TimesU8_0_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x01010101, // BSelt
0x00010000, 0x00030002, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniU16TimesMultiplier_PostShift_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniU8PlusU8_trans_0_2x8 = {{
0xffffffff, // TCfg
0x44444444, // ASelt
0x0c080400, 0x0d090501, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00007400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniU8PlusU8_trans_1_2x8 = {{
0xffffffff, // TCfg
0x44444444, // ASelt
0x0e0a0602, 0x0f0b0703, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00007400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_quantize_multiplier_16bit((double)output_scale, &M0, &postShift);
uniU16TimesMultiplier_PostShift_2x8.data[7] |= (postShift & 0x1F);
for ( i = 8; i < 16; i++ )
{
uniU16TimesMultiplier_PostShift_2x8.data[i] = M0;
}
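/* gpu_quantize_multiplier_16bit() factors the float rescale into a 16-bit
 * mantissa and a right shift, output_scale ~= M0 * 2^-postShift, so the DP
 * unit can requantize in integer math: the shift lands in the low 5 bits of
 * data[7] (the PostShift field) and M0 is broadcast into the constant slots
 * data[8..15]. */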
status = vsi_nn_kernel_gpu_add_param( node, "uniU8TimesU8_0_4x4", &uniU8TimesU8_0_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniU16TimesMultiplier_PostShift_2x8",
&uniU16TimesMultiplier_PostShift_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniU8PlusU8_trans_0_2x8", &uniU8PlusU8_trans_0_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniU8PlusU8_trans_1_2x8", &uniU8PlusU8_trans_1_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp);
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
}
return status;
} /* _tiny_yolov4_postprocess_confidence_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _tiny_yolov4_postprocess_confidence_kernel_map;
size_t kernel_map_size = _cnt_of_array( _tiny_yolov4_postprocess_confidence_kernel_map );
vx_param_description_t * param_def = _tiny_yolov4_postprocess_confidence_kernel_param_def;
vx_kernel_initialize_f initializer = _tiny_yolov4_postprocess_confidence_initializer;
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = _CONFIDENCE_HASH_KEY( in_dtype, out_dtype );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _tiny_yolov4_postprocess_confidence_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = { 0 };
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
VSI_UNREFERENCED(params);
memcpy(shape[0], inputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
shape[0][0] = shape[0][0] * shape[0][1];
shape[0][1] = shape[0][2];
shape[0][2] = 1;
memcpy(shape[1], outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
shape[1][0] = shape[1][0];
shape[1][1] = shape[1][2] * shape[1][1];
shape[1][2] = 1;
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shape[0], inputs[0]->attr.dim_num );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shape[1], outputs[0]->attr.dim_num );
if ( !vsi_nn_kernel_gpu_check_shape(
reshape_tensors[0]->attr.size, reshape_tensors[0]->attr.dim_num ) )
{
vsi_safe_release_tensor( reshape_tensors[0] );
vsi_safe_release_tensor( reshape_tensors[1] );
return NULL;
}
status = _query_kernel( kernel, inputs, outputs );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM,
reshape_tensors, input_num, &reshape_tensors[1], output_num );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params,
_TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM );
}
}
vsi_safe_release_tensor(reshape_tensors[0]);
vsi_safe_release_tensor(reshape_tensors[1]);
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( tiny_yolov4_postprocess_confidence, _setup )

View File

@ -50,18 +50,27 @@ typedef enum _custom_warp_affine_type_e
}custom_warp_affine_type_e; }custom_warp_affine_type_e;
#define _CUSTOM_WARP_AFFINE_KERNEL_SOURCE "custom_warp_affine" #define _CUSTOM_WARP_AFFINE_KERNEL_SOURCE "custom_warp_affine"
#define _CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE "custom_warp_affine_rgb"
// Add kernel hashtable here // Add kernel hashtable here
#define CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D ) \ #define CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D, RGB_TYPE ) \
(( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20)) (( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20) | (RGB_TYPE << 24))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ #define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
{ CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0 ), \ { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 0 ), \
CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE), \ CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE), \
_CUSTOM_WARP_AFFINE_KERNEL_SOURCE } _CUSTOM_WARP_AFFINE_KERNEL_SOURCE }
#define PACK_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \ #define PACK_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
{ CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1 ), \ { CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 0 ), \
CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_2D"), \ CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_2D"), \
_CUSTOM_WARP_AFFINE_KERNEL_SOURCE } _CUSTOM_WARP_AFFINE_KERNEL_SOURCE }
#define PACK_RGB_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
{ CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0, 1 ), \
CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb"), \
_CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE }
#define PACK_RGB_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
{ CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1, 1 ), \
CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_rgb_2D"), \
_CUSTOM_WARP_AFFINE_RGB_KERNEL_SOURCE }
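/* Resulting key layout: bits 0..7 IN_DTYPE, 8..15 OUT_DTYPE, 16..19 TYPE,
 * bit 20 IMG_2D, bit 24 the new RGB_TYPE flag. */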
typedef struct typedef struct
{ {
@ -78,6 +87,12 @@ static const _kernel_map_type _custom_warp_affine_kernel_map[] =
PACK_2D_KERNEL_MAP( U8, U8, nearest_neighbor ), PACK_2D_KERNEL_MAP( U8, U8, nearest_neighbor ),
PACK_2D_KERNEL_MAP( U8, U8, bilinear ), PACK_2D_KERNEL_MAP( U8, U8, bilinear ),
PACK_RGB_KERNEL_MAP( U8, U8, nearest_neighbor ),
PACK_RGB_KERNEL_MAP( U8, U8, bilinear ),
PACK_RGB_2D_KERNEL_MAP( U8, U8, nearest_neighbor ),
PACK_RGB_2D_KERNEL_MAP( U8, U8, bilinear ),
}; };
/* /*
@ -124,6 +139,8 @@ DEF_KERNEL_INITIALIZER(_custom_warp_affine_initializer)
float matrix4[4] = {0}; float matrix4[4] = {0};
int32_t i = 0; int32_t i = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@ -178,7 +195,81 @@ final:
return status; return status;
} /* _custom_warp_affine_initializer() */ } /* _custom_warp_affine_initializer() */
DEF_KERNEL_INITIALIZER(_custom_warp_affine_rgb_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
vsi_size_array_t * out_shape = NULL;
float m[6] = {0};
float matrix0[4] = {0};
float matrix1[4] = {0};
int32_t i = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
for (i = 0; i < 6; i++)
{
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i],
&m[i]);
CHECK_STATUS_FAIL_GOTO(status, final );
}
matrix0[0] = m[0]; matrix0[1] = m[1]; matrix0[2] = m[2]; matrix0[3] = m[3];
matrix1[0] = m[4]; matrix1[1] = m[5];
out_shape = attr[1]->shape;
gpu_param.global_scale[0] = 2;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = (
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ (3 * gpu_param.global_scale[0]));
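/* output width is counted in interleaved R,G,B components, so each
 * work-item covers global_scale[0] pixels times 3 channels; hence the
 * extra factor of 3 in the divisor. */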
gpu_param.global_size[1] = (
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
status = vsi_nn_kernel_gpu_add_param( node,
"matrix0", &matrix0 );
status |= vsi_nn_kernel_gpu_add_param( node,
"matrix1", &matrix1 );
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _custom_warp_affine_rgb_initializer() */
/* /*
* Query kernel * Query kernel
@ -188,7 +279,8 @@ static vsi_status _query_kernel
vsi_nn_kernel_t * kernel, vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs, vsi_nn_tensor_t * const * const outputs,
int32_t type int32_t type,
int32_t rgb_type
) )
{ {
vsi_status status = VSI_FAILURE; vsi_status status = VSI_FAILURE;
@ -205,8 +297,11 @@ static vsi_status _query_kernel
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = CUSTOM_WARP_AFFINE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img ); key = CUSTOM_WARP_AFFINE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img, rgb_type );
if (rgb_type == 1)
{
initializer = _custom_warp_affine_rgb_initializer;
}
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{ {
if ( kernel_map[i].key == key ) if ( kernel_map[i].key == key )
@ -251,6 +346,7 @@ static vsi_nn_kernel_node_t _setup
size_t i = 0; size_t i = 0;
size_t buffer_size = 0; size_t buffer_size = 0;
int32_t type = vsi_nn_kernel_param_get_int32( params, "type"); int32_t type = vsi_nn_kernel_param_get_int32( params, "type");
int32_t rgb_type = vsi_nn_kernel_param_get_int32( params, "rgb_type");
float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size ); float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size );
if (vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) if (vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
@ -258,7 +354,7 @@ static vsi_nn_kernel_node_t _setup
return NULL; return NULL;
} }
status = _query_kernel( kernel, inputs, outputs, type ); status = _query_kernel( kernel, inputs, outputs, type, rgb_type );
if ( VSI_SUCCESS == status) if ( VSI_SUCCESS == status)
{ {
node = vsi_nn_kernel_create_node( graph, kernel ); node = vsi_nn_kernel_create_node( graph, kernel );
@ -282,7 +378,7 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] );
} }
// Set default border mode. // Set default border mode.
border.constant_value.U32 = 0xcdcdcdcd; border.constant_value.U32 = 0x00000000;
status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
CHECK_STATUS(status); CHECK_STATUS(status);
} }

View File

@ -127,6 +127,8 @@ DEF_KERNEL_INITIALIZER(_custom_warp_perspective_initializer)
float matrix4[4] = {0}; float matrix4[4] = {0};
int32_t i = 0; int32_t i = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );

View File

@ -48,6 +48,9 @@ static vsi_status op_compute
{ {
vsi_status status = VSI_SUCCESS; vsi_status status = VSI_SUCCESS;
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
#if defined(VX_DENOISE_POSTPROCESS_SUPPORT) && VX_DENOISE_POSTPROCESS_SUPPORT #if defined(VX_DENOISE_POSTPROCESS_SUPPORT) && VX_DENOISE_POSTPROCESS_SUPPORT
self->n = vxDenoisePostProcesslayer( self->n = vxDenoisePostProcesslayer(
self->graph->g, self->graph->g,
@ -83,6 +86,9 @@ static vsi_bool op_check
vsi_nn_tensor_t ** outputs vsi_nn_tensor_t ** outputs
) )
{ {
VSI_UNREFERENCED(self);
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
return TRUE; return TRUE;
} /* op_check() */ } /* op_check() */
@ -93,6 +99,9 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs vsi_nn_tensor_t ** outputs
) )
{ {
VSI_UNREFERENCED(self);
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
return TRUE; return TRUE;
} /* op_setup() */ } /* op_setup() */
@ -101,6 +110,7 @@ static vsi_status op_init
vsi_nn_node_t* self vsi_nn_node_t* self
) )
{ {
VSI_UNREFERENCED(self);
return VSI_SUCCESS; return VSI_SUCCESS;
} /* op_init() */ } /* op_init() */

View File

@ -63,6 +63,9 @@ static vsi_bool op_check
) )
{ {
/*TODO: Check params. */ /*TODO: Check params. */
VSI_UNREFERENCED(self);
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
return TRUE; return TRUE;
} /* op_check() */ } /* op_check() */
@ -73,6 +76,7 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs vsi_nn_tensor_t ** outputs
) )
{ {
VSI_UNREFERENCED(node);
if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num)
{ {
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;

View File

@ -62,6 +62,9 @@ static vsi_bool op_check
) )
{ {
/*TODO: Check params. */ /*TODO: Check params. */
VSI_UNREFERENCED(self);
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
return TRUE; return TRUE;
} /* op_check() */ } /* op_check() */
@ -72,6 +75,7 @@ static vsi_bool op_setup
vsi_nn_tensor_t ** outputs vsi_nn_tensor_t ** outputs
) )
{ {
VSI_UNREFERENCED(node);
if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{ {
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;

View File

@ -59,6 +59,7 @@ static vsi_status op_compute
param = vsi_nn_kernel_param_create(); param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_const_buffer( param, "matrix", p->matrix, 6 ); vsi_nn_kernel_param_add_const_buffer( param, "matrix", p->matrix, 6 );
vsi_nn_kernel_param_add_int32( param, "type", p->type); vsi_nn_kernel_param_add_int32( param, "type", p->type);
vsi_nn_kernel_param_add_int32( param, "rgb_type", p->rgb_type);
self->n = (vx_node)vsi_nn_kernel_selector( self->graph, self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
"custom_warp_affine", "custom_warp_affine",
@ -78,6 +79,9 @@ static vsi_bool op_check
) )
{ {
/*TODO: Check tensor shapes. */ /*TODO: Check tensor shapes. */
VSI_UNREFERENCED(self);
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
return TRUE; return TRUE;
} /* op_check() */ } /* op_check() */

View File

@ -78,6 +78,9 @@ static vsi_bool op_check
) )
{ {
/*TODO: Check tensor shapes. */ /*TODO: Check tensor shapes. */
VSI_UNREFERENCED(self);
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
return TRUE; return TRUE;
} /* op_check() */ } /* op_check() */

View File

@ -100,7 +100,7 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer)
size_t param_size size_t param_size
) )
{ {
vsi_status status = VX_FAILURE; vsi_status status = VSI_FAILURE;
// Alignment with a power of two value. // Alignment with a power of two value.
gpu_param_t gpu_param = { gpu_param_t gpu_param = {
2, 2,
@ -113,6 +113,8 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer)
vsi_nn_kernel_tensor_attr_t *input0_attr = NULL; vsi_nn_kernel_tensor_attr_t *input0_attr = NULL;
vsi_size_array_t *input_shape = NULL; vsi_size_array_t *input_shape = NULL;
VSI_UNREFERENCED(param_size);
input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0); input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0);
CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
input_shape = input0_attr->shape; input_shape = input0_attr->shape;

View File

@ -143,6 +143,8 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer)
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * out_shape = NULL; vsi_size_array_t * out_shape = NULL;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@ -183,7 +185,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype; vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE; vsi_status status = VSI_FAILURE;
uint32_t key; uint32_t key;
int32_t i; size_t i;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@ -240,6 +242,9 @@ static vsi_nn_kernel_node_t _setup
int32_t axis = 0; int32_t axis = 0;
vsi_size_t axis_size = 0; vsi_size_t axis_size = 0;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
axis = vsi_nn_kernel_param_get_int32(params, "axis"); axis = vsi_nn_kernel_param_get_int32(params, "axis");
if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,

View File

@ -143,6 +143,8 @@ DEF_KERNEL_INITIALIZER(_argmin_initializer)
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * out_shape = NULL; vsi_size_array_t * out_shape = NULL;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@ -183,7 +185,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype; vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE; vsi_status status = VSI_FAILURE;
uint32_t key; uint32_t key;
int32_t i; size_t i;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@ -240,6 +242,9 @@ static vsi_nn_kernel_node_t _setup
int32_t axis = 0; int32_t axis = 0;
size_t axis_size = 0; size_t axis_size = 0;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
axis = vsi_nn_kernel_param_get_int32(params, "axis"); axis = vsi_nn_kernel_param_get_int32(params, "axis");
if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,

View File

@ -129,6 +129,8 @@ DEF_KERNEL_INITIALIZER(_avg_pool3d_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL; vsi_size_array_t *output_shape = NULL;
VSI_UNREFERENCED(param_size);
vxReadScalarValue(depth_out, &depth_out_value); vxReadScalarValue(depth_out, &depth_out_value);
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );

View File

@ -135,6 +135,8 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * in_shape = NULL; vsi_size_array_t * in_shape = NULL;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
@ -170,7 +172,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype; vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE; vsi_status status = VSI_FAILURE;
uint32_t key; uint32_t key;
int i; size_t i;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@ -238,6 +240,9 @@ static vsi_nn_kernel_node_t _setup
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f;
float eps = vsi_nn_kernel_param_get_float32(params, "eps"); float eps = vsi_nn_kernel_param_get_float32(params, "eps");
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
if ( (inputs[1]->attr.is_const && inputs[2]->attr.is_const) if ( (inputs[1]->attr.is_const && inputs[2]->attr.is_const)
|| ( inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16 || ( inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16
&& inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32 ) && inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32 )

View File

@ -128,6 +128,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
vsi_nn_kernel_tensor_attr_t* output_attr = NULL; vsi_nn_kernel_tensor_attr_t* output_attr = NULL;
vsi_size_array_t* out_shape = NULL; vsi_size_array_t* out_shape = NULL;
VSI_UNREFERENCED(param_size);
output_attr = output_attr =
vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]); vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]);
CHECK_PTR_FAIL_GOTO(output_attr, "Create tensor attr buffer fail.", final); CHECK_PTR_FAIL_GOTO(output_attr, "Create tensor attr buffer fail.", final);
@ -140,9 +142,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
gpu_param.dim = 2; gpu_param.dim = 2;
gpu_param.global_size[0] = gpu_param.global_size[0] =
gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1) / (out_shape->data[0] + gpu_param.global_scale[0] - 1) /
gpu_param.global_scale[0], gpu_param.global_scale[0];
4);
gpu_param.global_size[1] = gpu_param.global_size[1] =
((out_shape->data[1] + gpu_param.global_scale[1] - 1) / ((out_shape->data[1] + gpu_param.global_scale[1] - 1) /
gpu_param.global_scale[1]); gpu_param.global_scale[1]);

View File

@ -134,6 +134,8 @@ DEF_KERNEL_INITIALIZER(_bucketize_initializer)
vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL; vsi_size_array_t * out_shape = NULL;
VSI_UNREFERENCED(param_size);
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );

View File

@ -128,6 +128,8 @@ DEF_KERNEL_INITIALIZER(_cast_initializer)
vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL; vsi_size_array_t * out_shape = NULL;
VSI_UNREFERENCED(param_size);
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
@ -251,6 +253,8 @@ static vsi_nn_kernel_node_t _setup
vsi_bool image_2d = FALSE; vsi_bool image_2d = FALSE;
vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_node_t node = NULL;
VSI_UNREFERENCED(params);
if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
inputs[0]->attr.dim_num ) ) inputs[0]->attr.dim_num ) )
{ {

View File

@ -128,6 +128,8 @@ DEF_KERNEL_INITIALIZER(_clip_initializer)
vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL; vsi_size_array_t * out_shape = NULL;
VSI_UNREFERENCED(param_size);
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );

View File

@ -229,6 +229,8 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer)
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
vsi_size_array_t * out_shape = NULL; vsi_size_array_t * out_shape = NULL;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@ -285,7 +287,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype; vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE; vsi_status status = VSI_FAILURE;
uint32_t key; uint32_t key;
int i; size_t i;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
@ -347,6 +349,9 @@ static vsi_nn_kernel_node_t _setup
float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
ret = vsi_nn_kernel_optimize_eltwise_shape( ret = vsi_nn_kernel_optimize_eltwise_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num, inputs[0]->attr.size, inputs[0]->attr.dim_num,
inputs[1]->attr.size, inputs[1]->attr.dim_num, inputs[1]->attr.size, inputs[1]->attr.dim_num,
@ -363,11 +368,11 @@ static vsi_nn_kernel_node_t _setup
outputs[0], shapes[2], new_rank ); outputs[0], shapes[2], new_rank );
#define _swap_tensor(a, b, tmp) \ #define _swap_tensor(a, b, tmp) \
do { \ { \
tmp = a; \ tmp = a; \
a = b; \ a = b; \
b = tmp; \ b = tmp; \
} while(0) }
if (shapes[1][3] > shapes[0][3] && new_rank == 4) if (shapes[1][3] > shapes[0][3] && new_rank == 4)
{ {

View File

@ -135,6 +135,8 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer)
int32_t c = 1; int32_t c = 1;
uint32_t dim = 1; uint32_t dim = 1;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
@ -203,7 +205,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0; uint32_t key = 0;
int i = 0; size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@ -288,11 +290,28 @@ static vsi_nn_kernel_node_t _setup
int32_t width = 0; int32_t width = 0;
int32_t height = 0; int32_t height = 0;
int32_t channel = 1; int32_t channel = 1;
int32_t i = 0; uint32_t i = 0;
vsi_nn_kernel_optimize_softmax_shape( VSI_UNREFERENCED(input_num);
inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, VSI_UNREFERENCED(output_num);
shapes[0], &rs_dim, &axis_new);
if (axis < 0)
{
axis_new = 0;
shapes[0][0] = 1;
shapes[0][1] = 1;
for (i = 0; i < inputs[0]->attr.dim_num; i++)
{
shapes[0][0] *= inputs[0]->attr.size[i];
}
rs_dim = 2;
}
else
{
vsi_nn_kernel_optimize_softmax_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
shapes[0], &rs_dim, &axis_new);
}
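/* A negative axis is treated as "accumulate over the flattened tensor":
 * all dimensions fold into shapes[0][0] and the kernel runs a plain 2D
 * cumsum along axis 0. */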
if (rs_dim > 3) if (rs_dim > 3)
{ {
return NULL; return NULL;

View File

@ -103,6 +103,8 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
int32_t output_height = 0; int32_t output_height = 0;
int32_t output_chn = 0; int32_t output_chn = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
@ -145,7 +147,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0; uint32_t key = 0;
int i = 0; size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@ -195,6 +197,9 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_node_t node = NULL;
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) ) outputs[0]->attr.dim_num ) )
{ {

View File

@ -126,6 +126,9 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer)
vsi_nn_kernel_tensor_attr_t * input_attr = NULL; vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * in_shape = NULL; vsi_size_array_t * in_shape = NULL;
VSI_UNREFERENCED(param_size);
VSI_UNREFERENCED(node);
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
in_shape = input_attr->shape; in_shape = input_attr->shape;

View File

@ -181,6 +181,14 @@ static vsi_nn_kernel_node_t _setup
{ {
vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_node_t node = NULL;
VSI_UNREFERENCED(graph);
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(outputs);
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(params);
VSI_UNREFERENCED(kernel);
return node; return node;
} /* _setup() */ } /* _setup() */

View File

@ -211,6 +211,9 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * out_shape = NULL; vsi_size_array_t * out_shape = NULL;
VSI_UNREFERENCED(node);
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@ -253,7 +256,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype; vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE; vsi_status status = VSI_FAILURE;
uint32_t key; uint32_t key;
int i; size_t i;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@ -327,6 +330,9 @@ static vsi_nn_kernel_node_t _setup
float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" );
float beta = vsi_nn_kernel_param_get_float32( params, "beta" ); float beta = vsi_nn_kernel_param_get_float32( params, "beta" );
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
if (unary_type == UNARY_SELU) if (unary_type == UNARY_SELU)
{ {
alpha = alpha * beta; alpha = alpha * beta;

View File

@ -135,6 +135,9 @@ DEF_KERNEL_INITIALIZER(_erf_initializer)
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_size_array_t * out_shape = NULL; vsi_size_array_t * out_shape = NULL;
VSI_UNREFERENCED(node);
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@ -243,6 +246,10 @@ static vsi_nn_kernel_node_t _setup
float outputScale = vsi_nn_get_tensor_scale(outputs[0]); float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
VSI_UNREFERENCED(params);
ret = vsi_nn_kernel_optimize_element_shape( ret = vsi_nn_kernel_optimize_element_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num, inputs[0]->attr.size, inputs[0]->attr.dim_num,
shape, &new_rank ); shape, &new_rank );

View File

@ -122,11 +122,14 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer)
{0, 0, 0}, {0, 0, 0},
{0, 0, 0} {0, 0, 0}
}; };
vx_status status = VX_FAILURE; vsi_status status = VSI_FAILURE;
vx_tensor output = (vx_tensor)param[2]; vx_tensor output = (vx_tensor)param[2];
vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL; vsi_size_array_t *output_shape = NULL;
VSI_UNREFERENCED(param_size);
VSI_UNREFERENCED(node);
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
@ -258,6 +261,8 @@ static vsi_nn_kernel_node_t _setup
float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]); float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
VSI_UNREFERENCED(params);
outputScale = 1.0f / outputScale; outputScale = 1.0f / outputScale;
input0Tail = -(input0Tail * input0Scale); input0Tail = -(input0Tail * input0Scale);
input1Tail = -(input1Tail * input1Scale); input1Tail = -(input1Tail * input1Scale);

View File

@ -205,6 +205,9 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
size_t input_dims1 = 0; size_t input_dims1 = 0;
size_t i = 0; size_t i = 0;
VSI_UNREFERENCED(node);
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@ -264,7 +267,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0; uint32_t key = 0;
int i = 0; size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@ -334,6 +337,9 @@ static vsi_nn_kernel_node_t _setup
int32_t is_array = block_size >= GPU_TENSOR_MAX_WIDTH ? 1 : 0;
int32_t i = 0;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, batch_dims, 0, &is_array);
status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1, &is_array);
status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, batch_dims, 0, &is_array);

View File

@ -51,18 +51,30 @@ typedef enum
#define STR(a) #a
// Add kernel hashtable here
-#define GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMG_2D ) \
-    (( AXIS ) | ( IN0_DTYPE << 2 ) | ( IN1_DTYPE << 10 ) | ( OUT_DTYPE << 18 ) | ( IMG_2D << 26 ))
+#define GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, IMG_2D, BEYOND_MAXWIDTH ) \
+    (( AXIS ) | ( IN0_DTYPE << 2 ) | ( IN1_DTYPE << 10 ) | ( OUT_DTYPE << 18 ) | ( IMG_2D << 26 ) | \
+    (BEYOND_MAXWIDTH << 28))
#define PACK_KERNEL_3D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
-    { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 ), \
+    { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 , 0), \
    CVIVANTE_NAMESPACE("cl.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \
    _GATHER_ELEMENTS_KERNEL_SOURCE}
#define PACK_KERNEL_2D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
-    { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 ), \
+    { GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 , 0), \
    CVIVANTE_NAMESPACE("cl.gather_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \
    _GATHER_ELEMENTS_KERNEL_SOURCE}
#define PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 , 1), \
CVIVANTE_NAMESPACE("cl.gather_elements_beyond_maxwidth_axis"STR(AXIS)"_"STR(IN0_DTYPE)\
"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \
_GATHER_ELEMENTS_KERNEL_SOURCE}
#define PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ GATHER_ELEMENTS_HASH_KEY( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 , 1), \
CVIVANTE_NAMESPACE("cl.gather_elements_beyond_maxwidth_axis"STR(AXIS)"_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)\
"to"STR(OUT_DTYPE)"_2D"), _GATHER_ELEMENTS_KERNEL_SOURCE}
typedef struct
{
uint32_t key;
@ -89,6 +101,44 @@ static const _kernel_map_type _gather_elements_kernel_map[] =
PACK_KERNEL_2D_MAP( 1, F32, I32, F32 ),
PACK_KERNEL_2D_MAP( 1, I32, I32, I32 ),
PACK_KERNEL_2D_MAP( 1, U32, I32, U32 ),
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, F32, I32, F32),
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, F16, I32, F16 ),
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, I32, I32, I32 ),
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, I16, I32, I16 ),
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, I8, I32, I8 ),
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 0, U8, I32, U8 ),
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, F32, I32, F32),
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, F16, I32, F16 ),
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, I32, I32, I32 ),
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, I16, I32, I16 ),
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, I8, I32, I8 ),
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 1, U8, I32, U8 ),
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, F32, I32, F32),
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, F16, I32, F16 ),
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, I32, I32, I32 ),
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, I16, I32, I16 ),
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, I8, I32, I8 ),
PACK_KERNEL_BEYOND_MAXWIDTH_3D_MAP( 2, U8, I32, U8 ),
PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, F32, I32, F32 ),
PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, F16, I32, F16 ),
PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, I32, I32, I32 ),
PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, I16, I32, I16 ),
PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, I8, I32, I8 ),
PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 0, U8, I32, U8 ),
PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, F32, I32, F32 ),
PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, F16, I32, F16 ),
PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, I32, I32, I32 ),
PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, I16, I32, I16 ),
PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, I8, I32, I8 ),
PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 1, U8, I32, U8 ),
PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, F32, I32, F32 ),
PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, F16, I32, F16 ),
PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, I32, I32, I32 ),
PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, I16, I32, I16 ),
PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, I8, I32, I8 ),
PACK_KERNEL_BEYOND_MAXWIDTH_2D_MAP( 2, U8, I32, U8 ),
};
@ -126,12 +176,38 @@ DEF_KERNEL_INITIALIZER(_gather_elements_initializer)
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * input_attr0 = NULL;
vsi_nn_kernel_tensor_attr_t * input_attr1 = NULL;
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_size_array_t * out_shape = NULL;
uint32_t width0 = 0;
uint32_t height0 = 0;
uint32_t width1 = 0;
uint32_t height1 = 0;
uint32_t width_out = 0;
uint32_t height_out = 0;
uint32_t depth0 = 0;
uint32_t depth1 = 0;
VSI_UNREFERENCED(node);
VSI_UNREFERENCED(param_size);
input_attr0 = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr0, "Create tensor attr buffer fail.", final );
input_attr1 = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( input_attr1, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
width0 = (uint32_t)input_attr0->shape->data[0];
height0 = (uint32_t)input_attr0->shape->data[1];
depth0 = input_attr0->shape->size > 2 ? (uint32_t)input_attr0->shape->data[2] : 1;
width1 = (uint32_t)input_attr1->shape->data[0];
height1 = (uint32_t)input_attr1->shape->data[1];
depth1 = input_attr1->shape->size > 2 ? (uint32_t)input_attr1->shape->data[2] : 1;
width_out = (uint32_t)output_attr->shape->data[0];
height_out = (uint32_t)output_attr->shape->data[1];
out_shape = output_attr->shape;
gpu_param.global_scale[0] = 1;
@ -146,7 +222,25 @@ DEF_KERNEL_INITIALIZER(_gather_elements_initializer)
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
if (width0 >= GPU_TENSOR_MAX_WIDTH ||
width1 >= GPU_TENSOR_MAX_WIDTH ||
height0 >= GPU_TENSOR_MAX_WIDTH ||
height1 >= GPU_TENSOR_MAX_WIDTH ||
depth0 >= GPU_TENSOR_MAX_WIDTH ||
depth1 >= GPU_TENSOR_MAX_WIDTH)
{
gpu_param.global_scale[0] = 1;
gpu_param.global_size[0] = out_shape->data[0];
}
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
status |= vsi_nn_kernel_gpu_add_param( node, "width0", &width0 );
status |= vsi_nn_kernel_gpu_add_param( node, "height0", &height0 );
status |= vsi_nn_kernel_gpu_add_param( node, "width1", &width1 );
status |= vsi_nn_kernel_gpu_add_param( node, "height1", &height1 );
status |= vsi_nn_kernel_gpu_add_param( node, "width_out", &width_out );
status |= vsi_nn_kernel_gpu_add_param( node, "height_out", &height_out );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
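This is the initializer side of the beyond-maxwidth feature: when any extent of either input reaches GPU_TENSOR_MAX_WIDTH, vectorized addressing along X is abandoned (global_scale[0] stays 1 and the global size is taken directly from the output width) and the raw extents are handed to the kernel as uniforms so the CL source can do its own bounds math. The dispatch rule, factored into a checker for clarity; this is a sketch, and the threshold value is assumed rather than taken from the GPU headers:

    #include <stdint.h>

    #ifndef GPU_TENSOR_MAX_WIDTH
    #define GPU_TENSOR_MAX_WIDTH 65536u  /* assumed value, for illustration */
    #endif

    /* Returns nonzero when any width/height/depth of either tensor is too
     * large for the image-width-limited kernels, matching the check in
     * _gather_elements_initializer above. */
    static int _beyond_maxwidth(const uint32_t dims0[3], const uint32_t dims1[3])
    {
        int i;
        for (i = 0; i < 3; i++)
        {
            if (dims0[i] >= GPU_TENSOR_MAX_WIDTH ||
                dims1[i] >= GPU_TENSOR_MAX_WIDTH)
            {
                return 1;
            }
        }
        return 0;
    }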
@ -178,32 +272,52 @@ static vsi_status _query_kernel
int32_t img_2d = (outputs[0]->attr.dim_num < 3 || outputs[0]->attr.size[2] == 1) ? 1 : 0;
uint32_t key = 0;
uint32_t i;
int32_t beyond_maxwidth = 0;
vsi_size_t depth0 = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1;
vsi_size_t depth1 = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (inputs[0]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH ||
inputs[0]->attr.size[1] >= GPU_TENSOR_MAX_WIDTH ||
inputs[1]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH ||
inputs[1]->attr.size[1] >= GPU_TENSOR_MAX_WIDTH ||
depth0 >= GPU_TENSOR_MAX_WIDTH ||
depth1 >= GPU_TENSOR_MAX_WIDTH)
{
beyond_maxwidth = 1;
}
#define _PACK_SELECT_KEY( in0_type, out_type ) \
( ( in0_type ) | ( out_type << 8 ))
-switch (_PACK_SELECT_KEY(in0_dtype, out_dtype))
-{
-case _PACK_SELECT_KEY(F32, F32):
-case _PACK_SELECT_KEY(F16, F16):
-    key = GATHER_ELEMENTS_HASH_KEY( axis, F32, in1_dtype, F32, img_2d );
-    break;
-case _PACK_SELECT_KEY(U32, U32):
-case _PACK_SELECT_KEY(U16, U16):
-case _PACK_SELECT_KEY(U8, U8):
-    key = GATHER_ELEMENTS_HASH_KEY( axis, U32, in1_dtype, U32, img_2d );
-    break;
-case _PACK_SELECT_KEY(I32, I32):
-case _PACK_SELECT_KEY(I16, I16):
-case _PACK_SELECT_KEY(I8, I8):
-    key = GATHER_ELEMENTS_HASH_KEY( axis, I32, in1_dtype, I32, img_2d );
-    break;
-default:
-    break;
-}
+if (beyond_maxwidth == 0)
+{
+    switch (_PACK_SELECT_KEY(in0_dtype, out_dtype))
+    {
+    case _PACK_SELECT_KEY(F32, F32):
+    case _PACK_SELECT_KEY(F16, F16):
+        key = GATHER_ELEMENTS_HASH_KEY( axis, F32, in1_dtype, F32, img_2d, 0 );
+        break;
+    case _PACK_SELECT_KEY(U32, U32):
+    case _PACK_SELECT_KEY(U16, U16):
+    case _PACK_SELECT_KEY(U8, U8):
+        key = GATHER_ELEMENTS_HASH_KEY( axis, U32, in1_dtype, U32, img_2d, 0 );
+        break;
+    case _PACK_SELECT_KEY(I32, I32):
+    case _PACK_SELECT_KEY(I16, I16):
+    case _PACK_SELECT_KEY(I8, I8):
+        key = GATHER_ELEMENTS_HASH_KEY( axis, I32, in1_dtype, I32, img_2d, 0 );
+        break;
+    default:
+        break;
+    }
+}
+else
+{
+    key = GATHER_ELEMENTS_HASH_KEY( axis, in0_dtype, in1_dtype, out_dtype, img_2d, 1 );
+}
#undef _PACK_SELECT_KEY
@ -221,7 +335,8 @@ static vsi_status _query_kernel
kernel->info.numParams = _cnt_of_array( _gather_elements_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
-vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
+vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
+    "eltwise_ops_helper",
    kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,

View File

@ -119,7 +119,7 @@ static vsi_status cal_gather_nd_tensor_reshape_size
uint32_t block_size,
uint32_t coordDim,
int32_t* newDim,
-int32_t batch_dims
+uint32_t batch_dims
)
{
vsi_status status = VSI_FAILURE;
@ -146,17 +146,23 @@ static vsi_status cal_gather_nd_tensor_reshape_size
if (batch_dims)
{
+    int32_t rank = 1;
for (i = 0; i < offset; i++)
{
sizes[0] *= input_size[i];
}
-for (i = 0; i < coordDim; i++)
+for (i = 0; i < coordDim - 1; i++)
{
-    sizes[i + 1] = input_size[i + offset];
+    sizes[rank++] = input_size[i + offset];
}
-newDim[0] = coordDim == 1 ? 2 : 3;
+for (i = 0; i < batch_dims; i++)
+{
+    sizes[rank] *= input_size[dims_num - i - 1];
+}
+newDim[0] = rank + 1;
}
else
{
@ -186,13 +192,27 @@ static vsi_status cal_gather_nd_tensor_reshape_size
}
else // indices&output reshape
{
-if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
+if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH && batch_dims == 0)
{
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
status = VSI_SUCCESS;
newDim[0] = 2;
}
else if (batch_dims > 0)
{
vsi_size_t batch_cnt = 1;
for (i = 0; i < batch_dims; ++i)
{
batch_cnt *= input_size[dims_num - i - 1];
}
sizes[0] = block_size;
sizes[1] = (elementCnt / block_size) / batch_cnt;
sizes[2] = batch_cnt;
status = VSI_SUCCESS;
newDim[0] = 3;
}
}
#undef VSI_NN_MAX_IMAGE_WIDTH
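The batch_dims reshape is easiest to follow with numbers. Using the library's innermost-first shape order, take an indices tensor of shape {2, 100, 8} with block_size = 2 and batch_dims = 1: the 8 outermost entries are batches, so the indices/output side becomes {block_size, rows per batch, batch count}. A self-contained walk-through, with shape values invented for illustration:

    #include <stdio.h>

    int main(void)
    {
        /* innermost-first: {2, 100, 8}, batch_dims = 1, block_size = 2 */
        unsigned input_size[3] = { 2, 100, 8 };
        unsigned dims_num = 3, batch_dims = 1, block_size = 2;
        unsigned elementCnt = 2 * 100 * 8, batch_cnt = 1, i;

        for (i = 0; i < batch_dims; ++i)
        {
            batch_cnt *= input_size[dims_num - i - 1];  /* folds the batches */
        }
        /* prints { 2, 100, 8 }; the initializer later maps batch_cnt
         * (sizes[2]) onto gpu_param.global_size[2]. */
        printf("{ %u, %u, %u }\n", block_size,
               (elementCnt / block_size) / batch_cnt, batch_cnt);
        return 0;
    }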
@ -220,7 +240,11 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
int32_t block_size = 0;
-vsi_ssize_t indices_num = 1;
+vsi_size_t indices_num = 1;
vsi_size_t batch_num = 1;
VSI_UNREFERENCED(node);
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
@ -229,6 +253,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer)
CHECK_STATUS_FAIL_GOTO(status, final );
indices_num = attr[0]->shape->data[1];
batch_num = (attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1);
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
@ -237,7 +262,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer)
gpu_param.global_size[0] = gpu_align_p2((block_size + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = indices_num;
-gpu_param.global_size[2] = 1;
+gpu_param.global_size[2] = batch_num;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
@ -265,7 +290,8 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype = U8;
vsi_nn_kernel_coord_type_e coord_type = _error;
uint32_t key = 0;
-int i = 0;
+int32_t batch_flg = batch_dims > 0 ? 1 : 0;
+size_t i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
@ -301,7 +327,7 @@ static vsi_status _query_kernel
coord_type = _3D;
}
-key = HASH_GATHER_ND_KEY( input0_dtype, I32, output_dtype, coord_type, batch_dims );
+key = HASH_GATHER_ND_KEY( input0_dtype, I32, output_dtype, coord_type, batch_flg );
for ( i = 0; i < _cnt_of_array(gather_nd_map); i ++ )
{
@ -348,6 +374,9 @@ static vsi_nn_kernel_node_t _setup
int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" );
int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0;
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
status = cal_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim, batch_dims);
status |= cal_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim, batch_dims);
status |= cal_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim, batch_dims);

View File

@ -108,6 +108,9 @@ DEF_KERNEL_INITIALIZER(_globallppool_initializer)
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
VSI_UNREFERENCED(node);
VSI_UNREFERENCED(param_size);
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
output_shape = output_attr->shape;

View File

@ -220,6 +220,9 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer)
vsi_ssize_t width = 0;
vsi_ssize_t chn = 0;
VSI_UNREFERENCED(node);
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@ -275,6 +278,9 @@ DEF_KERNEL_INITIALIZER(_groupnorm_mean_vari_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_ssize_t chn = 0;
VSI_UNREFERENCED(node);
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
@ -325,6 +331,9 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer)
vsi_ssize_t chn = 0;
int32_t is2D = 0;
VSI_UNREFERENCED(node);
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
@ -489,6 +498,9 @@ static vsi_nn_kernel_node_t _setup
float rSpaceOrg = 1.0f / (width * height);
float group_ratio = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] * group_size);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{

View File

@ -91,6 +91,9 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer)
)
{
vsi_status status = VSI_FAILURE;
VSI_UNREFERENCED(node);
VSI_UNREFERENCED(param);
VSI_UNREFERENCED(param_size);
// vsi_nn_kernel_tensor_attr * attr[2] = { NULL };
// attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
// attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@ -172,6 +175,8 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_GRUCELL_ACTIVATION_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
VSI_UNREFERENCED(params);
/*
// Check if gpu can support the size
if( !vsi_nn_kernel_gpu_check_shape(

View File

@ -91,6 +91,10 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_sma_initializer)
)
{
vsi_status status = VSI_FAILURE;
VSI_UNREFERENCED(node);
VSI_UNREFERENCED(param);
VSI_UNREFERENCED(param_size);
// vsi_nn_kernel_tensor_attr * attr[2] = { NULL };
// attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
// attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@ -172,6 +176,8 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_GRUCELL_ACTIVATION_SMA_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
VSI_UNREFERENCED(params);
/*
// Check if gpu can support the size
if( !vsi_nn_kernel_gpu_check_shape(

View File

@ -118,6 +118,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer)
vsi_nn_kernel_tensor_t input = NULL;
vsi_nn_kernel_tensor_attr_t* input_attr = NULL;
VSI_UNREFERENCED(param_size);
input = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_Z_H_HSTATE];
input_attr = vsi_nn_kernel_tensor_attr_create( input );

View File

@ -110,6 +110,8 @@ DEF_KERNEL_INITIALIZER(_grucell_h_times_activation_r_initializer)
vsi_nn_kernel_tensor_t output = NULL;
vsi_nn_kernel_tensor_attr_t* output_attr;
VSI_UNREFERENCED(param_size);
output = (vsi_nn_kernel_tensor_t)param[3];
output_attr = vsi_nn_kernel_tensor_attr_create( output );

View File

@ -120,6 +120,8 @@ DEF_KERNEL_INITIALIZER(_grucell_reset_after_activation_initializer)
vsi_nn_kernel_tensor_t input = NULL;
vsi_nn_kernel_tensor_attr_t* input_attr = NULL;
VSI_UNREFERENCED(param_size);
input = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_H_STATE];
input_attr = vsi_nn_kernel_tensor_attr_create( input );

View File

@ -188,6 +188,8 @@ DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer)
vsi_ssize_t height = 0;
vsi_ssize_t chn = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
@ -255,6 +257,8 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
vsi_ssize_t height = 0;
vsi_ssize_t chn = 0;
VSI_UNREFERENCED(param_size);
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
@ -405,6 +409,9 @@ static vsi_nn_kernel_node_t _setup
float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
float inv_multiplier = (float)1.0 / (float)(width * height);
VSI_UNREFERENCED(input_num);
VSI_UNREFERENCED(output_num);
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
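One detail worth noting in the instance-norm setup above: both factors are pre-inverted on the host (1.0f / scale and 1.0f / (width * height)) so the per-element GPU code multiplies instead of divides. A tiny numeric check of that identity, with values invented for illustration:

    #include <stdio.h>

    int main(void)
    {
        float width = 16.0f, height = 16.0f;
        float sum = 1024.0f;
        /* Precompute the reciprocal once on the host... */
        float inv_multiplier = 1.0f / (width * height);
        /* ...so the hot path is a multiply: 1024 / 256 == 1024 * (1/256) */
        printf("%f == %f\n", sum / (width * height), sum * inv_multiplier);
        return 0;
    }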

Some files were not shown because too many files have changed in this diff.