diff --git a/prebuilt-sdk/x86_64_linux/VERSION b/prebuilt-sdk/x86_64_linux/VERSION index 40da7fc..2524731 100644 --- a/prebuilt-sdk/x86_64_linux/VERSION +++ b/prebuilt-sdk/x86_64_linux/VERSION @@ -1 +1 @@ -REL/6.4.8 +REL/6.4.9 diff --git a/prebuilt-sdk/x86_64_linux/include/CL/cl_viv_vx_ext.h b/prebuilt-sdk/x86_64_linux/include/CL/cl_viv_vx_ext.h index 02286d8..3b85e85 100644 --- a/prebuilt-sdk/x86_64_linux/include/CL/cl_viv_vx_ext.h +++ b/prebuilt-sdk/x86_64_linux/include/CL/cl_viv_vx_ext.h @@ -349,75 +349,74 @@ enum eVXC_ERROR #define VXC_OP1(Op, Dest, Src0) _viv_asm(INTRINSIC, Dest, VXC_OP_##Op, Src0) #define VXC_OP2(Op, Dest, Src0, Src1) \ - do { \ + { \ int _t1; \ - _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ - _viv_asm(INTRINSIC, Dest, VXC_OP_##Op, _t1); \ - } while(0) + _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ + _viv_asm(INTRINSIC, Dest, VXC_OP_##Op, _t1); \ + } #define VXC_OP3(Op, Dest, Src0, Src1, Src2) \ - do { \ + { \ int _t1, _t2; \ _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ _viv_asm(INTRINSIC, Dest, VXC_OP_##Op, _t2); \ - } while(0) + } #define VXC_OP3_NoDest(Op, Src0, Src1, Src2) \ - do { \ + { \ int _t1, _t2, _t3; \ _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ _viv_asm(INTRINSIC_ST, _t3, VXC_OP_##Op, _t2); \ - } while(0) - + } #define VXC_OP4(Op, Dest, Src0, Src1, Src2, Src3) \ - do { \ + { \ int _t1, _t2, _t3; \ _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ _viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \ _viv_asm(INTRINSIC, Dest, VXC_OP_##Op, _t3); \ - } while(0) + } #define VXC_OP4_NoDest(Op, Src0, Src1, Src2, Src3) \ - do { \ + { \ int _t1, _t2, _t3, _t4; \ _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ _viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \ _viv_asm(INTRINSIC_ST, _t4, VXC_OP_##Op, _t3); \ - } while(0) + } #define VXC_OP4_ST(Op, Dest, Src0, Src1, Src2, Src3) \ - do { \ + { \ int _t1, _t2, _t3; \ _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ _viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \ _viv_asm(INTRINSIC_ST, Dest, VXC_OP_##Op, _t3);\ - } while(0) + } #define VXC_OP5(Op, Dest, Src0, Src1, Src2, Src3, Src4) \ - do { \ + { \ int _t1, _t2, _t3, _t4; \ _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ _viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \ _viv_asm(PARAM_CHAIN, _t4, _t3, Src4); \ _viv_asm(INTRINSIC, Dest, VXC_OP_##Op, _t4); \ - } while(0) + } #define VXC_OP5_NoDest(Op, Src0, Src1, Src2, Src3, Src4) \ - do { \ + { \ int _t1, _t2, _t3, _t4, _t5; \ _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ _viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \ _viv_asm(PARAM_CHAIN, _t4, _t3, Src4); \ _viv_asm(INTRINSIC_ST, _t5, VXC_OP_##Op, _t4); \ - } while(0) + } /* make sure the immediate value offsetX and offsetY are in range of [-16, 15] */ #define VXC_5BITOFFSET_XY(offsetX, offsetY) ((((offsetY) & 0x1F) << 5) | ((offsetX) & 0x1F)) @@ -515,41 +514,34 @@ enum eVXC_ERROR * Offset should be composed by using VXC_5BITOFFSET_XY(x, y) * Coord must be type of int4 or float4 */ -#define VXC_ReadImage2DArray(Dest, Image, Coord, Offset, Info) \ - do { \ - int8 desc; \ - _viv_asm(COPY, desc, Image, sizeof(desc)); \ - _viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \ - int baseAddr = (int)(Coord).w *desc.s4 + desc.s0; \ - _viv_asm(MOV, (Coord).w, baseAddr); \ - VXC_OP4(img_load_3d, Dest, Image, (Coord).xyww, Offset, Info); \ - } while (0) -#define 
VXC_WriteImage2DArray(Image, Coord, Color, Info) \ - do { \ - int8 desc; \ - _viv_asm(COPY, desc, Image, sizeof(desc)); \ - _viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \ - int baseAddr = (int)(Coord).w *(desc).s4 + desc.s0; \ - _viv_asm(MOV, (Coord).w, baseAddr); \ - VXC_OP4_NoDest(img_store_3d, Image, (Coord).xyww, Color, Info); \ - } while (0) +#define VXC_ReadImage2DArray(Dest, Image, OrigCoord, Offset, Info) \ + { \ + int8 desc; \ + int4 tempCoord = (int4)(OrigCoord.xyzz); \ + _viv_asm(COPY, desc, Image, sizeof(desc)); \ + _viv_asm(CLAMP0MAX, tempCoord.z, tempCoord.z, desc.s5 - 1); \ + tempCoord.z = tempCoord.z *desc.s4 + desc.s0; \ + VXC_OP4(img_load_3d, Dest, Image, tempCoord, Offset, Info); \ + } +#define VXC_WriteImage2DArray(Image, OrigCoord, Color, Info) \ + { \ + int8 desc; \ + int4 tempCoord = (int4)(OrigCoord.xyzz); \ + _viv_asm(COPY, desc, Image, sizeof(desc)); \ + _viv_asm(CLAMP0MAX, tempCoord.z, tempCoord.z, desc.s5 - 1); \ + tempCoord.z = tempCoord.z *desc.s4 + desc.s0; \ + VXC_OP4_NoDest(img_store_3d, Image, tempCoord, Color, Info); \ + } -/* image load/store for image3d_t, - * offset should be composed by using VXC_5BITOFFSET_XY(x, y) - * Coord must be type of int4 or float4 - */ -#define VXC_ReadImage3D(Dest, Image, Coord, Offset, Info) VXC_OP4(img_read_3d, Dest, Image, Coord, Offset, Info) -#define VXC_WriteImage3D(Image, Coord, Color, Info) VXC_OP4_NoDest(img_write_3d, Image, Coord, Color, Info) +#define VXC_Vload2(Dest, Pointer, Offset) { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload2, Dest, Pointer, byteOffset); } +#define VXC_Vload4(Dest, Pointer, Offset) { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload4, Dest, Pointer, byteOffset); } +#define VXC_Vload8(Dest, Pointer, Offset) { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload8, Dest, Pointer, byteOffset); } +#define VXC_Vload16(Dest, Pointer, Offset) { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload16, Dest, Pointer, byteOffset); } -#define VXC_Vload2(Dest, Pointer, Offset) do { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload2, Dest, Pointer, byteOffset); } while(0) -#define VXC_Vload4(Dest, Pointer, Offset) do { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload4, Dest, Pointer, byteOffset); } while(0) -#define VXC_Vload8(Dest, Pointer, Offset) do { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload8, Dest, Pointer, byteOffset); } while(0) -#define VXC_Vload16(Dest, Pointer, Offset) do { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload16, Dest, Pointer, byteOffset); } while(0) - -#define VXC_Vstore2(Pointer, Offset, Data) do { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore2, Pointer, byteOffset, Data); } while(0) -#define VXC_Vstore4(Pointer, Offset, Data) do { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore4, Pointer, byteOffset, Data); } while(0) -#define VXC_Vstore8(Pointer, Offset, Data) do { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore8, Pointer, byteOffset, Data); } while(0) -#define VXC_Vstore16(Pointer, Offset, Data) do { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore16, Pointer, byteOffset, Data); } while(0) +#define VXC_Vstore2(Pointer, Offset, Data) { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore2, Pointer, byteOffset, Data); } +#define VXC_Vstore4(Pointer, Offset, Data) { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore4, Pointer, 
byteOffset, Data); } +#define VXC_Vstore8(Pointer, Offset, Data) { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore8, Pointer, byteOffset, Data); } +#define VXC_Vstore16(Pointer, Offset, Data) { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore16, Pointer, byteOffset, Data); } /* VX2 only instructions*/ #define VXC_IndexAdd(Dest, Src0, Src1, Src2, Info) VXC_OP4(index_add, Dest, Src0, Src1, Src2, Info) @@ -562,7 +554,7 @@ enum eVXC_ERROR #if (VX_VERSION == 2) #define VXC_BiLinear(Dest, Src0, Src1, Src2, Info) \ - do { \ + { \ int endBin = ((Info) & VXC_END_BIN_BITMASK) >> 8; \ int roundMode = ((Info) & VXC_ROUNDING_MODE_BITMASK) >> 2; \ int clamp = ((Info) & VXC_CLAMP_BITMASK) >> 22; \ @@ -576,7 +568,7 @@ enum eVXC_ERROR _viv_asm(PARAM_CHAIN, bi4, bi3, 8); \ _viv_asm(INTRINSIC, bi2, OP_bit_extract, bi4); \ VXC_Lerp(Dest, bi2!, bi2.y!, (Src2).x, Info); \ - } while (0) + } #define VXC_BitReplace(Dest, Src0, Src1, Src2, Info) /* BitReplace definition here */ #define VXC_IAdd(Dest, Src0, Src1, Src2, Info) /* IAdd definition here */ @@ -592,7 +584,8 @@ enum eVXC_ERROR #define VXC_Filter_Max(Dest, Src0, Src1, Src2, Info) /* Max filter definition here */ #define VXC_Filter_Min(Dest, Src0, Src1, Src2, Info) /* Min filter definition here */ #define VXC_Filter_Median(Dest, Src0, Src1, Src2, Info) /* Median filter definition here */ -#define VXC_Filter(Dest, Src0, Src1, Src2, Info) do { \ +#define VXC_Filter(Dest, Src0, Src1, Src2, Info) \ + { \ int filter = (((Info) >> 16)&0x0F); \ if (filter == VXC_FM_BOX) { VXC_Filter_Box(Dest, Src0, Src1, Src2, Info); } \ if (filter == VXC_FM_Guassian) { VXC_Filter_Guassian(Dest, Src0, Src1, Src2, Info); } \ @@ -603,7 +596,7 @@ enum eVXC_ERROR if (filter == VXC_FM_Max) { VXC_Filter_Max(Dest, Src0, Src1, Src2, Info); } \ if (filter == VXC_FM_Min) { VXC_Filter_Min(Dest, Src0, Src1, Src2, Info); } \ if (filter == VXC_FM_Median) { VXC_Filter_Median(Dest, Src0, Src1, Src2, Info); } \ - } while (0) + } #else /* VX1 */ diff --git a/prebuilt-sdk/x86_64_linux/include/VX/viv_nn_compatibility.h b/prebuilt-sdk/x86_64_linux/include/VX/viv_nn_compatibility.h index 6c1e9f5..b6cc5be 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/viv_nn_compatibility.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/viv_nn_compatibility.h @@ -98,7 +98,9 @@ vxCreateTensor_11( vx_enum data_format, vx_int8 fixed_point_pos ); +#if !VX_VA40_EXT_SUPPORT #define vxCreateTensor vxCreateTensor_11 +#endif /* keep the backward compatibility with spec 1.1 for vxCreateVirtualTensor */ VX_API_ENTRY vx_tensor VX_API_CALL @@ -108,8 +110,11 @@ vxCreateVirtualTensor_11( vx_uint32 *sizes, vx_enum data_format, vx_int8 fixed_point_pos -); +); + +#if !VX_VA40_EXT_SUPPORT #define vxCreateVirtualTensor vxCreateVirtualTensor_11 +#endif /* keep the backward compatibility with spec 1.1 for vxCreateTensorFromView */ VX_API_ENTRY vx_tensor VX_API_CALL diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h index 6c3671e..782961c 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h @@ -496,6 +496,8 @@ enum vx_kernel_e { VX_KERNEL_NN_BATCH_GEMM = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x2F, + VX_KERNEL_NN_CONV_3D_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x30, + VX_KERNEL_MAX_1_2, /*!< \internal Used for VX1.2 bounds checking in the conformance test. 
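The restyled VXC_Vload*/VXC_Vstore* macros in cl_viv_vx_ext.h above take an element-count Offset and convert it to a byte offset internally (byteOffset = sizeof(Dest) * Offset, i.e. the offset counts whole vectors). A minimal, hypothetical VX C (OpenCL-style) kernel sketch using them, assuming the vxc_short8 vector typedef that the same header provides; the kernel name and buffer arguments are illustrative only:

#include "cl_viv_vx_ext.h"

/* Copies 8 shorts per work-item with the vector load/store helpers. */
__kernel void copy_short8(__global short *src, __global short *dst)
{
    int gid = get_global_id(0);
    vxc_short8 v;
    VXC_Vload8(v, src, gid);   /* reads vector number 'gid' (elements 8*gid .. 8*gid+7) */
    VXC_Vstore8(dst, gid, v);  /* writes it back to the same element position */
}
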
*/ }; diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h index e3baa23..d6d9b93 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h @@ -33,44 +33,58 @@ 0: weight_layout is whnc 1: weight_layout is whcn */ +#ifndef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS #define VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS 1 +#endif /* VX_CONVERT_POLICY_WRAP_ENABLE is used to differentiate two overflow_policys(VX_CONVERT_POLICY_WRAP and VX_CONVERT_POLICY_SAT) [value] 0: both overflow_policys considered as VX_CONVERT_POLICY_SAT 1: overflow_policy is determined by arguments. */ +#ifndef VX_CONVERT_POLICY_WRAP_ENABLE #define VX_CONVERT_POLICY_WRAP_ENABLE 1 +#endif +#ifndef VX_13_NN_COMPATIBLITY #define VX_13_NN_COMPATIBLITY 1 +#endif /* VX_L2NORM_AXIS_PARAMETER_SUPPORT is used to declare that L2NORMALIZE can support axis parameter [value] 0: not support 1: support */ +#ifndef VX_L2NORM_AXIS_PARAMETER_SUPPORT #define VX_L2NORM_AXIS_PARAMETER_SUPPORT 1 +#endif /* VX_SOFTMAX_AXIS_PARAMETER_SUPPORT is used to declare that SOFTAMX can support axis parameter [value] 0: not support 1: support */ +#ifndef VX_SOFTMAX_AXIS_PARAMETER_SUPPORT #define VX_SOFTMAX_AXIS_PARAMETER_SUPPORT 1 +#endif /* VX_NORMALIZATION_AXIS_PARAMETER_SUPPORT is used to declare that NORMALIZATION can support axis parameter [value] 0: not support 1: support */ +#ifndef VX_NORMALIZATION_AXIS_PARAMETER_SUPPORT #define VX_NORMALIZATION_AXIS_PARAMETER_SUPPORT 1 +#endif /* VX_ACTIVATION_EXT_SUPPORT is used to declare that ACTIVATION can support swish and hswish [value] 0: not support 1: support */ +#ifndef VX_ACTIVATION_EXT_SUPPORT #define VX_ACTIVATION_EXT_SUPPORT 1 +#endif /* VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT is used to query more hardware parameter such as shader sub-group size. @@ -78,7 +92,19 @@ 0: not support 1: support */ +#ifndef VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT #define VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT 1 +#endif + +/* + VX_VA40_EXT_SUPPORT is used to declare that openvx can support VA40. + [value] + 0: not support + 1: support +*/ +#ifndef VX_VA40_EXT_SUPPORT +#define VX_VA40_EXT_SUPPORT 0 +#endif /* VX_USER_LOOKUP_TABLE_SUPPORT is used to declare that openvx can support user lookuptable. @@ -86,7 +112,9 @@ 0: not support 1: support */ +#ifndef VX_USER_LOOKUP_TABLE_SUPPORT #define VX_USER_LOOKUP_TABLE_SUPPORT 1 +#endif /* VX_PRELOAD_CONST_TENSOR_SUPPORT is used to declare that openvx can support preload weight/bias and const tensor @@ -94,7 +122,9 @@ VX_PRELOAD_CONST_TENSOR_SUPPORT is used to declare that openvx can support prelo 0: not support 1: support(NN conv and TP FC weightbias, and SH const tensor) */ +#ifndef VX_PRELOAD_CONST_TENSOR_SUPPORT #define VX_PRELOAD_CONST_TENSOR_SUPPORT 1 +#endif /* VX_CREATE_TENSOR_SUPPORT_PHYSICAL is used to declare that openvx can support physical address for vxCreateTensorFromHandle @@ -102,7 +132,9 @@ VX_CREATE_TENSOR_SUPPORT_PHYSICAL is used to declare that openvx can support phy 0: not support 1: support */ +#ifndef VX_CREATE_TENSOR_SUPPORT_PHYSICAL #define VX_CREATE_TENSOR_SUPPORT_PHYSICAL 1 +#endif /* VX_GRAPH_PREEMPTION_SUPPORT is used to declare that openvx can support different graph preemption function. 
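With the feature macros in vx_khr_compatible.h now wrapped in #ifndef guards (above), a build can pre-set them instead of editing the header. A small hedged illustration, assuming the usual VX/ include layout; VX_VA40_EXT_SUPPORT is the new 40-bit virtual-address switch that defaults to 0:

/* Either define the macro before including the header ... */
#define VX_VA40_EXT_SUPPORT 1
#include <VX/vx_khr_compatible.h>

/* ... or pass -DVX_VA40_EXT_SUPPORT=1 on the compiler command line;
 * the header's own default is then skipped by the #ifndef guard. */
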
@@ -110,7 +142,9 @@ VX_CREATE_TENSOR_SUPPORT_PHYSICAL is used to declare that openvx can support phy 0: not support 1: support */ +#ifndef VX_GRAPH_PREEMPTION_SUPPORT #define VX_GRAPH_PREEMPTION_SUPPORT 1 +#endif /* VX_BATCH_GEMM_API_SUPPORT is used to declare that vsi openvx driver can support vxBatchGemmNode API to transform gemm to convolution @@ -118,6 +152,18 @@ VX_BATCH_GEMM_API_SUPPORT is used to declare that vsi openvx driver can support 0: not support 1: support */ +#ifndef VX_BATCH_GEMM_API_SUPPORT #define VX_BATCH_GEMM_API_SUPPORT 1 +#endif + +/* +VX_CONV_3D_API_SUPPORT is used to declare that vsi openvx driver can support conv3d by vxConv3dLayer API. + [value] + 0: not support + 1: support +*/ +#ifndef VX_CONV_3D_API_SUPPORT +#define VX_CONV_3D_API_SUPPORT 1 +#endif #endif /* __VX_KHR_COMPATIBLE_H__ */ diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h index 6f1c478..88a9967 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h @@ -29,6 +29,7 @@ #define OPENVX_KHR_NN "vx_khr_nn" #include +#include #include @@ -310,10 +311,47 @@ enum vx_tensor_lifetime_type_e VX_TENSOR_LIFE_TIME_DYNAMIC, }; +typedef struct _vx_nn_convolution_3d_params_t +{ + vx_int32 padding_w_left; /*!< \brief Number of elements added at each side in the left of w dimension of the input. */ + vx_int32 padding_w_right; /*!< \brief Number of elements added at each side in the right of w dimension of the input. */ + vx_int32 padding_h_top; /*!< \brief Number of elements added at each side in the top of h dimension of the input. */ + vx_int32 padding_h_bottom; /*!< \brief Number of elements added at each side in the bottom of h dimension of the input. */ + vx_int32 padding_d_front; /*!< \brief Number of elements added at each side in the front of d dimension of the input. */ + vx_int32 padding_d_rear; /*!< \brief Number of elements added at each side in the rear of d dimension of the input. */ + + vx_int32 stride_w; /*!< \brief skip w jump for down scale. */ + vx_int32 stride_h; /*!< \brief skip h jump for down scale. */ + vx_int32 stride_d; /*!< \brief skip d jump for down scale. */ + vx_int32 dilation_w; /*!< \brief "inflate" the kernel by inserting zeros between the kernel elements in the w direction. The value is the number of zeros to insert.*/ + vx_int32 dilation_h; /*!< \brief "inflate" the kernel by inserting zeros between the kernel elements in the h direction. The value is the number of zeros to insert.*/ + vx_int32 dilation_d; /*!< \brief "inflate" the kernel by inserting zeros between the kernel elements in the d direction. The value is the number of zeros to insert.*/ + + vx_enum pad_mode; /*!< \brief A VX_TYPE_ENUM of the \ref vx_pad_mode_e enumeration. */ + vx_scalar pad_const; /*!< \brief pad const value if setting pad mode to const, the const value is base value, not quantized value. */ + + vx_enum overflow_policy; /*!< \brief A VX_TYPE_ENUM of the vx_convert_policy_e enumeration. */ + vx_enum rounding_policy; /*!< \brief A VX_TYPE_ENUM of the vx_round_policy_e enumeration. */ + vx_enum down_scale_size_rounding; /*!< \brief Rounding method for calculating output dimensions. See \ref vx_nn_rounding_type_e */ + + vx_int32 depth_multiplier; /*!< \brief depthwise multiplier value, if 0, means convolution, elsewise(>=1), the convolution is depthwiseconvolution. 
*/ +}vx_nn_convolution_3d_params_t; + /*============================================================================== TENSOR DATA FUNCTIONS =============================================================================*/ - +#if VX_VA40_EXT_SUPPORT +/*! \brief Create an opaque reference to a tensor view object. + * \details Not guaranteed to exist until the vx_graph containing it has been verified. + * \param [in] context The reference to the implementation context. + * \param [in] view_array_start a vx_size array of start values of the view. + * \param [in] view_array_end a vx_size array of end values of the view. + * \param [in] numViewDimensions number of dimensions of view_array_start and view_array_end. + * \return A tensor data view reference or zero when an error is encountered. + * \ingroup group_tensor + */ +VX_API_ENTRY vx_tensor_view VX_API_CALL vxCreateTensorView(vx_context context, vx_size* view_array_start, vx_size* view_array_end, vx_size numViewDimensions); +#else /*! \brief Create an opaque reference to a tensor view object. * \details Not guaranteed to exist until the vx_graph containing it has been verified. * \param [in] context The reference to the implementation context. @@ -324,6 +362,7 @@ enum vx_tensor_lifetime_type_e * \ingroup group_tensor */ VX_API_ENTRY vx_tensor_view VX_API_CALL vxCreateTensorView(vx_context context, vx_uint32 *view_array_start, vx_uint32 * view_array_end, vx_uint8 numViewDimensions); +#endif /*! \brief Releases a reference to a tensor data view object. * The object may not be garbage collected until its total reference count is zero. @@ -337,6 +376,18 @@ VX_API_ENTRY vx_tensor_view VX_API_CALL vxCreateTensorView(vx_context context, v */ VX_API_ENTRY vx_status VX_API_CALL vxReleaseTensorView(vx_tensor_view *tensor_view); +#if VX_VA40_EXT_SUPPORT +/*! \brief Create an opaque reference to a tensor addressing object. +* \details Not guaranteed to exist until the vx_graph containing it has been verified. +* \param [in] context The reference to the implementation context. +* \param [in] addressing_array_dimension a vx_size array of sLength of patch in all dimensions in elements. +* \param [in] addressing_array_stride a vx_size arrayStride in all dimensions in bytes. +* \param [in] numViewDimensions number of dimensions of view_array_start and view_array_end. +* \return A tensor data view reference or zero when an error is encountered. +* \ingroup group_tensor +*/ +VX_API_ENTRY vx_tensor_addressing VX_API_CALL vxCreateTensorAddressing(vx_context context, vx_size* addressing_array_dimension, vx_size* addressing_array_stride, vx_size numViewDimensions); +#else /*! \brief Create an opaque reference to a tensor addressing object. * \details Not guaranteed to exist until the vx_graph containing it has been verified. * \param [in] context The reference to the implementation context. @@ -346,7 +397,8 @@ VX_API_ENTRY vx_status VX_API_CALL vxReleaseTensorView(vx_tensor_view *tensor_vi * \return A tensor data view reference or zero when an error is encountered. * \ingroup group_tensor */ -VX_API_ENTRY vx_tensor_addressing VX_API_CALL vxCreateTensorAddressing(vx_context context, vx_uint32 *addressing_array_dimension, vx_uint32 * addressing_array_stride, vx_uint8 numViewDimensions); +VX_API_ENTRY vx_tensor_addressing VX_API_CALL vxCreateTensorAddressing(vx_context context, vx_uint32 * addressing_array_dimension, vx_uint32 * addressing_array_stride, vx_uint8 numViewDimensions); +#endif /*! \brief Releases a reference to a tensor data addressing object. 
* The object may not be garbage collected until its total reference count is zero. @@ -402,7 +454,11 @@ typedef union _vx_tensor_quant_param typedef struct _vx_tensor_create_params_t { vx_uint32 num_of_dims; /*!< \brief The number of dimensions specified in *sizes*/ +#if VX_VA40_EXT_SUPPORT + vx_size * sizes; /*!< \brief The pointer to an array of dimension */ +#else vx_uint32 * sizes; /*!< \brief The pointer to an array of dimension */ +#endif vx_enum data_format; /*!< \brief Data format for the tensor */ vx_enum quant_format; /*!< \brief Quantized format \ref vx_quantized_format_e . */ vx_tensor_quant_param quant_data; @@ -482,7 +538,19 @@ VX_API_ENTRY vx_tensor VX_API_CALL vxCreateTensorFromHandle2( */ VX_API_ENTRY vx_status VX_API_CALL vxFlushHandle(vx_reference ref); - +#if VX_VA40_EXT_SUPPORT +/*! \brief Return a new tensor referencing the same memory location but with different shape. +* \param [in] tensor The input tensor data to reshape. +* \param [in] num_of_dims Size of each dimension. If one component is special value -1, +* the size of that dimension is computed so that the total size remains the same as input tensor. +* If is is [-1], then flatten is performed which turns tensor into 1-D. +* \param [in] sizes The size of the container to which \a num_of_dims points. +* \return a vx_tensor that has shaped. +* \return VX_NULL if an error occurred. +* \ingroup group_tensor +*/ +VX_API_ENTRY vx_tensor VX_API_CALL vxReshapeTensor(vx_tensor tensor, vx_size* num_of_dims, vx_size sizes); +#else /*! \brief Return a new tensor referencing the same memory location but with different shape. * \param [in] tensor The input tensor data to reshape. * \param [in] num_of_dims Size of each dimension. If one component is special value -1, @@ -494,6 +562,7 @@ VX_API_ENTRY vx_status VX_API_CALL vxFlushHandle(vx_reference ref); * \ingroup group_tensor */ VX_API_ENTRY vx_tensor VX_API_CALL vxReshapeTensor(vx_tensor tensor, vx_int32* num_of_dims, vx_uint32 sizes); +#endif /*! \brief Allows setting attributes on the tensor. * \param [in] tensor The reference to the tensor on which to set the attribute. @@ -1961,6 +2030,7 @@ typedef struct _vx_hardware_caps_params_ext_t { vx_hardware_caps_params_t base; vx_uint32 subGroupSize; /*!< \brief shader sub-group size.*/ + vx_bool supportVA40; /*!< \brief support 40bit virtual address.*/ } vx_hardware_caps_params_ext_t; /*! \brief Queries hardware caps information. @@ -1979,6 +2049,29 @@ VX_API_ENTRY vx_status VX_API_CALL vxQueryHardwareCaps( vx_size size_of_hardware_caps_param ); +/*! \brief [Graph] Creates a Convolutional-3d Network Convolution Layer Node. + * \details This function implement Convolutional-3d Network Convolution layer. + * For fixed-point data types, a fixed point calculation is performed with round and saturate according to the number of accumulator bits. The number of the accumulator bits are implementation defined, + * and should be at least 16.\n + * round: rounding according the vx_round_policy_e enumeration. \n + * saturate: A saturation according the vx_convert_policy_e enumeration. + * \param [in] graph The handle to the graph. + * \param [in] inputs The input tensor data. 4 lower dimensions represent a single input, all following dimensions represent number of batches, possibly nested. + * The dimension order is [width, height, depth, #IFM, #batches].\n + * \param [in] weights [*static] Weights are 5d tensor with dimensions [kernel_x, kernel_y, kernel_d, #IFM, #OFM]. 
+ * see \ref vxCreateTensor2 and \ref vxCreateVirtualTensor2 \n Weights data type must match the data type of the inputs. (Kernel parameter #1) + * \param [in] biases [*static] Optional, ignored if NULL. The biases, which may be shared (one per ofm) or unshared (one per ofm * output location). The possible layouts are + * either [#OFM] or [width, height, #OFM]. Biases data type must match the data type of the inputs. + * \param [in] convolution_params [static] Pointer to parameters of type \ref vx_nn_convolution_3d_params_t. + * \param [in] size_of_convolution_params [static] Size in bytes of convolution_params. Note that this parameter is not counted as one of the kernel parameters. + * \param [out] outputs The output tensor data. Output will have the same number and structure of dimensions as input. Output tensor data type must be same as the inputs. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxConv3dLayer(vx_graph graph, vx_tensor inputs, vx_tensor weights, vx_tensor biases, const vx_nn_convolution_3d_params_t *convolution_params, vx_size size_of_convolution_params, vx_tensor outputs); + #ifdef __cplusplus } #endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h index 41e1653..506938f 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h @@ -289,6 +289,169 @@ typedef struct _vx_weights_biases_parameter_optimizations_ext2_t { vx_int8 output_fpp_dw; /*depthwise conv output fix-point*/ } vx_weights_biases_parameter_optimizations_ext2_t; +#if VX_VA40_EXT_SUPPORT +/*! + * \brief Creates a reference to a vx_weights_biases_parameter opaque object. + * + * \param [in] layer_type The network type of objects to hold. Types allowed are: + * \arg VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER for convolution layer. + * \arg VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER for fullyconnected layer. + * \param [in] num_of_dims The dimention number of input & output image tensor. + * \param [in] inputs_dims The input tensor's dimension size. + * \param [in] pad_x The number of elements subtracted at each side in the x dimension of the input. + * \param [in] pad_y The number of elements subtracted at each side in the y dimension of the input. + * \param [in] pooling_size_x The size of the pooling region in the x dimension, 0 means no pooling operation. + * \param [in] pooling_size_y The size of the pooling region in the y dimension, 0 means no pooling operation. + * \param [in] down_scale_size_rounding A VX_TYPE_ENUM of the vx_round_policy_e enumeration. + * \param [in] convolution_outputs_dims The output's dimension size after covolution operation. + * \param [in] pool_outputs_dims The output's dimension size after pooling operation. + * \param [in] optimizations A optional param for \ref vx_weights_biases_parameter_optimizations_t. + * \param [in] weights The weights tensor which need be compressed. + * \param [in] biases The biases tensor which need be compressed. + * + * \returns An opaque vx_weights_biases_parameter reference with compressed kernel data. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. 
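A hedged usage sketch for the vxConv3dLayer entry point declared above; the graph and the input/weights/biases/output tensors are assumed to already exist, and only the parameter-struct setup and node creation are shown:

#include <string.h>
#include <VX/vx.h>
#include <VX/vx_khr_nn.h>

vx_node make_conv3d_node(vx_graph graph, vx_tensor input, vx_tensor weights,
                         vx_tensor biases, vx_tensor output)
{
    vx_nn_convolution_3d_params_t p;
    memset(&p, 0, sizeof(p));                 /* zero padding and dilation */
    p.stride_w = 1;
    p.stride_h = 1;
    p.stride_d = 1;
    p.overflow_policy = VX_CONVERT_POLICY_SATURATE;
    p.rounding_policy = VX_ROUND_POLICY_TO_ZERO;
    p.down_scale_size_rounding = VX_NN_DS_SIZE_ROUNDING_FLOOR;
    p.depth_multiplier = 0;                   /* 0 = regular convolution, >=1 = depthwise */
    /* pad_mode/pad_const are left zeroed in this sketch; a real caller sets them
     * from vx_pad_mode_e and a vx_scalar constant as documented above. */

    return vxConv3dLayer(graph, input, weights, biases, &p, sizeof(p), output);
}
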
+ * + * \ingroup group_cnn + */ +VX_API_ENTRY vx_weights_biases_parameter VX_API_CALL +vxCreateWeightsBiasesParameterFromTensors( + vx_enum layer_type, + vx_size num_of_dims, + vx_size * inputs_dims, + vx_uint32 pad_x, + vx_uint32 pad_y, + vx_uint32 pooling_size_x, + vx_uint32 pooling_size_y, + vx_enum down_scale_size_rounding, + vx_size * convolution_outputs_dims, + vx_size * pool_outputs_dims, + vx_weights_biases_parameter_optimizations_t *optimizations, + vx_tensor weights, + vx_tensor biases); + +/*! + * \brief Creates a reference to an opaque vx_weights_biases_parameter object. + * + * \param [in] layer_type The network type of objects to hold. Types allowed are: + * \arg VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER for convolution layer. + * \arg VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER for fullyconnected layer. + * \param [in] num_of_dims The dimention number of input & output image tensor. + * \param [in] inputs_dims The input tensor's dimension size. + * \param [in] convolution_outputs_dims The output's dimension size after covolution operation. + * \param [in] pool_outputs_dims The output's dimension size after pooling operation. + * \param [in] output_format The output tensor element type. + * \param [in] convolution_relu_pooling_params The convolution_relu_pooling_params Pointer to parameters of type \ref vx_nn_convolution_relu_pooling_params_t + * \param [in] size_of_convolution_relu_pooling_params The size in bytes of convolution_relu_pooling_params. + * \param [in] optimizations A optional param for \ref vx_weights_biases_parameter_optimizations_t. + * \param [in] weights The weights tensor which need be compressed. + * \param [in] biases The biases tensor which need be compressed. + * + * \returns An opaque vx_weights_biases_parameter reference with compressed kernel data. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * + * \ingroup group_cnn + */ +VX_API_ENTRY vx_weights_biases_parameter VX_API_CALL vxCreateWeightsBiasesParameterFromTensors2( + vx_enum layer_type, + vx_size num_of_dims, + vx_size * inputs_dims, + vx_size * convolution_outputs_dims, + vx_size * pool_outputs_dims, + vx_enum output_format, + const vx_nn_convolution_relu_pooling_params convolution_relu_pooling_params, + vx_size size_of_convolution_relu_pooling_params, + vx_weights_biases_parameter_optimizations_t *optimizations, + vx_tensor weights, + vx_tensor biases); + +/*! + * \brief Creates a reference to an opaque vx_weights_biases_parameter object. + * + * \param [in] layer_type The network type of objects to hold. Types allowed are: + * \arg VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER for convolution layer. + * \arg VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER for fullyconnected layer. + * \param [in] inputs_dims The input tensor's dimension size. + * \param [in] convolution_outputs_dims The output's dimension size after covolution operation. + * \param [in] pool_outputs_dims The output's dimension size after pooling operation. + * \param [in] convolution_relu_pooling_params The convolution_relu_pooling_params Pointer to parameters of type \ref vx_nn_convolution_relu_pooling_params_t + * \param [in] size_of_convolution_relu_pooling_params The size in bytes of convolution_relu_pooling_params. + * \param [in] optimizations A optional param for \ref vx_weights_biases_parameter_optimizations_t. + * \param [in] size_of_optimizations The size in bytes of optimizations. + * \param [in] weights The weights tensor which need be compressed. 
+ * \param [in] biases The biases tensor which need be compressed. + * + * \returns An opaque vx_weights_biases_parameter reference with compressed kernel data. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * + * \ingroup group_cnn + */ +VX_API_ENTRY vx_weights_biases_parameter VX_API_CALL vxCreateWeightsBiasesParameterFromTensors3( + vx_enum layer_type, + vx_size * inputs_dims, + vx_size * convolution_outputs_dims, + vx_size * pool_outputs_dims, + const vx_nn_convolution_relu_pooling_params convolution_relu_pooling_params, + vx_size size_of_convolution_relu_pooling_params, + vx_weights_biases_parameter_optimizations_t *optimizations, + vx_size size_of_optimizations, + vx_tensor weights, + vx_tensor biases); + +/*! + * \brief Creates a reference to an vx_weights_biases_parameter object. + * \param [in] context The OpenVX context object. + * \param [in] layer_type The network type of objects to hold. Types allowed are: + * \arg VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER for convolution layer. + * \arg VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER for fullyconnected layer. + * \param [in] num_of_dims The dimention number of input & output image tensor. + * \param [in] inputs_dims The input tensor's dimension size. + * \param [in] pad_x The number of elements subtracted at each side in the x dimension of the input. + * \param [in] pad_y The number of elements subtracted at each side in the y dimension of the input. + * \param [in] pooling_size_x The size of the pooling region in the x dimension, 0 means no pooling operation. + * \param [in] pooling_size_y The size of the pooling region in the y dimension, 0 means no pooling operation. + * \param [in] down_scale_size_rounding A VX_TYPE_ENUM of the vx_round_policy_e enumeration. + * \param [in] convolution_outputs_dims The output's dimension size after covolution operation. + * \param [in] pool_outputs_dims The output's dimension size after pooling operation. + * \param [in] weights_num_of_dims The dimention number of weights tensor. + * \param [in] weights_dims The dimention size of weights tensor. + * \param [in] weights_data_format The format of weights tensor. + * \param [in] weights_fixed_point_pos The fixed point position when the weights element type is int16/int8, if 0 calculations are performed in integer math. + * \param [in] biases_num_of_dims The dimention number of biases tensor. + * \param [in] biases_dims The dimention size of biases tensor. + * \param [in] biases_data_format The format of biases tensor. + * \param [in] biases_fixed_point_pos The fixed point position when the biases element type is int16/int8, if 0 calculations are performed in integer math. + * \param [in] raw_data_size The data size of compressed data. + * + * \returns A weightsbiases reference without compressed kernel data vx_weights_biases_parameter. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. 
+ * + * \ingroup group_cnn + */ +VX_API_ENTRY vx_weights_biases_parameter VX_API_CALL +vxCreateWeightsBiasesParameter( + vx_context context, + vx_enum layer_type, + vx_size num_of_dims, + vx_size * inputs_dims, + vx_uint32 pad_x, + vx_uint32 pad_y, + vx_uint32 pooling_size_x, + vx_uint32 pooling_size_y, + vx_enum down_scale_size_rounding, + vx_size * convolution_outputs_dims, + vx_size * pool_outputs_dims, + vx_size weights_num_of_dims, + vx_size * weights_dims, + vx_enum weights_data_format, + vx_int8 weights_fixed_point_pos, + vx_size biases_num_of_dims, + vx_size * biases_dims, + vx_enum biases_data_format, + vx_int8 biases_fixed_point_pos, + vx_uint32 raw_data_size + ); +#else /*! * \brief Creates a reference to a vx_weights_biases_parameter opaque object. * @@ -397,17 +560,6 @@ VX_API_ENTRY vx_weights_biases_parameter VX_API_CALL vxCreateWeightsBiasesParame vx_tensor weights, vx_tensor biases); -/*! \brief Releases the OpenVX object vx_weights_biases_parameter. - * \param [in] weights_bias The pointer to the reference to the vx_weights_biases_parameter. - * \post After returning from this function the reference is zeroed. - * \return A \ref vx_status_e enumeration. - * \retval VX_SUCCESS No errors. - * \retval VX_ERROR_INVALID_REFERENCE If weights_bias is not a vx_weights_biases_parameter. - * \pre \ref vxCreateWeightsBiasesParameterFromTensors / vxCreateWeightsBiasesParameterFromTensors2/ vxCreateWeightsBiasesParameter / vxCreateWeightsBiasesParameterFromStream - * \ingroup group_cnn - */ -VX_API_ENTRY vx_status VX_API_CALL vxReleaseWeightsBiasesParameter(vx_weights_biases_parameter *weights_bias); - /*! * \brief Creates a reference to an vx_weights_biases_parameter object. * \param [in] context The OpenVX context object. @@ -461,7 +613,18 @@ vxCreateWeightsBiasesParameter( vx_int8 biases_fixed_point_pos, vx_uint32 raw_data_size ); +#endif +/*! \brief Releases the OpenVX object vx_weights_biases_parameter. + * \param [in] weights_bias The pointer to the reference to the vx_weights_biases_parameter. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If weights_bias is not a vx_weights_biases_parameter. + * \pre \ref vxCreateWeightsBiasesParameterFromTensors / vxCreateWeightsBiasesParameterFromTensors2/ vxCreateWeightsBiasesParameter / vxCreateWeightsBiasesParameterFromStream + * \ingroup group_cnn + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseWeightsBiasesParameter(vx_weights_biases_parameter *weights_bias); /*! \brief Input parameters for a gru operation. * \ingroup group_cnn * \version 0.5 @@ -900,6 +1063,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxTensorTableLookupLayer( vx_lut InLut, vx_lut OutLut, vx_tensor output); + #ifdef __cplusplus } #endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h index 0881c15..51bf129 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h @@ -444,6 +444,11 @@ enum vx_type_e { * \ingroup group_basic_features */ enum vx_status_e { + VX_ERROR_VENDOR_VSI_END = -2000, /*!< \brief A vendor defined error status end base. */ + /* add new error here*/ + VX_ERROR_CANCEL_JOB = -1001, /*!< \brief Indicates that a VIP job was cancelled. */ + VX_ERROR_VENDOR_VSI_START = -1000, /*!< \brief A vendor defined error status start base. 
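The vendor status range added above includes VX_ERROR_CANCEL_JOB, which pairs with the vxSysCancelJob entry point and the VX_GRAPH_STATE_CANCELLED graph state declared later in this diff (vx_viv_sys.h and vx_types.h). A hedged two-thread sketch; error handling is elided:

#include <VX/vx.h>
#include <VX/vx_viv_sys.h>

/* Worker thread: a cancelled VIP job surfaces as VX_ERROR_CANCEL_JOB here. */
vx_status run_graph(vx_graph graph)
{
    return vxProcessGraph(graph);
}

/* Control thread: ask the driver to abort all outstanding VIP jobs on this context. */
vx_status cancel_all_jobs(vx_context context)
{
    return vxSysCancelJob(context);
}
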
*/ + VX_STATUS_MIN = -25,/*!< \brief Indicates the lower bound of status codes in VX. Used for bounds checks only. */ /* add new codes here */ VX_ERROR_REFERENCE_NONZERO = -24,/*!< \brief Indicates that an operation did not complete due to a reference count being non-zero. */ @@ -718,6 +723,8 @@ enum vx_graph_state_e { VX_GRAPH_STATE_ABANDONED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_GRAPH_STATE) + 0x3, /*! \brief The graph execution is completed and the graph is not scheduled for execution */ VX_GRAPH_STATE_COMPLETED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_GRAPH_STATE) + 0x4, + /*! \brief The graph execution was cancelled */ + VX_GRAPH_STATE_CANCELLED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_GRAPH_STATE) + 0x5, }; /*! \brief The graph attributes list. diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_viv_sys.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_viv_sys.h index f97512f..e31ba0d 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_viv_sys.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_viv_sys.h @@ -53,6 +53,19 @@ VX_API_ENTRY vx_status VX_API_CALL vxSysSetVipFrequency( vx_uint32 shaderFscaleValue ); +/*! \brief cancel all VIP processing jobs. + * \param [in] context The reference to the implementation context. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Cancelled all VIP processing job successfully + * and user can check return of vxProcessGraph() to get cancelled status. + * \retval VX_ERROR_INVAID_PARAMETERS Invalid context reference. + * \retval VX_ERROR_NOT_SUPPORTED Hardware does not support job cancellation. + * \retval VX_FAILURE Failed to cancel VIP proccessing job. + */ +VX_API_ENTRY vx_status VX_API_CALL vxSysCancelJob( + vx_context context + ); + #ifdef __cplusplus } #endif diff --git a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so index 575b344..4831755 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so and b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libCLC.so b/prebuilt-sdk/x86_64_linux/lib/libCLC.so index ff87c25..e9101a5 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libCLC.so and b/prebuilt-sdk/x86_64_linux/lib/libCLC.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so index dbd7197..2d30e1e 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so and b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libGAL.so b/prebuilt-sdk/x86_64_linux/lib/libGAL.so index 0439666..690ba12 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libGAL.so and b/prebuilt-sdk/x86_64_linux/lib/libGAL.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so index 9a4e15c..6a2cefc 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so and b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 index 99ec9c8..29fffa4 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 and b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so index 44e37de..e33fc05 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so and b/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so differ diff --git 
a/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so index 50c2a10..0d2a6c0 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so and b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libVSC.so b/prebuilt-sdk/x86_64_linux/lib/libVSC.so index 07646f8..e8b7c99 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libVSC.so and b/prebuilt-sdk/x86_64_linux/lib/libVSC.so differ diff --git a/src/tim/vx/internal/BUILD b/src/tim/vx/internal/BUILD index 186c6a9..392f1ec 100644 --- a/src/tim/vx/internal/BUILD +++ b/src/tim/vx/internal/BUILD @@ -25,7 +25,6 @@ filegroup( srcs = glob([ "include/kernel/cl/*.h", "include/kernel/evis/*.h", - "include/kernel/cpu/*.h", ]) ) @@ -34,7 +33,6 @@ filegroup( srcs = glob([ "src/kernel/cl/*.c", "src/kernel/evis/*.c", - "src/kernel/cpu/*.c", "src/kernel/vx/*.c", ]) ) @@ -137,6 +135,7 @@ cc_library( "include/kernel/vsi_nn_kernel_eltwise.h", "include/kernel/vsi_nn_kernel_node.h", "include/kernel/vsi_nn_kernel_gpu_shape_optimize.h", + "include/kernel/vsi_nn_kernel_lut.h", "include/vsi_nn_error.h", # libnnext @@ -193,6 +192,7 @@ cc_library( "src/kernel/vsi_nn_kernel_selector.c", "src/kernel/vsi_nn_kernel_node.c", "src/kernel/vsi_nn_kernel_param.c", + "src/kernel/vsi_nn_kernel_lut.c", "src/kernel/vsi_nn_gpu.c", "src/kernel/vsi_nn_kernel_gpu_shape_optimize.c", "src/libnnext/vsi_nn_libnnext_resource.c", diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index 0590aad..cf5bebb 100644 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -163,3 +163,5 @@ DEF_OP(CONV2D_LSTM_CELL) DEF_OP(GRU) DEF_OP(GRUCELL) DEF_OP(GRUCELL_ACTIVATION) +DEF_OP(RESHAPE2) +DEF_OP(CONV3D) diff --git a/src/tim/vx/internal/include/internal/internal_ops.def b/src/tim/vx/internal/include/internal/internal_ops.def index ab04552..06dbc61 100644 --- a/src/tim/vx/internal/include/internal/internal_ops.def +++ b/src/tim/vx/internal/include/internal/internal_ops.def @@ -17,3 +17,5 @@ DEF_OP(GRUCELL_ACTIVATION_INTERNAL_SMA) DEF_OP(RESIZE_1D_BILINEAR_INTERNAL) DEF_OP(RESIZE_1D_NEAREST_INTERNAL) DEF_OP(SPACE2DEPTH_INTERNAL) +DEF_OP(GRUCELL_H_TIMES_ACTIVATION_R) +DEF_OP(GRUCELL_ACTIVATION_Z_H) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h index 73cfcd7..05222b2 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h @@ -48,6 +48,7 @@ typedef enum VSI_NN_KERNEL_TYPE_EVIS, VSI_NN_KERNEL_TYPE_CL, VSI_NN_KERNEL_TYPE_VX, + VSI_NN_KERNEL_TYPE_SP, VSI_NN_KERNEL_TYPE_NUM, VSI_NN_KERNEL_TYPE_NONE = VSI_NN_KERNEL_TYPE_NUM } vsi_nn_kernel_type_e; @@ -75,7 +76,9 @@ typedef enum F32, F64, BF16, - BOOL8 + BOOL8, + I4, + U4, } vsi_nn_kernel_dtype_e; typedef enum @@ -303,6 +306,8 @@ const void * vsi_nn_kernel_param_get_const_buffer REGISTER_KERNEL_BACKEND(operation, CPU, func) #define REGISTER_BACKEND_OPENVX(operation, func) \ REGISTER_KERNEL_BACKEND(operation, VX, func) +#define REGISTER_BACKEND_STREAM_PROCESSOR(operation, func) \ + REGISTER_KERNEL_BACKEND(operation, SP, func) #define DEF_KERNEL_BASE_CALLBACK( NAME ) \ static vsi_status NAME##_impl( vsi_nn_kernel_node_t node, \ @@ -478,6 +483,10 @@ static inline vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype { switch( dtype ) { + case VSI_NN_TYPE_INT4: + return I4; + case VSI_NN_TYPE_UINT4: + return U4; case VSI_NN_TYPE_INT8: return I8; case 
VSI_NN_TYPE_BOOL8: @@ -514,6 +523,10 @@ static inline vsi_nn_type_e vsi_nn_dtype_map_kernel { switch( dtype ) { + case I4: + return VSI_NN_TYPE_INT4; + case U4: + return VSI_NN_TYPE_UINT4; case I8: return VSI_NN_TYPE_INT8; case BOOL8: @@ -572,6 +585,38 @@ static inline size_t vsi_nn_kernel_dtype_get_bytes return 0; } /* vsi_nn_kernel_dtype_get_bytes() */ +static inline vsi_size_t vsi_nn_kernel_dtype_get_bits + ( + vsi_nn_kernel_dtype_e dtype + ) +{ + switch( dtype ) + { + case I4: + case U4: + return 4; + case I8: + case U8: + case BOOL8: + return 8; + case I16: + case U16: + case F16: + case BF16: + return 16; + case I32: + case U32: + case F32: + return 32; + case I64: + return 64; + default: + VSILOGE("Error data type %d", dtype); + break; + } + return 0; +} /* vsi_nn_kernel_dtype_get_bits() */ + static inline vsi_nn_kernel_quant_type_e vsi_nn_kernel_map_quant_type ( vsi_nn_qnt_type_e quant_type ) { @@ -615,6 +660,12 @@ static inline void vsi_nn_kernel_scalar_release } } /* vsi_nn_kernel_scalar_relase() */ +vsi_status vsi_nn_kernel_scalar_read_uint4 + ( vsi_nn_kernel_scalar_t scalar, uint8_t * out_data ); + +vsi_status vsi_nn_kernel_scalar_read_int4 + ( vsi_nn_kernel_scalar_t scalar, int8_t * out_data ); + vsi_status vsi_nn_kernel_scalar_read_int8 ( vsi_nn_kernel_scalar_t scalar, int8_t * out_data ); @@ -751,25 +802,90 @@ static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_size static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_bytes ( const vsi_nn_kernel_tensor_attr_t * attr ) { - vsi_size_t size; - vsi_size_t type_bytes; + vsi_size_t i = 0; + vsi_size_t bytes; + vsi_size_t bits_num; + vsi_size_t * shape = NULL; if( !attr ) { return 0; } - size = vsi_nn_kernel_tensor_attr_get_size( attr ); - type_bytes = (vsi_size_t)vsi_nn_kernel_dtype_get_bytes( attr->dtype ); - return size * type_bytes; + + shape = attr->shape->data; + + bits_num = vsi_nn_kernel_dtype_get_bits( attr->dtype ); + if ( bits_num < BITS_PER_BYTE ) + { + if (shape[0] % 2 == 0) + { + bytes = shape[0] / 2; + } + else + { + bytes = shape[0] / 2 + shape[0] % 2; + } + } + else + { + bytes = shape[0] * bits_num / BITS_PER_BYTE; + } + for ( i = 1; i < (vsi_size_t)attr->shape->size; i ++ ) + { + bytes *= shape[i]; + } + + return bytes; } /* vsi_nn_kernel_tensor_attr_get_bytes() */ static inline void vsi_nn_kernel_tensor_attr_get_stride ( const vsi_nn_kernel_tensor_attr_t * attr, vsi_size_t * out_stride) { + vsi_size_t type_bits; + vsi_size_t total_bytes; + vsi_size_t * shape = NULL; + if( !attr || !out_stride ) { return; } - vsi_nn_shape_get_stride( attr->shape->data, (vsi_size_t)attr->shape->size, out_stride ); + + shape = attr->shape->data; + type_bits = vsi_nn_kernel_dtype_get_bits( attr->dtype ); + + if ( type_bits < BITS_PER_BYTE ) + { + vsi_size_t i; + + out_stride[0] = type_bits / BITS_PER_BYTE; + total_bytes = out_stride[0]; + + total_bytes = 1; + if ( shape[0] % (BITS_PER_BYTE / type_bits) == 0 ) + { + out_stride[1] = shape[0] * type_bits / BITS_PER_BYTE; + } + else + { + out_stride[1] = shape[0] * type_bits / BITS_PER_BYTE + 1; + } + + total_bytes *= out_stride[1]; + for (i = 2; i < (vsi_size_t)attr->shape->size; i++) + { + out_stride[i] = shape[i - 1] * out_stride[i - 1]; + total_bytes *= shape[i]; + } + total_bytes *= shape[1]; + + for( i = (vsi_size_t)attr->shape->size; i < VSI_NN_MAX_DIM_NUM; i ++ ) + { + out_stride[i] = total_bytes; + } + } + else + { + vsi_nn_shape_get_stride( attr->shape->data, (vsi_size_t)attr->shape->size, out_stride ); + } } /* vsi_nn_kernel_tensor_attr_get_size() */ static inline 
vsi_bool vsi_nn_kernel_tensor_attr_is_quantized @@ -903,12 +1019,115 @@ static inline const char* vsi_nn_kernel_type_str return "CL"; case VSI_NN_KERNEL_TYPE_VX: return "OPENVX"; + case VSI_NN_KERNEL_TYPE_SP: + return "STERAM_PROCESSOR"; default: break; } return "None"; } /* vsi_nn_kernel_type_str() */ +static inline vsi_status vsi_nn_kernel_unpack_4bit_data + ( + const vsi_nn_kernel_tensor_attr_t * attr, + uint8_t * src, + uint8_t * dest, + vsi_nn_kernel_dtype_e dtype + ) +{ + vsi_status status; + uint32_t i = 0, j = 0; + uint8_t high = 0, low = 0; + vsi_size_t stride[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t src_size; + + status = VSI_SUCCESS; + vsi_nn_kernel_tensor_attr_get_stride( attr, stride ); + + src_size = stride[attr->shape->size]; + + for ( i = 0 ; i < src_size; i++) + { + high = src[i] >> 4; + low = src[i] & 0x0F; + if ( dtype == I4 ) + { + if( high > 7) + { + high = high | 0xF0; + } + if( low > 7) + { + low = low | 0xF0; + } + } + if ( attr->shape->data[0] % stride[1] == 0 ) + { + if ( attr->shape->data[0] == 1 ) + { + dest[j] = low; + j++; + } + else + { + dest[j] = low; + dest[j+1] = high; + j += 2; + } + } + else + { + if ( (i+1) % stride[1] == 0 ) + { + dest[j] = low; + j++; + } + else + { + dest[j] = low; + dest[j+1] = high; + j += 2; + } + } + } + + return status; +} + +static inline vsi_status vsi_nn_kernel_pack_4bit_data + ( + const vsi_nn_kernel_tensor_attr_t * attr, + uint8_t * src, + uint8_t * dest + ) +{ + vsi_status status; + uint32_t i = 0, j = 0; + uint8_t high = 0, low = 0; + vsi_size_t src_size; + + status = VSI_SUCCESS; + src_size = vsi_nn_kernel_tensor_attr_get_size( attr ); + for ( i = 0; i < src_size; i++ ) + { + if ( (i+1) % attr->shape->data[0] == 0) + { + high = 0; + low = src[i]; + } + else + { + high = src[i+1]; + low = src[i]; + i++; + } + dest[j] = (high << 4) | (low & 0xF); + j++; + } + + return status; +} + __END_DECLS #endif diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h new file mode 100644 index 0000000..f5da0f1 --- /dev/null +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h @@ -0,0 +1,75 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
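The new I4/U4 handling above stores two elements per byte and rounds the innermost dimension up to whole bytes (see vsi_nn_kernel_tensor_attr_get_bytes and the pack/unpack helpers). A small standalone sketch of the same byte-count arithmetic, independent of the kernel attribute structs:

#include <stddef.h>

/* Bytes needed for a packed 4-bit tensor: two nibbles per byte in the innermost
 * dimension, padded up to a whole byte; outer dimensions simply multiply. */
static size_t packed_4bit_bytes(const size_t *shape, size_t rank)
{
    size_t bytes = shape[0] / 2 + shape[0] % 2;   /* innermost row, in bytes */
    size_t i;
    for (i = 1; i < rank; i++)
    {
        bytes *= shape[i];
    }
    return bytes;
}

/* Example: an INT4 tensor of shape {5, 3} needs (5/2 + 1) * 3 = 9 bytes. */
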
+* +*****************************************************************************/ + +#ifndef _VSI_NN_KERNEL_LUT_H +#define _VSI_NN_KERNEL_LUT_H + +#include + +__BEGIN_DECLS + +typedef int32_t vsi_nn_kernel_lut_act_e; enum +{ + VSI_NN_KERNEL_LUT_NONE = 0, + VSI_NN_KERNEL_LUT_MISH = 1, + VSI_NN_KERNEL_LUT_LOG = 2, + VSI_NN_KERNEL_LUT_EXP = 3, + VSI_NN_KERNEL_LUT_ELU = 4, + VSI_NN_KERNEL_LUT_NEG = 5, + VSI_NN_KERNEL_LUT_HSIGMOID = 6, + VSI_NN_KERNEL_LUT_SOFT_PLUS = 7, + VSI_NN_KERNEL_LUT_ERF = 8, + VSI_NN_KERNEL_LUT_GELU = 9, + VSI_NN_KERNEL_LUT_HGELU = 10, + VSI_NN_KERNEL_LUT_RELU_KERAS = 11, + VSI_NN_KERNEL_LUT_CLIP = 12, + VSI_NN_KERNEL_LUT_SQUARE = 13, +}; + +#define VSI_NN_KERNEL_LUT_MAX_SIZE (1024) +#define VSI_NN_KERNEL_LUT_FP16_MAX (57344) +#define VSI_NN_KERNEL_LUT_FP16_MIN (-57344) + +typedef struct _vsi_nn_kernel_lut_ +{ + float index; + float val; +} vsi_nn_kernel_lut_t; + +typedef struct _vsi_nn_kernel_lut_params +{ + vsi_enum act_type; + float params[16]; +} vsi_nn_kernel_lut_params; + +vsi_status vsi_nn_kernel_lut + ( + vx_lut index_lut, + vx_lut output_lut, + vsi_nn_kernel_lut_params *param + ); + +__END_DECLS + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_argmin.h b/src/tim/vx/internal/include/ops/vsi_nn_op_argmin.h index 7f43ec8..e9d1b70 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_argmin.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_argmin.h @@ -26,6 +26,9 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif #define VSI_NN_ARGMIN_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ VSI_NN_ARGMIN_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, @@ -110,5 +113,8 @@ typedef struct _vsi_nn_argmin_param int32_t axis; } vsi_nn_argmin_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_axis_aligned_bbox_transform.h b/src/tim/vx/internal/include/ops/vsi_nn_op_axis_aligned_bbox_transform.h index fcdd425..fb88141 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_axis_aligned_bbox_transform.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_axis_aligned_bbox_transform.h @@ -26,10 +26,17 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_axis_aligned_bbox_transform_param { vsi_enum type; } vsi_nn_axis_aligned_bbox_transform_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_batchnorm_single.h b/src/tim/vx/internal/include/ops/vsi_nn_op_batchnorm_single.h index 36ccbfc..f4a4ffe 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_batchnorm_single.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_batchnorm_single.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + /* enum for inputs/outputs */ enum { @@ -50,5 +54,8 @@ typedef struct _vsi_nn_batchnorm_single_param float eps; } vsi_nn_batchnorm_single_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_lstm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_lstm.h index b183d9a..8a4e7cb 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_lstm.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_lstm.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { BI_LSTM_INPUT_INPUT = 0, @@ -132,5 +136,8 @@ typedef struct _vsi_nn_bidirectional_sequence_lstm_param vsi_nn_dtype_t *internal_dtype; } 
vsi_nn_bidirectional_sequence_lstm_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_rnn.h b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_rnn.h index 1c59ee3..2bf8c77 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_rnn.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_rnn.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" #include "vsi_nn_op_rnn.h" +#ifdef __cplusplus +extern "C" { +#endif + /* enum for inputs/outputs */ enum { @@ -62,5 +66,8 @@ typedef struct _vsi_nn_bidirectional_sequence_rnn_param vsi_nn_dtype_t* internal_dtype; } vsi_nn_bidirectional_sequence_rnn_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_box_with_nms_limit.h b/src/tim/vx/internal/include/ops/vsi_nn_op_box_with_nms_limit.h index b4af7e4..505ae8e 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_box_with_nms_limit.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_box_with_nms_limit.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_box_with_nms_limit_param { float score_threshold; @@ -36,5 +40,8 @@ typedef struct _vsi_nn_box_with_nms_limit_param float nms_score_threshold; } vsi_nn_box_with_nms_limit_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_cast.h b/src/tim/vx/internal/include/ops/vsi_nn_op_cast.h index 86fa568..fd9d3d0 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_cast.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_cast.h @@ -27,11 +27,18 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_cast_param { // Add parameters here int32_t nothing; } vsi_nn_cast_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm.h index e0eac95..919413e 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { CONV2D_LSTM_IN_INPUT = 0, @@ -73,4 +77,8 @@ typedef struct _vsi_nn_conv2d_lstm_param vsi_nn_conv2d_param conv2d; } vsi_nn_conv2d_lstm_param; +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm_cell.h b/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm_cell.h index bd306ad..9b83aad 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm_cell.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm_cell.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define CONV2D_LSTM_CELL_GATE_NUM 4 // i,f,c,o enum @@ -73,4 +77,8 @@ typedef struct _vsi_nn_conv2d_lstm_cell_param vsi_nn_conv2d_param conv2d; } vsi_nn_conv2d_lstm_cell_param; +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_conv3d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_conv3d.h new file mode 100644 index 0000000..bf8bf2b --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_conv3d.h @@ -0,0 +1,58 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files 
(the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_CONV3D_H +#define _VSI_NN_OP_CONV3D_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_conv3d_param +{ + struct _conv3d_local_data_t* local; + // Add parameters here + /*w, h, d*/ + int32_t ksize[3]; + int32_t stride[3]; + int32_t dilation[3]; + + /* Pad left, right, top, bottom, front, rear*/ + int32_t pad[6]; + /* Pad type default value shall be AUTO */ + vsi_nn_pad_e pad_type; + int32_t weights; + + int32_t multiplier; +} vsi_nn_conv3d_param; +_compiler_assert(offsetof(vsi_nn_conv3d_param, local) == 0, \ + vsi_nn_conv3d_h ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_detection_postprocess.h b/src/tim/vx/internal/include/ops/vsi_nn_op_detection_postprocess.h index 90fa87e..c37e50f 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_detection_postprocess.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_detection_postprocess.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_detection_postprocess_param { float dy; @@ -41,5 +45,8 @@ typedef struct _vsi_nn_detection_postprocess_param int32_t is_bg_in_label; } vsi_nn_detection_postprocess_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_exp.h b/src/tim/vx/internal/include/ops/vsi_nn_op_exp.h index be7de22..4b5c16e 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_exp.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_exp.h @@ -26,6 +26,9 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif #define _VSI_NN_EXP_LOCAL_TENSOR_NUM 2 @@ -42,5 +45,8 @@ typedef struct _vsi_nn_exp_param vsi_nn_exp_lcl_data local; } vsi_nn_exp_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_extra_ending.h b/src/tim/vx/internal/include/ops/vsi_nn_op_extra_ending.h index 4eff2d0..38e132d 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_extra_ending.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_extra_ending.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define _VSI_NN_EXTRA_ENDING_LOCAL_TENSOR_NUM 3 typedef struct _vsi_nn_extra_ending_lcl_data @@ -44,5 +48,8 @@ typedef struct _vsi_nn_extra_ending_param int length; } vsi_nn_extra_ending_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_floor.h b/src/tim/vx/internal/include/ops/vsi_nn_op_floor.h index 
4066939..ae70b9c 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_floor.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_floor.h @@ -26,10 +26,17 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_floor_param { vsi_enum type; } vsi_nn_floor_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_gelu.h b/src/tim/vx/internal/include/ops/vsi_nn_op_gelu.h index 5cb011c..dad8b37 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_gelu.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_gelu.h @@ -27,11 +27,17 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_gelu_param { vsi_bool approximate; } vsi_nn_gelu_param; - +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_generate_proposals.h b/src/tim/vx/internal/include/ops/vsi_nn_op_generate_proposals.h index 1d5a365..cbe786b 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_generate_proposals.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_generate_proposals.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_generate_proposals_param { float height_stride; @@ -37,5 +41,8 @@ typedef struct _vsi_nn_generate_proposals_param int32_t type; } vsi_nn_generate_proposals_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h index f9470ee..fa571e9 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h @@ -27,11 +27,14 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _grouped_conv1d_local_data_t { vsi_nn_tensor_t* input; vsi_nn_tensor_t* weight; vsi_nn_tensor_t* output; - } grouped_conv1d_local_data_t; typedef struct _vsi_nn_grouped_conv1d_param @@ -50,6 +53,8 @@ typedef struct _vsi_nn_grouped_conv1d_param int32_t multiplier; } vsi_nn_grouped_conv1d_param; - +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv2d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv2d.h index 721ebbc..59858c0 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv2d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv2d.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_grouped_conv2d_param { uint32_t ksize[2]; @@ -41,5 +45,8 @@ typedef struct _vsi_nn_grouped_conv2d_param void* local; } vsi_nn_grouped_conv2d_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_gru.h b/src/tim/vx/internal/include/ops/vsi_nn_op_gru.h index fc2c24d..4985192 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_gru.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_gru.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + /* Define the inputs and outputs for GRU Layer */ enum { @@ -74,5 +78,8 @@ typedef struct _vsi_nn_gru_param _compiler_assert(offsetof(vsi_nn_gru_param, local) == 0, \ vsi_nn_gru_h ); +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_gru_ovxlib.h b/src/tim/vx/internal/include/ops/vsi_nn_op_gru_ovxlib.h index b4da1fc..19e4172 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_gru_ovxlib.h +++ 
b/src/tim/vx/internal/include/ops/vsi_nn_op_gru_ovxlib.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" #include "vsi_nn_op_grucell_ovxlib.h" +#ifdef __cplusplus +extern "C" { +#endif + /* enum for inputs/outputs */ enum { @@ -74,5 +78,8 @@ typedef struct _vsi_nn_gru_ovxlib_param uint32_t cudnn_implementation_version; } vsi_nn_gru_ovxlib_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell.h index 8407bda..da0c08e 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { GRUCELL_GATES_Z = 0, @@ -81,4 +85,8 @@ typedef struct _vsi_nn_grucell_param _compiler_assert(offsetof(vsi_nn_grucell_param, local) == 0, \ vsi_nn_conv1d_h ); +#ifdef __cplusplus +} +#endif + #endif \ No newline at end of file diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation.h index 67a25e5..5eef114 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation.h @@ -26,11 +26,18 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { - GRUCELL_ACT_IN_H_STATE = 0, - GRUCELL_ACT_IN_INPUT_FC_H = 1, - GRUCELL_ACT_IN_H_T = 2, - GRUCELL_ACT_IN_Z_T = 3, + GRUCELL_ACT_H_STATE = 0, + GRUCELL_ACT_I_FC_Z = 1, + GRUCELL_ACT_I_FC_R = 2, + GRUCELL_ACT_I_FC_H = 3, + GRUCELL_ACT_H_FC_Z = 4, + GRUCELL_ACT_H_FC_R = 5, + GRUCELL_ACT_H_FC_H = 6, GRUCELL_ACT_IN_CNT, @@ -45,8 +52,13 @@ typedef struct _vsi_nn_grucell_activation_param struct _vsi_nn_grucell_activation_local * local; vsi_nn_activation_e activation; + vsi_nn_activation_e recurrent_activation; } vsi_nn_grucell_activation_param; _compiler_assert(offsetof(vsi_nn_grucell_activation_param, local) == 0, \ vsi_nn_grucell_activation_h ); +#ifdef __cplusplus +} +#endif + #endif \ No newline at end of file diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal.h index fe11a36..7b73d5a 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { GRUCELL_ACTIVATION_INPUT_ZT_ = 0, GRUCELL_ACTIVATION_INPUT_HT__ = 1, @@ -83,5 +87,8 @@ typedef struct _vsi_nn_grucell_activation_internal_param grucell_activation_input_layout_e input_layout; } vsi_nn_grucell_activation_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal_sma.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal_sma.h index 51d76a4..555c81d 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal_sma.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal_sma.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { GRUCELL_ACTIVATION_SMA_INPUT_H_STATE = 0, GRUCELL_ACTIVATION_SMA_INPUT_H_T_ = 1, @@ -47,5 +51,8 @@ typedef struct _vsi_nn_grucell_activation_internal_sma_param vsi_nn_grucell_activation_internal_sma_local* local; } vsi_nn_grucell_activation_internal_sma_param; +#ifdef __cplusplus +} 
#endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_z_h.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_z_h.h new file mode 100644 index 0000000..70dc295 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_z_h.h @@ -0,0 +1,63 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_GRUCELL_ACTIVATION_Z_H_H +#define _VSI_NN_OP_GRUCELL_ACTIVATION_Z_H_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + GRUCELL_ACT_Z_H_HSTATE = 0, + GRUCELL_ACT_Z_H_I_FC_Z = 1, + GRUCELL_ACT_Z_H_I_FC_H = 2, + GRUCELL_ACT_Z_H_H_FC_Z = 3, + GRUCELL_ACT_Z_H_H_FC_H = 4, + + GRUCELL_ACT_Z_H_IN_CNT, + + GRUCELL_ACT_Z_H_OUT_OUTPUT = 0, + GRUCELL_ACT_Z_H_OUT_HSTATE = 1, + + GRUCELL_ACT_Z_H_OUT_CNT +}; + +typedef struct _vsi_nn_grucell_activation_z_h_param +{ + struct _grucell_activation_z_h_local_data_t* local; + // Add parameters here + vsi_nn_activation_e activation; + vsi_nn_activation_e recurrent_activation; +} vsi_nn_grucell_activation_z_h_param; +_compiler_assert(offsetof(vsi_nn_grucell_activation_z_h_param, local) == 0, \ + vsi_nn_grucell_activation_z_h_h ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_h_times_activation_r.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_h_times_activation_r.h new file mode 100644 index 0000000..84695f2 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_h_times_activation_r.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_GRUCELL_H_TIMES_ACTIVATION_R_H +#define _VSI_NN_OP_GRUCELL_H_TIMES_ACTIVATION_R_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_grucell_h_times_activation_r_param +{ + struct _grucell_h_times_activation_r_local_data_t* local; + + vsi_nn_activation_e recurrent_activation; +} vsi_nn_grucell_h_times_activation_r_param; +_compiler_assert(offsetof(vsi_nn_grucell_h_times_activation_r_param, local) == 0, \ + vsi_nn_grucell_h_times_activation_r_h ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_ovxlib.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_ovxlib.h index 6006952..d53ee6b 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_ovxlib.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_ovxlib.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" #include "vsi_nn_op_grucell_ovxlib.h" +#ifdef __cplusplus +extern "C" { +#endif + #define GRUCELL_RZ_GATE_COUNT 2 /* enum for inputs/outputs */ @@ -103,4 +107,8 @@ typedef struct _vsi_nn_grucell_ovxlib_param _compiler_assert(offsetof(vsi_nn_grucell_ovxlib_param, local) == 0, \ vsi_nn_vsi_nn_grucell_ovxlib_h ); +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_hard_sigmoid.h b/src/tim/vx/internal/include/ops/vsi_nn_op_hard_sigmoid.h new file mode 100644 index 0000000..c16d04d --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_hard_sigmoid.h @@ -0,0 +1,46 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_HARD_SIGMOID_H +#define _VSI_NN_OP_HARD_SIGMOID_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_hard_sigmoid_param +{ + void* local; + // Add parameters here + float alpha; + float beta; +} vsi_nn_hard_sigmoid_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_heatmap_max_keypoint.h b/src/tim/vx/internal/include/ops/vsi_nn_op_heatmap_max_keypoint.h index d1bdf04..4da1c79 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_heatmap_max_keypoint.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_heatmap_max_keypoint.h @@ -26,10 +26,17 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_heatmap_max_keypoint_param { vsi_enum type; } vsi_nn_heatmap_max_keypoint_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_interp.h b/src/tim/vx/internal/include/ops/vsi_nn_op_interp.h index 5f1bfb2..5f52eb5 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_interp.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_interp.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_interp_param { struct _interp_local_data_t* local; @@ -38,7 +42,8 @@ typedef struct _vsi_nn_interp_param int32_t pad_end; //padding at end of intput } vsi_nn_interp_param; - - +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_log.h b/src/tim/vx/internal/include/ops/vsi_nn_op_log.h index 362f4da..8def574 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_log.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_log.h @@ -26,6 +26,9 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif #define VSI_NN_LOG_SH_KERNEL_IDX(_INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ VSI_NN_LOG_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, @@ -113,6 +116,8 @@ typedef struct _vsi_nn_log_param vsi_nn_log_lcl_data local; } vsi_nn_log_param; - +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_log_softmax.h b/src/tim/vx/internal/include/ops/vsi_nn_op_log_softmax.h index 26f3baf..913b5ce 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_log_softmax.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_log_softmax.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ VSI_NN_LOGSOFTMAX_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, @@ -150,5 +154,8 @@ typedef struct _vsi_nn_log_softmax_param int32_t axis; } vsi_nn_log_softmax_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lsh_projection.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lsh_projection.h index 099c645..6df03f5 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_lsh_projection.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lsh_projection.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef enum { VSI_NN_LSH_PROJECTION_SPARSE = 1, @@ -37,5 +41,8 @@ typedef struct _vsi_nn_lsh_projection_param vsi_nn_lsh_projection_type_e type; } vsi_nn_lsh_projection_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lstm_ovxlib.h 
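vsi_nn_op_hard_sigmoid.h above exposes only alpha and beta, so the sketch below uses the conventional hard-sigmoid definition, clamp(alpha * x + beta, 0, 1); the exact clamp range applied by the kernel is an assumption, not stated in the header.

/* Conventional hard sigmoid; illustration only. */
static inline float hard_sigmoid_sketch(float x, float alpha, float beta)
{
    float y = alpha * x + beta;
    if (y < 0.0f) y = 0.0f;
    if (y > 1.0f) y = 1.0f;
    return y;
}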
b/src/tim/vx/internal/include/ops/vsi_nn_op_lstm_ovxlib.h index cf0ed9f..29c8cd1 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_lstm_ovxlib.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lstm_ovxlib.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" #include "vsi_nn_op_lstmunit.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { LSTM_INPUT_INPUT = 0, @@ -100,5 +104,8 @@ typedef struct _vsi_nn_lstm_ovxlib_param uint32_t weights; /* compatible with LSTM, NOT used */ } vsi_nn_lstm_ovxlib_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_activation.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_activation.h index 08a9254..fa1389b 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_activation.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_activation.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" #include "vsi_nn_op_lstmunit.h" +#ifdef __cplusplus +extern "C" { +#endif + /* c -> cifg, l -> layer norm, p -> projection, h -> peephole, b -> hybrid bias fp32, s -> standard*/ enum { @@ -96,5 +100,8 @@ typedef struct _vsi_nn_lstmunit_activation_param vsi_nn_activation_e recurrent_activation; } vsi_nn_lstmunit_activation_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_ovxlib.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_ovxlib.h index eaac01d..cc53d4c 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_ovxlib.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_ovxlib.h @@ -28,6 +28,10 @@ #include "vsi_nn_types.h" #include "vsi_nn_op_lstmunit.h" +#ifdef __cplusplus +extern "C" { +#endif + #define LSTMUNIT_IFCO_GATE_COUNT 4 /* enum for inputs/outputs */ @@ -274,4 +278,8 @@ typedef struct _vsi_nn_lstmunit_ovxlib_param vsi_nn_dtype_t *internal_dtype_aux; } vsi_nn_lstmunit_ovxlib_param; +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_neg.h b/src/tim/vx/internal/include/ops/vsi_nn_op_neg.h index ea85174..5ede2ad 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_neg.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_neg.h @@ -26,6 +26,9 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif #define _VSI_NN_ELU_LOCAL_TENSOR_NUM 2 @@ -34,5 +37,8 @@ typedef struct _vsi_nn_neg_param vx_tensor local_tensor[_VSI_NN_ELU_LOCAL_TENSOR_NUM]; } vsi_nn_neg_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_nms.h b/src/tim/vx/internal/include/ops/vsi_nn_op_nms.h index 174bb10..9f4b18c 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_nms.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_nms.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_nms_param { int32_t max_output_size; @@ -35,4 +39,8 @@ typedef struct _vsi_nn_nms_param float soft_nms_sigma; } vsi_nn_nms_param; +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_one_hot.h b/src/tim/vx/internal/include/ops/vsi_nn_op_one_hot.h index 5cad574..28f3c64 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_one_hot.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_one_hot.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_one_hot_param { struct _one_hot_local_data_t* local; @@ -39,4 +43,8 @@ typedef struct _vsi_nn_one_hot_param _compiler_assert(offsetof(vsi_nn_one_hot_param, local) == 0, \ 
vsi_nn_one_hot_h ); +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_post_process.h b/src/tim/vx/internal/include/ops/vsi_nn_op_post_process.h index 3ffe93f..160bc06 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_post_process.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_post_process.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { POST_PROCESS_INPUT = 0, @@ -53,5 +57,8 @@ typedef struct _vsi_nn_post_process_param vsi_nn_post_process_lcl_data local; } vsi_nn_post_process_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h index 035320a..3f61413 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" #include "vsi_nn_pre_post_process.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef vsi_nn_preprocess_source_format_e vsi_nn_pre_process_type_e; enum @@ -80,5 +84,9 @@ typedef struct _vsi_nn_pre_process_param vsi_nn_pre_process_lcl_data *local; } vsi_nn_pre_process_param; + +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h index 6b7add6..d01fba8 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_pre_process_bgra_lcl_data { int32_t scale_x; @@ -65,5 +69,8 @@ typedef struct _vsi_nn_pre_process_bgra_param vsi_nn_pre_process_bgra_lcl_data local; } vsi_nn_pre_process_bgra_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_gray.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_gray.h index 459e25d..604184f 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_gray.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_gray.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { PRE_PROCESS_GRAY_INPUT = 0, @@ -67,5 +71,8 @@ typedef struct _vsi_nn_pre_process_gray_param vsi_nn_pre_process_gray_lcl_data local; } vsi_nn_pre_process_gray_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h index 63e9335..da52fa0 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h @@ -26,6 +26,9 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif enum { @@ -77,5 +80,8 @@ typedef struct _vsi_nn_pre_process_rgb_param vsi_nn_pre_process_rgb_lcl_data local; } vsi_nn_pre_process_rgb_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_tensor.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_tensor.h index b70094f..efe64e4 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_tensor.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_tensor.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { PRE_PROCESS_TENSOR_INPUT = 0, @@ -53,5 +57,8 @@ typedef struct 
_vsi_nn_pre_process_tensor_param vsi_nn_pre_process_tensor_lcl_data local; } vsi_nn_pre_process_tensor_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_quantized_16bit_lstm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_quantized_16bit_lstm.h index ec127d2..8e178d6 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_quantized_16bit_lstm.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_quantized_16bit_lstm.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { Q16_LSTM_INPUT_INPUT = 0, @@ -60,5 +64,8 @@ typedef struct _vsi_nn_quantized_16bit_lstm_param void* local; } vsi_nn_quantized_16bit_lstm_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_random_multinomial.h b/src/tim/vx/internal/include/ops/vsi_nn_op_random_multinomial.h index 34b7769..cd862a6 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_random_multinomial.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_random_multinomial.h @@ -26,10 +26,17 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_random_multinomial_param { int32_t sample_num; } vsi_nn_random_multinomial_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reduceall_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reduceall_internal.h index 24cca15..0df389e 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_reduceall_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reduceall_internal.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define VSI_NN_REDUCEALL_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ VSI_NN_REDUCEALL_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, @@ -68,5 +72,8 @@ typedef struct _vsi_nn_reduceall_internal_param vx_bool keep_dim; } vsi_nn_reduceall_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reduceany_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reduceany_internal.h index a316c82..babdb69 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_reduceany_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reduceany_internal.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define VSI_NN_REDUCEANY_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ VSI_NN_REDUCEANY_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, @@ -68,5 +72,8 @@ typedef struct _vsi_nn_reduceany_internal_param vx_bool keep_dim; } vsi_nn_reduceany_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reducemax_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reducemax_internal.h index 9219983..b2ff2cb 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_reducemax_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reducemax_internal.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define VSI_NN_REDUCEMAX_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ VSI_NN_REDUCEMAX_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, @@ -113,5 +117,8 @@ typedef struct _vsi_nn_reducemax_internal_param vx_bool keep_dim; } vsi_nn_reducemax_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git 
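The *_SH_KERNEL_IDX macros in these reduce headers build enumerator names by token pasting. With illustrative arguments (the concrete axis/type/dims spellings are supplied elsewhere, not in these headers), the expansion looks like this:

/* Illustrative expansion only; I8 and IMAGE_2D are placeholder spellings. */
VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, I8, I8, IMAGE_2D)
/* expands to the single enumerator: */
VSI_NN_REDUCEMAX_AXIS0_I8TOI8_IMAGE_2D_KERNEL,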
a/src/tim/vx/internal/include/ops/vsi_nn_op_reducemin_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reducemin_internal.h index ee32dd1..5f4ae52 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_reducemin_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reducemin_internal.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define VSI_NN_REDUCEMIN_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ VSI_NN_REDUCEMIN_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, @@ -113,5 +117,8 @@ typedef struct _vsi_nn_reducemin_internal_param vx_bool keep_dim; } vsi_nn_reducemin_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reduceprod_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reduceprod_internal.h index b2c830d..2a7f8a7 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_reduceprod_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reduceprod_internal.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define VSI_NN_REDUCEPROD_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ VSI_NN_REDUCEPROD_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, @@ -118,5 +122,8 @@ typedef struct _vsi_nn_reduceprod_internal_param vx_bool keep_dim; } vsi_nn_reduceprod_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reducesum_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reducesum_internal.h index 69ca355..337df79 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_reducesum_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reducesum_internal.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_reducesum_lcl_data_t { vsi_nn_tensor_t *reshaped_input; @@ -40,5 +44,8 @@ typedef struct _vsi_nn_reducesum_internal_param vsi_nn_reducesum_lcl_data_t* local; } vsi_nn_reducesum_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras.h b/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras.h index fdc582d..02fb3ba 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_relu_keras_param { float alpha; @@ -33,5 +37,8 @@ typedef struct _vsi_nn_relu_keras_param float threshold; } vsi_nn_relu_keras_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras_internal.h index 4e30fb9..b7bccda 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras_internal.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define _VSI_NN_RELU_KERAS_INTERNAL_LOCAL_TENSOR_NUM 2 typedef struct _vsi_nn_relu_keras_internal_lcl_data @@ -44,5 +48,8 @@ typedef struct _vsi_nn_relu_keras_internal_param float threshold; } vsi_nn_relu_keras_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reshape.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reshape.h index a41377a..1b5ca0b 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_reshape.h 
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reshape.h @@ -37,7 +37,7 @@ typedef struct _vsi_nn_reshape_lcl_data typedef struct _vsi_nn_reshape_param { - const vsi_size_t * size; + const uint32_t * size; uint32_t dim_num; /* reshape layer local data structure */ diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reshape2.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reshape2.h new file mode 100644 index 0000000..863a0ff --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reshape2.h @@ -0,0 +1,53 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_RESHAPE2_H +#define _VSI_NN_OP_RESHAPE2_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_reshape2_local_data +{ + vsi_bool initialized; +} vsi_nn_reshape2_local_data; + +typedef struct _vsi_nn_reshape2_param +{ + vsi_nn_reshape2_local_data* local; + // Add parameters here + const vsi_size_t * size; + uint32_t dim_num; +} vsi_nn_reshape2_param; +_compiler_assert(offsetof(vsi_nn_reshape2_param, local) == 0, \ + vsi_nn_reshape2_h ); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize.h index 50270a1..aaa72c6 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_resize.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize.h @@ -41,10 +41,15 @@ typedef uint32_t vsi_nn_interpolation_type_t; enum VSI_NN_INTERPOLATION_AREA }; -typedef struct _vsi_nn_resize_lcl_data +typedef uint32_t vsi_nn_resize_layout_type_t; enum { - vx_tensor local_tensor[_VSI_NN_RESIZE_LOCAL_TENSOR_NUM]; -} vsi_nn_resize_lcl_data; + VSI_NN_RESIZE_LAYOUT_NCHW = 0, + VSI_NN_RESIZE_LAYOUT_NHWC +}; + +typedef struct _vsi_nn_resize_local_data { + vsi_bool use_internal_node; +} vsi_nn_resize_local_data; typedef struct _vsi_nn_resize_param { @@ -53,9 +58,16 @@ typedef struct _vsi_nn_resize_param int32_t size[2]; /* resize layer local data structure */ - vsi_nn_resize_lcl_data local; + union + { + vsi_nn_resize_local_data *lcl_data; + struct { + vx_tensor local_tensor[_VSI_NN_RESIZE_LOCAL_TENSOR_NUM]; + } reserved; + }; vsi_bool align_corners; vsi_bool half_pixel_centers; + vsi_enum layout; } vsi_nn_resize_param; #ifdef __cplusplus diff --git 
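In the vsi_nn_op_resize.h hunk above, the old embedded local_tensor array is kept in a reserved arm of a union beside the new lcl_data pointer, which reads as a way to change the local-data representation without disturbing the struct layout for existing users; that intent is an inference, not stated in the header. A minimal usage sketch of the updated fields, assuming the usual TRUE/FALSE macros, zero initialization, and <string.h>:

static void resize_param_usage_sketch(void)
{
    vsi_nn_resize_param p;
    memset(&p, 0, sizeof(p));                /* zero init is an assumption */
    p.size[0] = 224;
    p.size[1] = 224;
    p.align_corners = FALSE;
    p.half_pixel_centers = TRUE;
    p.layout = VSI_NN_RESIZE_LAYOUT_NHWC;    /* field and enum added by this patch */
    (void)p;
}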
a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d.h index e85aa74..d996d04 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_resize_1d_param { struct _resize_1d_local_data_t* local; @@ -40,5 +44,8 @@ typedef struct _vsi_nn_resize_1d_param _compiler_assert(offsetof(vsi_nn_resize_1d_param, local) == 0, \ vsi_nn_resize_1d_h ); +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_bilinear_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_bilinear_internal.h index 4e119c8..6948db3 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_bilinear_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_bilinear_internal.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_resize_1d_bilinear_internal_param { struct _resize_1d_bilinear_internal_local_data_t* local; @@ -38,5 +42,8 @@ typedef struct _vsi_nn_resize_1d_bilinear_internal_param _compiler_assert(offsetof(vsi_nn_resize_1d_bilinear_internal_param, local) == 0, \ vsi_nn_resize_1d_bilinear_internal_h ); +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_nearest_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_nearest_internal.h index cc94051..a18af71 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_nearest_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_nearest_internal.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_resize_1d_nearest_internal_param { struct _resize_1d_nearest_internal_local_data_t* local; @@ -38,5 +42,8 @@ typedef struct _vsi_nn_resize_1d_nearest_internal_param _compiler_assert(offsetof(vsi_nn_resize_1d_nearest_internal_param, local) == 0, \ vsi_nn_resize_1d_nearest_internal_h ); +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_internal.h index 578d943..6adc896 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_internal.h @@ -26,6 +26,9 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif typedef struct _vsi_nn_resize_in_lcl_data { @@ -38,8 +41,12 @@ typedef struct _vsi_nn_resize_internal_param vsi_nn_resize_in_lcl_data *lcl_data_ptr; vsi_bool align_corners; vsi_bool half_pixel_centers; - float factor; + float factor; + vsi_enum layout; } vsi_nn_resize_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_nearest_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_nearest_internal.h index b700334..3f29b1c 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_nearest_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_nearest_internal.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_resize_nearest_in_lcl_data { uint32_t hash_idx; @@ -40,6 +44,8 @@ typedef struct _vsi_nn_resize_nearest_internal_param float factor; } vsi_nn_resize_nearest_internal_param; - +#ifdef __cplusplus +} #endif +#endif diff --git 
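The resize-internal parameter blocks above carry align_corners and half_pixel_centers flags. The sketch below shows the source-coordinate mappings these flags conventionally select; this is general resize math, not a transcription of the ovxlib kernels.

/* Conventional destination-to-source coordinate mapping; illustration only. */
static inline float resize_src_coord_sketch
    (
    int dst, int in_size, int out_size,
    int align_corners, int half_pixel_centers
    )
{
    if (align_corners && out_size > 1)
        return (float)dst * (float)(in_size - 1) / (float)(out_size - 1);
    if (half_pixel_centers)
        return ((float)dst + 0.5f) * (float)in_size / (float)out_size - 0.5f;
    return (float)dst * (float)in_size / (float)out_size;
}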
a/src/tim/vx/internal/include/ops/vsi_nn_op_rnncell_ovxlib.h b/src/tim/vx/internal/include/ops/vsi_nn_op_rnncell_ovxlib.h index 1f48989..b1da46d 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_rnncell_ovxlib.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_rnncell_ovxlib.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" #include "vsi_nn_op_rnn.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_rnncell_ovxlib_lcl_data_t { vsi_bool multi_batch; @@ -40,5 +44,8 @@ typedef struct _vsi_nn_rnncell_ovxlib_param vsi_nn_dtype_t* internal_dtype; } vsi_nn_rnncell_ovxlib_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_roi_align.h b/src/tim/vx/internal/include/ops/vsi_nn_op_roi_align.h index e61a33b..e24f043 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_roi_align.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_roi_align.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_roi_align_param { int32_t output_height; @@ -36,5 +40,8 @@ typedef struct _vsi_nn_roi_align_param int32_t width_sample_num; } vsi_nn_roi_align_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_sin.h b/src/tim/vx/internal/include/ops/vsi_nn_op_sin.h index 719e520..3fa6d91 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_sin.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_sin.h @@ -26,6 +26,9 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif #define _VSI_NN_SIN_LOCAL_TENSOR_NUM 2 @@ -42,5 +45,8 @@ typedef struct _vsi_nn_sin_param vsi_nn_sin_lcl_data local; } vsi_nn_sin_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_softmax_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_softmax_internal.h index 6e12636..1accf6d 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_softmax_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_softmax_internal.h @@ -28,6 +28,10 @@ #include "vsi_nn_platform.h" #include "utils/vsi_nn_link_list.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_softmax_internal_lcl_data { vsi_nn_link_list_t link_list; @@ -40,7 +44,11 @@ typedef struct _vsi_nn_softmax_internal_param { vsi_nn_softmax_internal_lcl_data *data; float beta; + int32_t axis; } vsi_nn_softmax_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_squeeze.h b/src/tim/vx/internal/include/ops/vsi_nn_op_squeeze.h index 249ce2a..f28bfb4 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_squeeze.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_squeeze.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_squeeze_param { // Add parameters here @@ -34,5 +38,8 @@ typedef struct _vsi_nn_squeeze_param vx_uint32 axis_num; } vsi_nn_squeeze_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_stack.h b/src/tim/vx/internal/include/ops/vsi_nn_op_stack.h index c75702a..cfd6d2b 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_stack.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_stack.h @@ -25,6 +25,11 @@ #define _VSI_NN_OP_STACK_H #include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + #define VSI_NN_STACK_MAX_INPUTS (16) typedef struct _vsi_nn_stack_lcl_data @@ -63,5 +68,8 @@ typedef struct _vsi_nn_stack_param uint32_t axis; } vsi_nn_stack_param; +#ifdef __cplusplus +} 
#endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_tensor_add_mean_stddev_norm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_tensor_add_mean_stddev_norm.h index 3ce2d49..b60faff 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_tensor_add_mean_stddev_norm.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_tensor_add_mean_stddev_norm.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define _VSI_NN_TENSORADD_MEANSTDNORM_LOCAL_TENSOR_NUM 3 typedef struct _vsi_nn_tensoradd_meanstdnorm_lcl_data @@ -39,5 +43,8 @@ typedef struct _vsi_nn_tensor_add_mean_stddev_norm_param float eps; } vsi_nn_tensor_add_mean_stddev_norm_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_tile.h b/src/tim/vx/internal/include/ops/vsi_nn_op_tile.h index 258d696..c885d3c 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_tile.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_tile.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define _VSI_NN_TILE_LOCAL_TENSOR_NUM 2 typedef struct _vsi_nn_tile_lcl_data_t @@ -43,5 +47,8 @@ typedef struct _vsi_nn_tile_param uint32_t multiples_num; } vsi_nn_tile_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h b/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h index 11fc2c4..7ab6ff2 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h @@ -26,10 +26,17 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_topk_param { uint32_t k; } vsi_nn_topk_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_unidirectional_sequence_rnn.h b/src/tim/vx/internal/include/ops/vsi_nn_op_unidirectional_sequence_rnn.h index a7281f9..985fe22 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_unidirectional_sequence_rnn.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_unidirectional_sequence_rnn.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" #include "vsi_nn_op_rnn.h" +#ifdef __cplusplus +extern "C" { +#endif + /* enum for inputs/outputs */ enum { @@ -48,5 +52,8 @@ typedef struct _vsi_nn_unidirectional_sequence_rnn_param vsi_nn_dtype_t internal_dtype[RNNCELL_QUANTIZE_PARAM_COUNT]; } vsi_nn_unidirectional_sequence_rnn_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_unstack.h b/src/tim/vx/internal/include/ops/vsi_nn_op_unstack.h index 1ee8220..14360a6 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_unstack.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_unstack.h @@ -25,6 +25,11 @@ #define _VSI_NN_OP_UNSTACK_H #include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + #define VSI_NN_UNSTACK_MAX_OUTPUTS (16) typedef struct _vsi_nn_unstack_lcl_data @@ -39,5 +44,8 @@ typedef struct _vsi_nn_unstack_param uint32_t axis; } vsi_nn_unstack_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_upsamplescale.h b/src/tim/vx/internal/include/ops/vsi_nn_op_upsamplescale.h index f790da2..70f6eae 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_upsamplescale.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_upsamplescale.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_upsamplescale_param { struct _upsamplescale_local_data_t* local; @@ -35,5 +39,8 @@ typedef struct 
_vsi_nn_upsamplescale_param float scale; } vsi_nn_upsamplescale_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h b/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h index a491adc..d7e5983 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h @@ -50,7 +50,9 @@ enum { D_F32 = VSI_NN_TYPE_FLOAT32, D_F64 = VSI_NN_TYPE_FLOAT64, D_BF16 = VSI_NN_TYPE_BFLOAT16, - D_BOOL8 = VSI_NN_TYPE_BOOL8 + D_BOOL8 = VSI_NN_TYPE_BOOL8, + D_I4 = VSI_NN_TYPE_INT4, + D_U4 = VSI_NN_TYPE_UINT4 }; /* short alias for qtype */ diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util.h index 973f2ac..f575892 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util.h @@ -72,6 +72,16 @@ OVXLIB_API uint32_t vsi_nn_TypeGetBytes const vsi_nn_type_e type ); +OVXLIB_API uint32_t vsi_nn_TypeGetBytesExt + ( + const vsi_nn_type_e type + ); + +OVXLIB_API uint32_t vsi_nn_TypeGetBits + ( + const vsi_nn_type_e type + ); + OVXLIB_API uint16_t vsi_nn_Fp32ToFp16 ( float in diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h index f017fe3..4586fa8 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h @@ -41,10 +41,12 @@ static inline vsi_bool type_is_integer ret = FALSE; switch( type ) { + case VSI_NN_TYPE_INT4: case VSI_NN_TYPE_INT8: case VSI_NN_TYPE_INT16: case VSI_NN_TYPE_INT32: case VSI_NN_TYPE_INT64: + case VSI_NN_TYPE_UINT4: case VSI_NN_TYPE_UINT8: case VSI_NN_TYPE_UINT16: case VSI_NN_TYPE_UINT32: @@ -67,6 +69,7 @@ static inline vsi_bool type_is_signed ret = FALSE; switch( type ) { + case VSI_NN_TYPE_INT4: case VSI_NN_TYPE_INT8: case VSI_NN_TYPE_INT16: case VSI_NN_TYPE_INT32: @@ -112,6 +115,38 @@ static inline uint32_t type_get_bytes } } /* type_get_bytes() */ +static inline uint32_t type_get_bits + ( + const vsi_nn_type_e type + ) +{ + switch( type ) + { + case VSI_NN_TYPE_INT4: + case VSI_NN_TYPE_UINT4: + return 4; + case VSI_NN_TYPE_INT8: + case VSI_NN_TYPE_UINT8: + case VSI_NN_TYPE_BOOL8: + return 8; + case VSI_NN_TYPE_INT16: + case VSI_NN_TYPE_UINT16: + case VSI_NN_TYPE_FLOAT16: + case VSI_NN_TYPE_BFLOAT16: + return 16; + case VSI_NN_TYPE_INT32: + case VSI_NN_TYPE_UINT32: + case VSI_NN_TYPE_FLOAT32: + return 32; + case VSI_NN_TYPE_INT64: + case VSI_NN_TYPE_UINT64: + case VSI_NN_TYPE_FLOAT64: + return 64; + default: + return 0; + } +} /* type_get_bits() */ + static inline void type_get_range ( vsi_nn_type_e type, @@ -123,8 +158,8 @@ static inline void type_get_range double from, to; from = 0.0; to = 0.0; - bits = type_get_bytes( type ) * 8; - if( type_is_integer( type ) ) + bits = type_get_bits( type ); + if( type_is_integer( type ) || bits > 0) { if( type_is_signed( type ) ) { @@ -240,6 +275,14 @@ static inline vsi_status integer_convert uint32_t src_sz = type_get_bytes( src_type ); uint32_t dest_sz = type_get_bytes( dest_type ); uint8_t* buffer = all_zeros; + if( src_sz == 0 ) + { + src_sz = 1; + } + if( dest_sz == 0) + { + dest_sz = 1; + } if( type_is_signed( src_type ) && (((int8_t *)src)[src_sz - 1] & 0x80) ) { buffer = all_ones; @@ -384,6 +427,8 @@ static inline vsi_status dtype_to_float32 case VSI_NN_TYPE_BFLOAT16: *dst = bfp16_to_fp32( *(int16_t *)src ); break; + case 
VSI_NN_TYPE_INT4: + case VSI_NN_TYPE_UINT4: case VSI_NN_TYPE_INT8: case VSI_NN_TYPE_BOOL8: case VSI_NN_TYPE_UINT8: @@ -397,6 +442,7 @@ static inline vsi_status dtype_to_float32 case VSI_NN_QNT_TYPE_DFP: *dst = dfp_to_fp32( src_value, src_dtype->fl, src_dtype->vx_type ); break; + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: *dst = affine_to_fp32( src_value, src_dtype->scale, src_dtype->zero_point, src_dtype->vx_type ); @@ -433,6 +479,8 @@ static inline vsi_status float32_to_dtype case VSI_NN_TYPE_BFLOAT16: *(int16_t *)dst = fp32_to_bfp16_rtne( src ); break; + case VSI_NN_TYPE_INT4: + case VSI_NN_TYPE_UINT4: case VSI_NN_TYPE_INT8: case VSI_NN_TYPE_BOOL8: case VSI_NN_TYPE_UINT8: @@ -446,6 +494,7 @@ static inline vsi_status float32_to_dtype case VSI_NN_QNT_TYPE_DFP: dst_value = fp32_to_dfp( src, dst_dtype->fl, dst_dtype->vx_type ); break; + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: dst_value = fp32_to_affine( src, dst_dtype->scale, dst_dtype->zero_point, dst_dtype->vx_type ); diff --git a/src/tim/vx/internal/include/utils/vsi_nn_util.h b/src/tim/vx/internal/include/utils/vsi_nn_util.h index 1c8e36d..7aa984e 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h @@ -50,10 +50,13 @@ extern "C" { #define vsi_safe_release_tensor(_t) if(_t){vsi_nn_ReleaseTensor(&(_t)); _t = NULL;} -#define END_OF_VARIADIC_ARGUMENTS 0xbadcaffe +#define END_OF_VARIADIC_ARGUMENTS ((size_t)0xbadcaffebadcaffe) + #define FOREACH_ARGS(_args, _next, _arg_type) \ while(((_arg_type)((size_t)END_OF_VARIADIC_ARGUMENTS)) != (_next = va_arg(_args, _arg_type))) +#define BITS_PER_BYTE 8 + /*------------------------------------------- Functions -------------------------------------------*/ @@ -242,6 +245,21 @@ OVXLIB_API const char* vsi_nn_DescribeStatus vsi_status status ); +OVXLIB_API vsi_status vsi_nn_Pack4bitData + ( + vsi_nn_tensor_t * tensor, + uint8_t * src, + uint8_t * dest + ); + +OVXLIB_API vsi_status vsi_nn_Unpack4bitData + ( + vsi_nn_tensor_t * tensor, + uint8_t * src, + uint8_t * dest, + vsi_nn_type_e type + ); + vsi_size_t vsi_nn_compute_filter_shape ( vsi_nn_pad_e padding_type, @@ -261,6 +279,16 @@ void vsi_nn_compute_padding vsi_size_t * out_pad ); +void vsi_nn_compute_padding_3d + ( + const vsi_size_t in_shape[3], + const vsi_size_t ksize[3], + const uint32_t stride[3], + const uint32_t dilation[3], + const vsi_nn_pad_e pad_type, + vsi_size_t out_pad[6] + ); + void vsi_nn_compute_padding_conv1d ( vsi_size_t * in_shape, @@ -345,6 +373,31 @@ vsi_bool vsi_nn_is_same_type vsi_nn_tensor_t * src, vsi_nn_tensor_t * dst ); + +vsi_bool vsi_nn_is_broadcast_operaton + ( + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t * output + ); + +float vsi_nn_get_tensor_scale + ( + vsi_nn_tensor_t * tensor + ); + +int32_t vsi_nn_get_tensor_zero_point + ( + vsi_nn_tensor_t * tensor + ); + +void vsi_nn_get_tensor_clamp_min_max + ( + vsi_nn_tensor_t * input, + float *clampMin, + float *clampMax + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h index cfeb25b..4374441 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -64,6 +64,8 @@ typedef struct _vsi_nn_hw_config_t uint32_t subGroupSize; #endif uint32_t use_40bits_va; + uint32_t support_stream_processor; + uint32_t sp_exec_count; } vsi_nn_hw_config_t; typedef struct _vsi_nn_runtime_option_t diff 
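For the 4-bit type handling added above (the INT4/UINT4 cases, type_get_bits(), and the vsi_nn_Pack4bitData()/vsi_nn_Unpack4bitData() declarations), only prototypes are visible in this patch, so the sketch below shows generic two-elements-per-byte packing with the earlier element in the low nibble; the nibble order is an assumption. Assumes <stdint.h> and <stddef.h>.

/* Generic 4-bit packing sketch, two source elements per destination byte. */
static void pack4bit_sketch(const uint8_t *src, uint8_t *dst, size_t count)
{
    size_t i;
    for (i = 0; i < count; i += 2)
    {
        uint8_t lo = (uint8_t)(src[i] & 0x0F);
        uint8_t hi = (uint8_t)(((i + 1 < count) ? src[i + 1] : 0) & 0x0F);
        dst[i / 2] = (uint8_t)(lo | (hi << 4));
    }
}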
--git a/src/tim/vx/internal/include/vsi_nn_feature_config.h b/src/tim/vx/internal/include/vsi_nn_feature_config.h index 8906a96..db38ecc 100644 --- a/src/tim/vx/internal/include/vsi_nn_feature_config.h +++ b/src/tim/vx/internal/include/vsi_nn_feature_config.h @@ -1,3 +1,26 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the Software), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ /*****Auto generated header file, Please DO NOT modify manually!*****/ #ifndef _VSI_NN_FEATURE_CONFIG_H #define _VSI_NN_FEATURE_CONFIG_H diff --git a/src/tim/vx/internal/include/vsi_nn_log.h b/src/tim/vx/internal/include/vsi_nn_log.h index cf9c04c..fd7d37a 100644 --- a/src/tim/vx/internal/include/vsi_nn_log.h +++ b/src/tim/vx/internal/include/vsi_nn_log.h @@ -46,7 +46,7 @@ typedef enum _vsi_nn_log_level_e #define VSI_NN_MAX_DEBUG_BUFFER_LEN 1024 #define VSILOGE( fmt, ... ) \ - vsi_nn_LogMsg(VSI_NN_LOG_ERROR, "E [%s:%d]" fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__) + vsi_nn_LogMsg(VSI_NN_LOG_ERROR, "E [%s:%s:%d]" fmt, __FILE__, __FUNCTION__, __LINE__, ##__VA_ARGS__) #define VSILOGW( fmt, ... ) \ vsi_nn_LogMsg(VSI_NN_LOG_WARN, "W [%s:%d]" fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__) #define VSILOGI( fmt, ... 
) \ diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index a6830f6..0278c4b 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -177,6 +177,11 @@ #include "ops/vsi_nn_op_gru.h" #include "ops/vsi_nn_op_grucell.h" #include "ops/vsi_nn_op_grucell_activation.h" +#include "ops/vsi_nn_op_reshape2.h" +#include "ops/vsi_nn_op_hard_sigmoid.h" +#include "ops/vsi_nn_op_conv3d.h" +#include "ops/vsi_nn_op_grucell_h_times_activation_r.h" +#include "ops/vsi_nn_op_grucell_activation_z_h.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" @@ -340,6 +345,11 @@ typedef union _vsi_nn_nn_param vsi_nn_gru_param gru; vsi_nn_grucell_param grucell; vsi_nn_grucell_activation_param grucell_activation; + vsi_nn_reshape2_param reshape2; + vsi_nn_hard_sigmoid_param hard_sigmoid; + vsi_nn_conv3d_param conv3d; + vsi_nn_grucell_h_times_activation_r_param grucell_h_times_activation_r; + vsi_nn_grucell_activation_z_h_param grucell_activation_z_h; uint8_t client_param[128]; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_platform.h b/src/tim/vx/internal/include/vsi_nn_platform.h index 6c00bd9..fc41e9f 100644 --- a/src/tim/vx/internal/include/vsi_nn_platform.h +++ b/src/tim/vx/internal/include/vsi_nn_platform.h @@ -24,6 +24,15 @@ #ifndef _VSI_NN_PLATFORM_H #define _VSI_NN_PLATFORM_H +#include "vsi_nn_feature_config.h" + +#ifdef VSI_40BIT_VA_SUPPORT +#ifdef VX_VA40_EXT_SUPPORT +#undef VX_VA40_EXT_SUPPORT +#endif +#define VX_VA40_EXT_SUPPORT 1 +#endif + #include #include #include diff --git a/src/tim/vx/internal/include/vsi_nn_post.h b/src/tim/vx/internal/include/vsi_nn_post.h new file mode 100644 index 0000000..61fe75f --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_post.h @@ -0,0 +1,30 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_POST_H +#define _VSI_NN_POST_H + +#include "post/vsi_nn_post_fasterrcnn.h" +#include "post/vsi_nn_post_cmupose.h" + +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/include/vsi_nn_tensor.h b/src/tim/vx/internal/include/vsi_nn_tensor.h index 846054f..7a33586 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor.h @@ -71,15 +71,17 @@ typedef enum typedef enum { /** none quantized */ - VSI_NN_QNT_TYPE_NONE = 0, + VSI_NN_QNT_TYPE_NONE = 0x0, /** dynamic fixed point */ - VSI_NN_QNT_TYPE_DFP = VX_QUANT_DYNAMIC_FIXED_POINT, + VSI_NN_QNT_TYPE_DFP = 0x1, /** affine asymmetric */ - VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC = VX_QUANT_AFFINE_SCALE, + VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC = 0x2, /** affine perchannel symmetric */ - VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC = 0x3,/*VX_QUANT_AFFINE_SCALE_PER_CHANNEL*/ + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC = 0x3, /** affine symmetric */ - VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC = VX_QUANT_AFFINE_SCALE, + VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC = 0x4, + /** affine perchannel asymmetric */ + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC = 0x5, /** undefined type */ VSI_NN_QNT_TYPE_NA = 0xff, } vsi_nn_qnt_type_e; @@ -148,9 +150,11 @@ typedef struct vsi_nn_tensor_attr #ifdef VX_CREATE_TENSOR_SUPPORT_PHYSICAL vsi_memory_type_e vsi_memory_type; #endif +#if VX_STREAM_PROCESSOR_SUPPORT + vsi_bool is_dummy; +#endif } vsi_nn_tensor_attr_t; - /** * Tensor structure */ @@ -202,4 +206,3 @@ typedef struct _vsi_nn_tensor_rel #endif #endif - diff --git a/src/tim/vx/internal/include/vsi_nn_tensor_util.h b/src/tim/vx/internal/include/vsi_nn_tensor_util.h index a88864d..1083d21 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor_util.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor_util.h @@ -399,6 +399,14 @@ OVXLIB_API void vsi_nn_TransposeTensor vsi_size_t * as_shape ); +vx_tensor vsi_nn_safe_reshape_tensor + ( + vx_tensor tensor, + void * num_of_dims, + vsi_size_t sizes, + vsi_size_t size_of_shape_element + ); + OVXLIB_API void vsi_nn_PermuteTensor ( vsi_nn_graph_t * graph, @@ -728,6 +736,13 @@ vsi_bool vsi_nn_ConvertTensor vsi_nn_tensor_t* output ); +vsi_nn_tensor_t * vsi_nn_dropout_tensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + float rate + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_types.h b/src/tim/vx/internal/include/vsi_nn_types.h index 6e082f8..8aa3ca9 100644 --- a/src/tim/vx/internal/include/vsi_nn_types.h +++ b/src/tim/vx/internal/include/vsi_nn_types.h @@ -37,10 +37,6 @@ extern "C"{ #define inline __inline #endif -#if VX_VA40_EXT_SUPPORT -#define VSI_40BIT_VA_SUPPORT -#endif - #if (defined(_MSC_VER) || defined(__MINGW32)) #define SIZE_T_SPECIFIER "Iu" #define SSIZE_T_SPECIFIER "Id" @@ -167,12 +163,20 @@ typedef enum #else VSI_NN_TYPE_BOOL8 = 0x011, #endif +#ifdef VX_TENSOR_STRIDE_X_BITS_SUPPORT + VSI_NN_TYPE_INT4 = VX_TYPE_INT4, + VSI_NN_TYPE_UINT4 = VX_TYPE_UINT4, +#else + VSI_NN_TYPE_INT4 = 0x012, + VSI_NN_TYPE_UINT4 = 0x013, +#endif #ifdef VSI_BFLOAT16_SUPPORT VSI_NN_TYPE_BFLOAT16 = VX_TYPE_BFLOAT16, #else VSI_NN_TYPE_BFLOAT16 = 0x81A, #endif VSI_NN_TYPE_VDATA = VX_TYPE_USER_STRUCT_START + 0x1, + }vsi_nn_type_e; typedef int32_t vsi_nn_activation_e; enum diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index db3ba86..b0acac3 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ 
b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 1 -#define VSI_NN_VERSION_PATCH 34 +#define VSI_NN_VERSION_PATCH 37 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c b/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c index 8036d0e..6a84a5e 100644 --- a/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c @@ -225,12 +225,12 @@ static vsi_nn_kernel_node_t _setup float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); float rsEps = (float)(1.0f / sqrtf(eps)); float dimRatio = (float)(1.0f / (inputs[0]->attr.size[0])); - float input0Scale = inputs[0]->attr.dtype.scale; - float input0Tail = (float)inputs[0]->attr.dtype.zero_point * input0Scale; - float input1Scale = inputs[1]->attr.dtype.scale; - float input1Tail = (float)inputs[1]->attr.dtype.zero_point * input1Scale; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 0.0f : 1.0f / outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point; + float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale; + float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; + float outputScale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); int32_t width = (int32_t)inputs[0]->attr.size[0]; status = _query_kernel( kernel, inputs, outputs ); @@ -246,7 +246,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U8 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vsi_nn_kernel_node_set_border( node, &border ); VSI_ASSERT( status == VSI_SUCCESS ); diff --git a/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c b/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c index 2223eb9..31a5223 100644 --- a/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c @@ -232,46 +232,12 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - float input_scale = 1.0f; - float input_tail = 0; - float output_scale = 1.0f; - float output_zp = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input_tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input_scale; + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; float eps = vsi_nn_kernel_param_get_float32(params, "eps"); - if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ) - { - input_scale = inputs[0]->attr.dtype.scale; - input_tail = (float)inputs[0]->attr.dtype.zero_point * inputs[0]->attr.dtype.scale; - } - else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP ) - { - if (inputs[0]->attr.dtype.fl > 0) - { - input_scale = (1.0f / ((float) ((int64_t)1 << inputs[0]->attr.dtype.fl))); - } - else - { - input_scale = ((float) 
((int64_t)1 << -inputs[0]->attr.dtype.fl)); - } - } - - if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ) - { - output_scale = 1.0f / outputs[0]->attr.dtype.scale; - output_zp = (float)outputs[0]->attr.dtype.zero_point + 0.5f; - } - else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP ) - { - if (outputs[0]->attr.dtype.fl > 0) - { - output_scale = (float) ((int64_t)1 << outputs[0]->attr.dtype.fl); - } - else - { - output_scale = ((float) 1.0f / ((int64_t)1 << -outputs[0]->attr.dtype.fl)); - } - } - if ( (inputs[1]->attr.is_const && inputs[2]->attr.is_const) || ( inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16 && inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32 ) diff --git a/src/tim/vx/internal/src/kernel/cl/clip_cl.c b/src/tim/vx/internal/src/kernel/cl/clip_cl.c index c611991..cc62fab 100644 --- a/src/tim/vx/internal/src/kernel/cl/clip_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/clip_cl.c @@ -240,10 +240,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_CLIP_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; - float outputTail = (float)outputs[0]->attr.dtype.zero_point; - float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 1.0f : inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); vsi_bool is_use_u8_kernel = FALSE; float min_value = vsi_nn_kernel_param_get_float32( params, "min_value" ); float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" ); diff --git a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c index 80f2f95..4be70d9 100644 --- a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c @@ -337,10 +337,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t operation = 0; - float input0Scale = inputs[0]->attr.dtype.scale; - float input0Tail = (float)inputs[0]->attr.dtype.zero_point * input0Scale; - float input1Scale = inputs[1]->attr.dtype.scale; - float input1Tail = (float)inputs[1]->attr.dtype.zero_point * input1Scale; + float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale; + float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) diff --git a/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c b/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c index fdf9b40..f34393e 100644 --- a/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c @@ -239,29 +239,15 @@ static vsi_nn_kernel_node_t _setup float inv_scale_h = vsi_nn_kernel_param_get_float32( params, "inv_scale_h" ); float inv_scale_w = vsi_nn_kernel_param_get_float32( params, "inv_scale_w" ); vsi_bool is_use_u8_kernel = FALSE; - float input0Scale = 1.0f; - float input0Zp = 0.0f; - float input0Tail = 0.0f; - float input1Scale = 1.0f; - float input1Zp = 0.0f; - float 
input1Tail = 0.0f; + float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0Zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input0Tail = -input0Zp * input0Scale; + float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1Zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]); + float input1Tail = -input1Zp * input1Scale; status = _query_kernel( kernel, inputs, outputs, &is_use_u8_kernel ); - if ( inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ) - { - input0Zp = (float)inputs[0]->attr.dtype.zero_point;; - input0Scale = inputs[0]->attr.dtype.scale; - input0Tail = -input0Zp * input0Scale; - } - - if ( inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ) - { - input1Zp = (float)inputs[1]->attr.dtype.zero_point;; - input1Scale = inputs[1]->attr.dtype.scale; - input1Tail = -input1Zp * input1Scale; - } - if ( VSI_SUCCESS == status ) { size_t node_params_num = _DETECT_POST_BOX_F32_PARAM_NUM; diff --git a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c index 7dde9f8..5572007 100644 --- a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c @@ -196,6 +196,7 @@ static vx_param_description_t kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define SCALAR_INPUT_SCALE (2) @@ -203,6 +204,7 @@ static vx_param_description_t kernel_param_def[] = #define SCALAR_OUTPUT_SCALE (4) #define SCALAR_OUTPUT_ZP (5) #define SCALAR_ALPHA (6) +#define SCALAR_BETA (7) #define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) /* @@ -318,11 +320,12 @@ static vsi_nn_kernel_node_t _setup vsi_size_t new_rank = 0; vsi_bool ret; - float inputScale = inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; - float outputScale = outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * inputScale; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); + float beta = vsi_nn_kernel_param_get_float32( params, "beta" ); ret = vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, @@ -363,6 +366,8 @@ static vsi_nn_kernel_node_t _setup graph, F32, &outputZP ); node_params[SCALAR_ALPHA] = vsi_nn_kernel_scalar_create( graph, F32, &alpha ); + node_params[SCALAR_BETA] = vsi_nn_kernel_scalar_create( + graph, F32, &beta ); /* Pass parameters to node. 
*/ status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); @@ -406,6 +411,11 @@ OnError: vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALPHA] ); } + if (node_params[SCALAR_BETA]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_BETA] ); + } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/erf_cl.c b/src/tim/vx/internal/src/kernel/cl/erf_cl.c index 1cd573b..d6ef8d8 100644 --- a/src/tim/vx/internal/src/kernel/cl/erf_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/erf_cl.c @@ -238,10 +238,10 @@ static vsi_nn_kernel_node_t _setup vsi_bool ret = FALSE; vsi_bool image_2d = FALSE; - float inputScale = inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; - float outputScale = outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * inputScale; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; ret = vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, diff --git a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c index 4ceb1c2..1f0ba44 100644 --- a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c @@ -187,12 +187,20 @@ static vsi_status _query_kernel if (F16 == in0_dtype) { - in0_dtype = F32; + in0_dtype = F32; + } + else if (I16 == in0_dtype) + { + in0_dtype = I32; } if (F16 == in1_dtype) { - in1_dtype = F32; + in1_dtype = F32; + } + else if (I16 == in1_dtype) + { + in1_dtype = I32; } if (F16 == out_dtype) @@ -254,12 +262,12 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_FLOORDIV_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; - float outputTail = (float)outputs[0]->attr.dtype.zero_point; - float input0Scale = inputs[0]->attr.dtype.scale == 0.0f ? 1.0f : inputs[0]->attr.dtype.scale; - float input0Tail = (float)inputs[0]->attr.dtype.zero_point; - float input1Scale = inputs[1]->attr.dtype.scale == 0.0f ? 
1.0f : inputs[1]->attr.dtype.scale; - float input1Tail = (float)inputs[1]->attr.dtype.zero_point; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]); vsi_bool is_use_u8_kernel = FALSE; outputScale = 1.0f / outputScale; diff --git a/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c index af79d59..4612e4f 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c @@ -46,6 +46,7 @@ __BEGIN_DECLS typedef enum { + _error = -1, _1D = 0, _2D, _3D @@ -142,6 +143,10 @@ static vsi_status cal_gather_nd_tensor_reshape_size sizes[0] = block_size; sizes[1] = elementCnt / block_size; } + else if(coordDim == 4) + { + newDim[0] = 3; + } status = VSI_SUCCESS; } @@ -223,7 +228,7 @@ static vsi_status _query_kernel vsi_status status = VSI_FAILURE; vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; - vsi_nn_kernel_coord_type_e coord_type = _1D; + vsi_nn_kernel_coord_type_e coord_type = _error; uint32_t key = 0; int i = 0; @@ -237,7 +242,7 @@ static vsi_status _query_kernel { coord_type = _2D; } - else if(coord_dim == 3) + else if(coord_dim == 3 || coord_dim == 4) { coord_type = _3D; } diff --git a/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c index 52110c8..49ccd23 100644 --- a/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c @@ -522,12 +522,10 @@ static vsi_nn_kernel_node_t _setup vsi_size_t width = inputs[0]->attr.size[0]; vsi_size_t height = inputs[0]->attr.size[1]; int32_t group_stride = 1; - float input_zp = 0; - float input_scale = 1.0f; - int32_t input_fl = 0; - float output_zp = 0; - float output_scale = 1.0f; - int32_t output_fl = 0; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); float rSpaceOrg = 1.0f / (width * height); float group_ratio = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] * group_size); @@ -549,44 +547,6 @@ static vsi_nn_kernel_node_t _setup height = is2D_flg > 0 ? 
1 : new_shape[1]; group_stride = (int32_t)(((width + 15) / 16) * 4); - if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - input_zp = (float)inputs[0]->attr.dtype.zero_point; - input_scale = inputs[0]->attr.dtype.scale; - } - else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - input_fl = inputs[0]->attr.dtype.fl; - if (input_fl > 0) - { - input_scale = (1.0f / ((float) ((int64_t)1 << input_fl))); - } - else - { - input_scale = ((float) ((int64_t)1 << -input_fl)); - } - input_zp = 0.0f; - } - - if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - output_zp = (float)outputs[0]->attr.dtype.zero_point; - output_scale = 1.0f / outputs[0]->attr.dtype.scale; - } - else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - output_fl = outputs[0]->attr.dtype.fl; - if (output_fl > 0) - { - output_scale = (float)((int64_t)1 << output_fl); - } - else - { - output_scale = (1.0f / (float)((int64_t)1 << -output_fl)); - } - output_zp = 0.0f; - } - for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) { ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_CL ); @@ -757,4 +717,3 @@ final: __END_DECLS REGISTER_BACKEND_CL( group_norm, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c new file mode 100644 index 0000000..e2b6964 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c @@ -0,0 +1,281 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum _grucell_nn_activation_type_e +{ + SIGMOID = VSI_NN_ACT_SIGMOID, + HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, +}grucell_nn_activation_type_e; + +#define _GRUCELL_ACTIVATION_Z_H_KERNEL_SOURCE "grucell_activation_z_h" + +// Add kernel hashtable here +#define GRUCELL_ACTIVATION_Z_H_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + (( HSTATE_DTYPE ) | ( IN_FC_DTYPE << 6 ) | ( OUT_TYPE << 12 ) | ( REC_ACT << 18 )) + +#define PACK_KERNEL_MAP( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + { GRUCELL_ACTIVATION_Z_H_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ), \ + CVIVANTE_NAMESPACE("cl.grucell_activation_z_h_"#HSTATE_DTYPE"_"#IN_FC_DTYPE"to"#OUT_TYPE"_"#REC_ACT), \ + _GRUCELL_ACTIVATION_Z_H_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _grucell_activation_z_h_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, F32, U8, SIGMOID ), + PACK_KERNEL_MAP( I32, F32, I32, SIGMOID ), + PACK_KERNEL_MAP( F32, F32, F32, SIGMOID ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _grucell_activation_z_h_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GRUCELL_ACTIVATION_Z_H_PARAM_NUM _cnt_of_array( _grucell_activation_z_h_kernel_param_def ) +#define SCALAR_INPUT_SCALE (7) +#define SCALAR_INPUT_TAIL (8) +#define SCALAR_OUTPUT_SCALE (9) +#define SCALAR_OUTPUT_ZP (10) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input = NULL; + vsi_nn_kernel_tensor_attr_t* input_attr = NULL; + + input = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_Z_H_HSTATE]; + + input_attr = vsi_nn_kernel_tensor_attr_create( input ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((input_attr->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (input_attr->shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + + if (input_attr) + { + vsi_nn_kernel_tensor_attr_release( &input_attr ); + } + + return status; +} /* _grucell_activation_z_h_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t recurrent_activation + ) +{ + 
vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e hstate_dtype; + vsi_nn_kernel_dtype_e fc_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _grucell_activation_z_h_kernel_map; + size_t kernel_map_size = _cnt_of_array( _grucell_activation_z_h_kernel_map ); + vx_param_description_t * param_def = _grucell_activation_z_h_kernel_param_def; + vx_kernel_initialize_f initializer = _grucell_activation_z_h_initializer; + + uint32_t key = 0; + uint32_t i = 0; + + hstate_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_H_STATE]->attr.dtype.vx_type ); + fc_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_I_FC_Z]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dtype.vx_type ); + + if (F16 == hstate_dtype) + { + hstate_dtype = F32; + } + else if (I8 == hstate_dtype || I16 == hstate_dtype) + { + hstate_dtype = I32; + } + + if (F16 == fc_dtype) + { + fc_dtype = F32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + else if (I8 == out_dtype || I16 == out_dtype) + { + out_dtype = I32; + } + + key = GRUCELL_ACTIVATION_Z_H_HASH_KEY( hstate_dtype, fc_dtype, out_dtype, recurrent_activation ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _grucell_activation_z_h_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_ACTIVATION_Z_H_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t activation = vsi_nn_kernel_param_get_int32( params, "activation" ); + int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); + float input_scale = vsi_nn_get_tensor_scale(inputs[GRUCELL_ACT_Z_H_HSTATE]); + float input_tail = -(float)vsi_nn_get_tensor_zero_point(inputs[GRUCELL_ACT_Z_H_HSTATE]) * input_scale; + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]); + + if( activation != VSI_NN_ACT_TANH ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, recurrent_activation ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_ACTIVATION_Z_H_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/ + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &input_scale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &input_tail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &output_scale ); + node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &output_zp ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_ACTIVATION_Z_H_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( grucell_activation_z_h, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_h_times_activation_r_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_h_times_activation_r_cl.c new file mode 100644 index 0000000..3912b95 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/grucell_h_times_activation_r_cl.c @@ -0,0 +1,259 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum _grucell_nn_activation_type_e +{ + SIGMOID = VSI_NN_ACT_SIGMOID, + HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, +} grucell_nn_activation_type_e; + +#define _GRUCELL_H_TIMES_ACTIVATION_R_KERNEL_SOURCE "grucell_h_times_activation_r" +#define _GRUCELL_H_TIMES_ACTIVATION_R_KERNEL_NAME CVIVANTE_NAMESPACE("cl.grucell_h_times_activation_r") + +// Add kernel hashtable here +#define GRUCELL_H_TIMES_ACTIVATION_R_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + (( HSTATE_DTYPE ) | ( IN_FC_DTYPE << 6 ) | ( OUT_TYPE << 12 ) | ( REC_ACT << 18 )) +#define PACK_KERNEL_MAP( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + { GRUCELL_H_TIMES_ACTIVATION_R_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ), \ + CVIVANTE_NAMESPACE("cl.grucell_h_times_activation_r_"#HSTATE_DTYPE"_"#IN_FC_DTYPE"to"#OUT_TYPE"_"#REC_ACT), \ + _GRUCELL_H_TIMES_ACTIVATION_R_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _grucell_h_times_activation_r_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, F32, F32, SIGMOID ), + PACK_KERNEL_MAP( I32, F32, F32, SIGMOID ), + PACK_KERNEL_MAP( F32, F32, F32, SIGMOID ), +}; + +/* + * Kernel params + */ +static vx_param_description_t _grucell_h_times_activation_r_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM _cnt_of_array( _grucell_h_times_activation_r_kernel_param_def ) +#define SCALAR_INPUT_SCALE (4) +#define SCALAR_INPUT_TAIL (5) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_grucell_h_times_activation_r_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t output = NULL; + vsi_nn_kernel_tensor_attr_t* output_attr; + + output = (vsi_nn_kernel_tensor_t)param[3]; + + output_attr = vsi_nn_kernel_tensor_attr_create( output ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((output_attr->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_attr->shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release( &output_attr ); + } + + return status; +} /* _grucell_h_times_activation_r_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t recurrent_activation + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e hstate_dtype; + vsi_nn_kernel_dtype_e fc_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = 
_grucell_h_times_activation_r_kernel_map; + size_t kernel_map_size = _cnt_of_array( _grucell_h_times_activation_r_kernel_map ); + vx_param_description_t * param_def = _grucell_h_times_activation_r_kernel_param_def; + vx_kernel_initialize_f initializer = _grucell_h_times_activation_r_initializer; + + uint32_t key = 0; + uint32_t i = 0; + + hstate_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + fc_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == hstate_dtype) + { + hstate_dtype = F32; + } + else if (I8 == hstate_dtype || I16 == hstate_dtype) + { + hstate_dtype = I32; + } + + if (F16 == fc_dtype) + { + fc_dtype = F32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + else if (I8 == out_dtype || I16 == out_dtype) + { + out_dtype = I32; + } + + key = GRUCELL_H_TIMES_ACTIVATION_R_HASH_KEY( hstate_dtype, fc_dtype, out_dtype, recurrent_activation ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _grucell_h_times_activation_r_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input_tail = -(float)vsi_nn_get_tensor_zero_point(inputs[0]) * input_scale; + + status = _query_kernel( kernel, inputs, outputs, recurrent_activation ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/ + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &input_scale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &input_tail ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( grucell_h_times_activation_r, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c new file mode 100644 index 0000000..a18b112 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c @@ -0,0 +1,282 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum _grucell_nn_activation_type_e +{ + SIGMOID = VSI_NN_ACT_SIGMOID, + HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, +}grucell_nn_activation_type_e; + +#define _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE "grucell_reset_after_activation" + +// Add kernel hashtable here +#define GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + (( HSTATE_DTYPE ) | ( IN_FC_DTYPE << 6 ) | ( OUT_TYPE << 12 ) | ( REC_ACT << 18 )) +#define PACK_KERNEL_MAP( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + { GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ), \ + CVIVANTE_NAMESPACE("cl.grucell_reset_after_activation_"#HSTATE_DTYPE"_"#IN_FC_DTYPE"to"#OUT_TYPE"_"#REC_ACT), \ + _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, F32, U8, SIGMOID ), + PACK_KERNEL_MAP( I32, F32, I32, SIGMOID ), + PACK_KERNEL_MAP( F32, F32, F32, SIGMOID ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _grucell_reset_after_activation_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM _cnt_of_array( _grucell_reset_after_activation_kernel_param_def ) +#define SCALAR_INPUT_SCALE (9) +#define SCALAR_INPUT_TAIL (10) +#define SCALAR_OUTPUT_SCALE (11) +#define SCALAR_OUTPUT_ZP (12) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_grucell_reset_after_activation_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input = NULL; + vsi_nn_kernel_tensor_attr_t* input_attr = NULL; + + input = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_H_STATE]; + + input_attr = vsi_nn_kernel_tensor_attr_create( input ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((input_attr->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (input_attr->shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + + if (input_attr) + { + vsi_nn_kernel_tensor_attr_release( &input_attr ); + } + + return status; +} /* _grucell_reset_after_activation_initializer() */ + +/* + * Query 
kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t recurrent_activation + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e hstate_dtype; + vsi_nn_kernel_dtype_e fc_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _grucell_reset_after_activation_kernel_map; + size_t kernel_map_size = _cnt_of_array( _grucell_reset_after_activation_kernel_map ); + vx_param_description_t * param_def = _grucell_reset_after_activation_kernel_param_def; + vx_kernel_initialize_f initializer = _grucell_reset_after_activation_initializer; + + uint32_t key = 0; + uint32_t i = 0; + + hstate_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_H_STATE]->attr.dtype.vx_type ); + fc_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_I_FC_Z]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dtype.vx_type ); + + if (F16 == hstate_dtype) + { + hstate_dtype = F32; + } + else if (I8 == hstate_dtype || I16 == hstate_dtype) + { + hstate_dtype = I32; + } + + if (F16 == fc_dtype) + { + fc_dtype = F32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + else if (I8 == out_dtype || I16 == out_dtype) + { + out_dtype = I32; + } + + key = GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( hstate_dtype, fc_dtype, out_dtype, recurrent_activation ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _grucell_reset_after_activation_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t activation = vsi_nn_kernel_param_get_int32( params, "activation" ); + int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); + float input_scale = vsi_nn_get_tensor_scale(inputs[GRUCELL_ACT_H_STATE]); + float input_tail = -(float)vsi_nn_get_tensor_zero_point(inputs[GRUCELL_ACT_H_STATE]) * input_scale; + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[GRUCELL_ACT_OUT_OUTPUT]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[GRUCELL_ACT_OUT_OUTPUT]); + + if( activation != VSI_NN_ACT_TANH ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, recurrent_activation ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/ + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &input_scale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &input_tail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &output_scale ); + node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &output_zp ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( grucell_reset_after_activation, _setup ) \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c index 8b78ced..929c812 100644 --- a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c @@ -413,53 +413,13 @@ static vsi_nn_kernel_node_t _setup size_t width = inputs[0]->attr.size[0]; size_t height = inputs[0]->attr.size[1]; int32_t group_num = (int32_t)(width + 15) / 16; - int32_t input_zp = 0; - float input_scale = 1.0f; - int32_t input_fl = 0; - int32_t output_zp = 0; - float output_scale = 1.0f; - int32_t output_fl = 0; + int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + int32_t output_zp = vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); float in_fl_scale = 1.0f, out_fl_scale = 1.0; float dim_ratio = (float)1.0 / (float)(width * height); - if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - input_zp = inputs[0]->attr.dtype.zero_point; - input_scale = inputs[0]->attr.dtype.scale; - } - else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - input_fl = inputs[0]->attr.dtype.fl; - if (input_fl > 0) - { - in_fl_scale = (1.0f / ((float) ((int64_t)1 << input_fl))); - } - else - { - in_fl_scale = ((float) ((int64_t)1 << -input_fl)); - } - input_zp = 0; - } - - if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - output_zp = outputs[0]->attr.dtype.zero_point; - output_scale = 1.0f / outputs[0]->attr.dtype.scale; - } - else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - output_fl = outputs[0]->attr.dtype.fl; - if (output_fl > 0) - { - out_fl_scale = (float)((int64_t)1 << output_fl); - } - else - { - out_fl_scale = (1.0f / (float)((int64_t)1 << -output_fl)); - } - output_zp = 0; - } - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { @@ -674,4 +634,3 @@ final: __END_DECLS REGISTER_BACKEND_CL( instance_norm, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c b/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c index 2250a8d..e516df5 100644 --- a/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c @@ -259,10 +259,10 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; int32_t axis = 0; vsi_size_t axis_size = 0; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 
1.0f : outputs[0]->attr.dtype.scale; - float outputTail = (float)outputs[0]->attr.dtype.zero_point; - float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 1.0f : inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); float epsilon = (float)10e-12; float rsEps = 1.0f / sqrtf(epsilon); vsi_bool is_use_u8_kernel = FALSE; diff --git a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c index 7824a1e..20f3ab0 100644 --- a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c @@ -233,55 +233,16 @@ static vsi_nn_kernel_node_t _setup vsi_size_t width = inputs[0]->attr.size[0]; vsi_size_t height = inputs[0]->attr.size[1]; - int32_t input_fl = 0; - float input_zp = 0.0f; - float input_scale = 1.0f; - int32_t output_fl = 0; - float output_zp = 0.0f; - float output_scale = 1.0f; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); float e2InScale = 1.0f, scale_inOut = 1.0f; float dim_ratio = (float)1.0 / (float)(width); float sumZpScale = 0.0f; float zp2ScaleE2 = 0.0f; float sumZpScaleE2 = 0.0f; - if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - input_zp = (float)inputs[0]->attr.dtype.zero_point; - input_scale = inputs[0]->attr.dtype.scale; - } - else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - input_fl = inputs[0]->attr.dtype.fl; - if (input_fl > 0) - { - input_scale = (1.0f / ((float) ((int64_t)1 << input_fl))); - } - else - { - input_scale = ((float) ((int64_t)1 << -input_fl)); - } - input_zp = 0.0f; - } - - if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - output_zp = (float)outputs[0]->attr.dtype.zero_point; - output_scale = 1.0f / outputs[0]->attr.dtype.scale; - } - else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - output_fl = outputs[0]->attr.dtype.fl; - if (output_fl > 0) - { - output_scale = (float)((int64_t)1 << output_fl); - } - else - { - output_scale = (1.0f / (float)((int64_t)1 << -output_fl)); - } - output_zp = 0.0f; - } scale_inOut = input_scale * output_scale; e2InScale = input_scale * input_scale; sumZpScale = width * input_zp * input_scale; @@ -392,4 +353,3 @@ final: __END_DECLS REGISTER_BACKEND_CL( layer_norm, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c index 81b0d1b..311de97 100644 --- a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c @@ -239,10 +239,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t axis = 0; float beta = 0; - float inputScale = - inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ? 
inputs[0]->attr.dtype.scale : 1.0f; - float outputScale = 1.0f / outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float outputScale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; float scaleValue = (vx_float32)(log10(exp(1.0f)) / log10(2.0f)); axis = vsi_nn_kernel_param_get_int32(params, "axis"); diff --git a/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c index 667cca5..a7bdb2c 100644 --- a/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c @@ -1444,7 +1444,6 @@ static vsi_status _query_kernel } return status; - } /* _query_kernel() */ @@ -1511,65 +1510,57 @@ static vsi_nn_kernel_node_t _setup if (inputs[LSTMUNIT_ACT_INPUT_FC_I] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_INPUT_FC_I]->attr.dtype.vx_type) { - scale_val[0] = inputs[LSTMUNIT_ACT_INPUT_FC_I]->attr.dtype.scale; - tail_val[0] = \ - -inputs[LSTMUNIT_ACT_INPUT_FC_I]->attr.dtype.scale * inputs[LSTMUNIT_ACT_INPUT_FC_I]->attr.dtype.zero_point; + scale_val[0] = vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_INPUT_FC_I]); + tail_val[0] = -(float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_INPUT_FC_I]) * scale_val[0]; } if (inputs[LSTMUNIT_ACT_INPUT_FC_F] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_INPUT_FC_F]->attr.dtype.vx_type) { - scale_val[1] = inputs[LSTMUNIT_ACT_INPUT_FC_F]->attr.dtype.scale; - tail_val[1] = \ - -inputs[LSTMUNIT_ACT_INPUT_FC_F]->attr.dtype.scale * inputs[LSTMUNIT_ACT_INPUT_FC_F]->attr.dtype.zero_point; + scale_val[1] = vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_INPUT_FC_F]); + tail_val[1] = -(float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_INPUT_FC_F]) * scale_val[1]; } if (inputs[LSTMUNIT_ACT_INPUT_FC_C] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_INPUT_FC_C]->attr.dtype.vx_type) { - scale_val[2] = inputs[LSTMUNIT_ACT_INPUT_FC_C]->attr.dtype.scale; - tail_val[2] = \ - -inputs[LSTMUNIT_ACT_INPUT_FC_C]->attr.dtype.scale * inputs[LSTMUNIT_ACT_INPUT_FC_C]->attr.dtype.zero_point; + scale_val[2] = vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_INPUT_FC_C]); + tail_val[2] = -(float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_INPUT_FC_C]) * scale_val[2]; } if (inputs[LSTMUNIT_ACT_INPUT_FC_O] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_INPUT_FC_O]->attr.dtype.vx_type) { - scale_val[3] = inputs[LSTMUNIT_ACT_INPUT_FC_O]->attr.dtype.scale; - tail_val[3] = \ - -inputs[LSTMUNIT_ACT_INPUT_FC_O]->attr.dtype.scale * inputs[LSTMUNIT_ACT_INPUT_FC_O]->attr.dtype.zero_point; + scale_val[3] = vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_INPUT_FC_O]); + tail_val[3] = -(float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_INPUT_FC_O]) * scale_val[3]; } if (inputs[LSTMUNIT_ACT_HSTATE_FC_I] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_HSTATE_FC_I]->attr.dtype.vx_type) { - scale_val[4] = inputs[LSTMUNIT_ACT_HSTATE_FC_I]->attr.dtype.scale; - tail_val[4] = \ - -inputs[LSTMUNIT_ACT_HSTATE_FC_I]->attr.dtype.scale * inputs[LSTMUNIT_ACT_HSTATE_FC_I]->attr.dtype.zero_point; + scale_val[4] = vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_HSTATE_FC_I]); + tail_val[4] = -(float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_HSTATE_FC_I]) * scale_val[4]; } if (inputs[LSTMUNIT_ACT_HSTATE_FC_F] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_HSTATE_FC_F]->attr.dtype.vx_type) { - scale_val[5] = 
inputs[LSTMUNIT_ACT_HSTATE_FC_F]->attr.dtype.scale; - tail_val[5] = \ - -inputs[LSTMUNIT_ACT_HSTATE_FC_F]->attr.dtype.scale * inputs[LSTMUNIT_ACT_HSTATE_FC_F]->attr.dtype.zero_point; + scale_val[5] = vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_HSTATE_FC_F]); + tail_val[5] = -(float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_HSTATE_FC_F]) * scale_val[5]; } if (inputs[LSTMUNIT_ACT_HSTATE_FC_C] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_HSTATE_FC_C]->attr.dtype.vx_type) { - scale_val[6] = inputs[LSTMUNIT_ACT_HSTATE_FC_C]->attr.dtype.scale; - tail_val[6] = \ - -inputs[LSTMUNIT_ACT_HSTATE_FC_C]->attr.dtype.scale * inputs[LSTMUNIT_ACT_HSTATE_FC_C]->attr.dtype.zero_point; + scale_val[6] = vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_HSTATE_FC_C]); + tail_val[6] = -(float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_HSTATE_FC_C]) * scale_val[6]; } if (inputs[LSTMUNIT_ACT_HSTATE_FC_O] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_HSTATE_FC_O]->attr.dtype.vx_type) { - scale_val[7] = inputs[LSTMUNIT_ACT_HSTATE_FC_O]->attr.dtype.scale; - tail_val[7] = \ - -inputs[LSTMUNIT_ACT_HSTATE_FC_O]->attr.dtype.scale * inputs[LSTMUNIT_ACT_HSTATE_FC_O]->attr.dtype.zero_point; + scale_val[7] = vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_HSTATE_FC_O]); + tail_val[7] = -(float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_HSTATE_FC_O]) * scale_val[7]; } if (outputs[LSTMUNIT_ACT_OUTPUT] && VSI_NN_TYPE_UINT8 == outputs[LSTMUNIT_ACT_OUTPUT]->attr.dtype.vx_type) { - scale_val[8] = 1.0f / outputs[LSTMUNIT_ACT_OUTPUT]->attr.dtype.scale; - tail_val[8] = (float)(outputs[LSTMUNIT_ACT_OUTPUT]->attr.dtype.zero_point); + scale_val[8] = 1.0f / vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_OUTPUT]); + tail_val[8] = (float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_OUTPUT]); } if( VSI_SUCCESS == status) @@ -1645,4 +1636,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( lstmunit_activation, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c index e28a548..35eb757 100644 --- a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c @@ -253,12 +253,12 @@ static vsi_nn_kernel_node_t _setup vsi_size_t depth = outputs[0]->attr.dim_num > 2 ? 
outputs[0]->attr.size[2] : 1; uint32_t ac2zero = 0; uint32_t bc2zero = 0; - float scale_a = 1.0f; - float zp_a = 0; - float scale_b = 1.0f; - float zp_b = 0; - float scale_out = 1.0f; - float zp_out = 0; + float scale_a = vsi_nn_get_tensor_scale(inputs[0]); + float zp_a = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float scale_b = vsi_nn_get_tensor_scale(inputs[1]); + float zp_b = (float)vsi_nn_get_tensor_zero_point(inputs[1]); + float scale_out = vsi_nn_get_tensor_scale(outputs[0]); + float zp_out = (float)vsi_nn_get_tensor_zero_point(outputs[0]); if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) @@ -272,60 +272,6 @@ static vsi_nn_kernel_node_t _setup transFlg = 2; } - if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - if (inputs[0]->attr.dtype.fl > 0) - { - scale_a = (1.0f / ((float) ((int64_t)1 << inputs[0]->attr.dtype.fl))); - } - else - { - scale_a = ((float) ((int64_t)1 << -inputs[0]->attr.dtype.fl)); - } - zp_a = 0; - } - else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - zp_a = (float)inputs[0]->attr.dtype.zero_point; - scale_a = inputs[0]->attr.dtype.scale; - } - - if (inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - if (inputs[1]->attr.dtype.fl > 0) - { - scale_b = (1.0f / ((float) ((int64_t)1 << inputs[1]->attr.dtype.fl))); - } - else - { - scale_b = ((float) ((int64_t)1 << -inputs[1]->attr.dtype.fl)); - } - zp_b = 0; - } - else if (inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - zp_b = (float)inputs[1]->attr.dtype.zero_point; - scale_b = inputs[1]->attr.dtype.scale; - } - - if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - if (outputs[0]->attr.dtype.fl > 0) - { - scale_out = (float)((int64_t)1 << outputs[0]->attr.dtype.fl); - } - else - { - scale_out = (1.0f / (float)((int64_t)1 << -outputs[0]->attr.dtype.fl)); - } - zp_out = 0; - } - else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - zp_out = (float)outputs[0]->attr.dtype.zero_point; - scale_out = outputs[0]->attr.dtype.scale; - } - if (transposeA) { K = inputs[0]->attr.size[1]; @@ -389,4 +335,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( matrixmul, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c index 98a175f..322bd22 100644 --- a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c @@ -239,12 +239,12 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - float input0Scale = inputs[0]->attr.dtype.scale; - float input0Tail = (float)inputs[0]->attr.dtype.zero_point * input0Scale; - float input1Scale = inputs[1]->attr.dtype.scale; - float input1Tail = (float)inputs[1]->attr.dtype.zero_point * input1Scale; - float outputScale = outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale; + float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; outputScale = vsi_abs(outputScale) < 1e-5 ? 
0.0f : 1.0f / outputScale; @@ -294,4 +294,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( maximum, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c index a730f0b..40b9977 100644 --- a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c @@ -238,12 +238,12 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - float input0Scale = inputs[0]->attr.dtype.scale; - float input0Tail = (float)inputs[0]->attr.dtype.zero_point * input0Scale; - float input1Scale = inputs[1]->attr.dtype.scale; - float input1Tail = (float)inputs[1]->attr.dtype.zero_point * input1Scale; - float outputScale = outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale; + float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale; @@ -293,4 +293,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( minimum, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/moments_cl.c b/src/tim/vx/internal/src/kernel/cl/moments_cl.c index 0a04c13..ed420ad 100644 --- a/src/tim/vx/internal/src/kernel/cl/moments_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/moments_cl.c @@ -372,25 +372,12 @@ static vsi_nn_kernel_node_t _setup vsi_size_t width = inputs[0]->attr.size[0]; vsi_size_t height = inputs[0]->attr.size[1]; vsi_size_t chn = inputs[0]->attr.size[2]; - int32_t input_zp = inputs[0]->attr.dtype.zero_point; - float input_scale = inputs[0]->attr.dtype.scale; + int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); float dim_ratio = (float)1.0 / (float)(width * height); axis_num = (int32_t)axis_num_temp; - if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - if (inputs[0]->attr.dtype.fl > 0) - { - input_scale = (1.0f / ((float) ((int64_t)1 << inputs[0]->attr.dtype.fl))); - } - else - { - input_scale = ((float) ((int64_t)1 << -inputs[0]->attr.dtype.fl)); - } - input_zp = 0; - } - if (axis_num == 1 && axis[0] == 0) { dim_ratio = (float)1.0 / (float)(width); @@ -453,7 +440,7 @@ static vsi_nn_kernel_node_t _setup if ( node ) { uint32_t index = 0; - int32_t constant_value = 0; + int32_t constant_value = vsi_nn_get_tensor_zero_point(inputs[0]); /* Pass parameters to node. 
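   The input_zp / input_scale used above now come from vsi_nn_get_tensor_zero_point()
   and vsi_nn_get_tensor_scale(). Judging only from the open-coded branches this patch
   removes (a sketch of the expected behaviour, not the actual helper implementation):
       AFFINE_ASYMMETRIC : scale = dtype.scale    zp = dtype.zero_point
       DFP               : scale = 2^-fl          zp = 0
       NONE / float      : scale = 1.0f           zp = 0
   The constant_value handed to set_constant_border() below is the input tensor's
   zero point, presumably so that out-of-image border reads dequantize to 0.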
*/ if (reshape_tensors[0]) { @@ -494,10 +481,6 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_tensor_release( &node_params[2] ); } - if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - constant_value = inputs[0]->attr.dtype.zero_point; - } status = set_constant_border(node, constant_value); CHECK_STATUS(status); } diff --git a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c index 3aa26fd..33b575f 100644 --- a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c @@ -239,8 +239,8 @@ static vsi_nn_kernel_node_t _setup float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" ); float off_value = vsi_nn_kernel_param_get_float32( params, "off_value" ); int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); - float inputScale = inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * inputScale; out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); diff --git a/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c b/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c index 73b264e..558a1e0 100644 --- a/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c @@ -242,10 +242,10 @@ static vsi_nn_kernel_node_t _setup int32_t pad_x = 0; int32_t pad_y = 0; vsi_bool image_2d = FALSE; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; - float outputTail = (float)outputs[0]->attr.dtype.zero_point; - float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 
1.0f : inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); float scale_value = 1.0f; float tail_value = 0.0f; vsi_bool is_use_u8_kernel = FALSE; @@ -303,7 +303,6 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL] ); } - } } return node; @@ -312,4 +311,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( poolwithargmax, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/prelu_cl.c b/src/tim/vx/internal/src/kernel/cl/prelu_cl.c index 7bbfbec..609c90e 100644 --- a/src/tim/vx/internal/src/kernel/cl/prelu_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/prelu_cl.c @@ -240,12 +240,12 @@ static vsi_nn_kernel_node_t _setup vsi_size_t new_rank = 0; vsi_bool ret; - float input0Scale = inputs[0]->attr.dtype.scale; - float input0Tail = (float)inputs[0]->attr.dtype.zero_point * input0Scale; - float input1Scale = inputs[1]->attr.dtype.scale; - float input1Tail = (float)inputs[1]->attr.dtype.zero_point * input1Scale; - float outputScale = outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale; + float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); int32_t is_per_channel_alpha = 0; is_per_channel_alpha = vsi_nn_kernel_param_get_int32(params, "is_per_channel_alpha"); @@ -257,6 +257,11 @@ static vsi_nn_kernel_node_t _setup outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale; + if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + outputZP += 0.5f; + } + ret = vsi_nn_kernel_optimize_eltwise_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, inputs[1]->attr.size, inputs[1]->attr.dim_num, @@ -329,4 +334,3 @@ final: __END_DECLS REGISTER_BACKEND_CL( prelu, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c index 3e3d4bd..05a8674 100644 --- a/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c @@ -81,7 +81,6 @@ static const _kernel_map_type _reducemax_internal_kernel_map[] = HASH_REDUCEMAX_KERNELS_2D( 1, F32, F32 ) HASH_REDUCEMAX_KERNELS_2D( 1, I32, I32 ) HASH_REDUCEMAX_KERNELS_2D( 1, U8, U8 ) - }; @@ -236,10 +235,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; int32_t axis = 0; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point; - float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 
1.0f : inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); inputScale = inputScale / outputScale; inputTail = outputZP - inputTail * inputScale; @@ -281,4 +280,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( reducemax_internal, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c index 1658fa4..50a5025 100644 --- a/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c @@ -225,10 +225,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; int32_t axis = 0; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point; - float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 1.0f : inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); inputScale = inputScale / outputScale; inputTail = outputZP - inputTail * inputScale; @@ -270,4 +270,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( reducemin_internal, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c index b1feb05..8d1b7c0 100644 --- a/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c @@ -247,10 +247,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; int32_t axis = 0; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; - float outputTail = (float)outputs[0]->attr.dtype.zero_point; - float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 
1.0f : inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); vsi_bool is_use_u8_kernel = FALSE; outputScale = 1.0f / outputScale; @@ -304,4 +304,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( reduceprod_internal, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c b/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c index d08676c..8cfd331 100644 --- a/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c @@ -225,7 +225,6 @@ static vsi_status _query_kernel } return status; - } /* _query_kernel() */ @@ -244,10 +243,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_RELU_KERAS_QUANT_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; - float outputScale = 1.0f; - float outputTail = 0.0f; - float inputScale = 1.0f; - float inputTail = 0.0f; + float outputScale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = -1 * (float)vsi_nn_get_tensor_zero_point(inputs[0]) * inputScale; vsi_bool is_use_u8_kernel = FALSE; float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" ); @@ -260,19 +259,6 @@ static vsi_nn_kernel_node_t _setup return NULL; } - if (VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC == inputs[0]->attr.dtype.qnt_type) - { - inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 1.0f : inputs[0]->attr.dtype.scale; - inputTail = -((float)inputs[0]->attr.dtype.zero_point * inputScale); - } - - if (VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC == outputs[0]->attr.dtype.qnt_type) - { - outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; - outputScale = 1.0f / outputScale; - outputTail = (float)outputs[0]->attr.dtype.zero_point; - } - image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); status = _query_kernel( kernel, inputs, outputs, image_2d, &is_use_u8_kernel ); @@ -316,10 +302,8 @@ static vsi_nn_kernel_node_t _setup } return node; - } /* _setup() */ __END_DECLS REGISTER_BACKEND_CL( relu_keras, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c index 3a189f4..fda7acd 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c @@ -235,11 +235,11 @@ static vsi_nn_kernel_node_t _setup int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); vsi_size_t in_width = inputs[0]->attr.size[0]; vsi_size_t out_width = outputs[0]->attr.size[0]; - float input_zp = (float)inputs[0]->attr.dtype.zero_point; - float input_scale = inputs[0]->attr.dtype.scale; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); float input_tail = -(input_zp * input_scale); - float output_zp = (float)outputs[0]->attr.dtype.zero_point; - float output_scale = (0 == outputs[0]->attr.dtype.scale) ? 
1.0f : 1.0f / outputs[0]->attr.dtype.scale; + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); float half_pixel_value = 0.0f; float scale_factor_x = 0.0f; vsi_bool is_use_u8_kernel = FALSE; @@ -302,4 +302,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( resize_1d_bilinear, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c index e406397..eef5bec 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c @@ -235,11 +235,10 @@ static vsi_nn_kernel_node_t _setup int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); vsi_size_t in_width = inputs[0]->attr.size[0]; vsi_size_t out_width = outputs[0]->attr.size[0]; - float input_zp = (float)inputs[0]->attr.dtype.zero_point; - float input_scale = inputs[0]->attr.dtype.scale; - float output_scale = (0 == outputs[0]->attr.dtype.scale) ? \ - input_scale : input_scale / outputs[0]->attr.dtype.scale; - float output_tail = (float)outputs[0]->attr.dtype.zero_point - input_zp * output_scale; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float output_scale = input_scale / vsi_nn_get_tensor_scale(outputs[0]); + float output_tail = (float)vsi_nn_get_tensor_zero_point(outputs[0]) - input_zp * output_scale; float half_pixel_value = 0.0f; float round_value = 0.0f; float scale_factor_x = 0.0f; @@ -309,4 +308,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( resize_1d_nearest, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c index 320a6d9..a9c0285 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c @@ -213,7 +213,6 @@ static vsi_status _query_kernel } return status; - } /* _query_kernel() */ @@ -237,11 +236,11 @@ static vsi_nn_kernel_node_t _setup vsi_size_t in_height = inputs[0]->attr.size[1]; vsi_size_t out_width = outputs[0]->attr.size[0]; vsi_size_t out_height = outputs[0]->attr.size[1]; - float input_zp = (float)inputs[0]->attr.dtype.zero_point; - float input_scale = inputs[0]->attr.dtype.scale; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); float input_tail = -(input_zp * input_scale); - float output_zp = (float)outputs[0]->attr.dtype.zero_point; - float output_scale = (0 == outputs[0]->attr.dtype.scale) ? 
1.0f : 1.0f / outputs[0]->attr.dtype.scale; + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); float half_pixel_value = 0.0f; float scale_factor_x = 0.0f; float scale_factor_y = 0.0f; @@ -313,10 +312,8 @@ static vsi_nn_kernel_node_t _setup } return node; - } /* _setup() */ __END_DECLS REGISTER_BACKEND_CL( resize_bilinear, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c index 588b527..d61abcf 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c @@ -241,11 +241,10 @@ static vsi_nn_kernel_node_t _setup vsi_size_t in_height = inputs[0]->attr.size[1]; vsi_size_t out_width = outputs[0]->attr.size[0]; vsi_size_t out_height = outputs[0]->attr.size[1]; - float input_zp = (float)inputs[0]->attr.dtype.zero_point; - float input_scale = inputs[0]->attr.dtype.scale; - float output_scale = (0 == outputs[0]->attr.dtype.scale) ? \ - input_scale : input_scale / outputs[0]->attr.dtype.scale; - float output_tail = (float)outputs[0]->attr.dtype.zero_point - input_zp * output_scale; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float output_scale = input_scale / vsi_nn_get_tensor_scale(outputs[0]); + float output_tail = (float)vsi_nn_get_tensor_zero_point(outputs[0]) - input_zp * output_scale; float half_pixel_value = 0.0f; float round_value = 0.0f; float scale_factor_x = 0.0f; @@ -327,4 +326,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( resize_nearest, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/select_cl.c b/src/tim/vx/internal/src/kernel/cl/select_cl.c index 9c00e23..53b1fcd 100644 --- a/src/tim/vx/internal/src/kernel/cl/select_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/select_cl.c @@ -240,12 +240,12 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_SELECT_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point; - float input0Scale = inputs[1]->attr.dtype.scale == 0.0f ? 1.0f : inputs[1]->attr.dtype.scale; - float input0Tail = (float)inputs[1]->attr.dtype.zero_point; - float input1Scale = inputs[2]->attr.dtype.scale == 0.0f ? 
1.0f : inputs[2]->attr.dtype.scale; - float input1Tail = (float)inputs[2]->attr.dtype.zero_point; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float input0Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]); + float input1Scale = vsi_nn_get_tensor_scale(inputs[2]); + float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[2]); input0Scale = input0Scale / outputScale; input1Scale = input1Scale / outputScale; @@ -289,4 +289,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( select, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c b/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c index 58fed76..d65200d 100644 --- a/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c @@ -246,14 +246,12 @@ static vsi_nn_kernel_node_t _setup int32_t max_len = vsi_nn_kernel_param_get_int32( params, "max_len" ); vsi_nn_kernel_node_t node = NULL; int32_t is2Dflg = 0; - float input_zp = 0; - float input_scale = 1.0f; - int32_t output_zp = 0; - float output_scale = 1.0f; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + int32_t output_zp = vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); float input_zpScale = 0; float outputVal1 = 1.0f; - int32_t input_fl = 0; - int32_t output_fl = 0; if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) @@ -269,43 +267,6 @@ static vsi_nn_kernel_node_t _setup rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], 2); rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], 4); - if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - input_zp = (float)inputs[0]->attr.dtype.zero_point; - input_scale = inputs[0]->attr.dtype.scale; - } - else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - input_fl = inputs[0]->attr.dtype.fl; - if (input_fl > 0) - { - input_scale = (1.0f / ((float) ((int64_t)1 << input_fl))); - } - else - { - input_scale = ((float) ((int64_t)1 << -input_fl)); - } - input_zp = 0.0f; - } - - if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - output_zp = outputs[0]->attr.dtype.zero_point; - output_scale = 1.0f / outputs[0]->attr.dtype.scale; - } - else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - output_fl = outputs[0]->attr.dtype.fl; - if (output_fl > 0) - { - output_scale = (float)((int64_t)1 << output_fl); - } - else - { - output_scale = (1.0f / (float)((int64_t)1 << -output_fl)); - } - output_zp = 0; - } input_zpScale = input_scale * input_zp; outputVal1 = output_scale + (float)output_zp; @@ -351,4 +312,3 @@ final: __END_DECLS REGISTER_BACKEND_CL( sequence_mask, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/slice_cl.c b/src/tim/vx/internal/src/kernel/cl/slice_cl.c index ed83e5f..4900bb1 100644 --- a/src/tim/vx/internal/src/kernel/cl/slice_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/slice_cl.c @@ -240,10 +240,10 @@ static vsi_nn_kernel_node_t _setup int32_t i = 0; vsi_size_t input_batch = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; vsi_size_t output_batch = outputs[0]->attr.dim_num > 3 ? 
outputs[0]->attr.size[3] : 1; - float inputScale = inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; - float outputScale = outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * inputScale; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale; diff --git a/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c index 65b09ee..7c7a59a 100644 --- a/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c @@ -208,50 +208,13 @@ static vsi_nn_kernel_node_t _setup int32_t block_size_y = vsi_nn_kernel_param_get_int32( params, "block_size_y" ); int32_t opt_flg = (block_size_x == 2 && block_size_y == 1) ? 1 : 0; - float inputScale = inputs[0]->attr.dtype.scale; - int32_t inputZp = inputs[0]->attr.dtype.zero_point; - float outputScale = outputs[0]->attr.dtype.scale; - int32_t outputZp = outputs[0]->attr.dtype.zero_point; + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + int32_t inputZp = vsi_nn_get_tensor_zero_point(inputs[0]); + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + int32_t outputZp = vsi_nn_get_tensor_zero_point(outputs[0]); float scaleInOut = 1.0f; float zpInOut = 0.0f; - if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - int32_t input_fl = inputs[0]->attr.dtype.fl; - if (input_fl > 0) - { - inputScale = (1.0f / ((float) ((int64_t)1 << input_fl))); - } - else - { - inputScale = ((float) ((int64_t)1 << -input_fl)); - } - inputZp = 0; - } - else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE) - { - inputScale = 1.0f; - inputZp = 0; - } - - if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - int32_t output_fl = outputs[0]->attr.dtype.fl; - if (output_fl > 0) - { - outputScale = (1.0f / ((float) ((int64_t)1 << output_fl))); - } - else - { - outputScale = ((float) ((int64_t)1 << -output_fl)); - } - outputZp = 0; - } - else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE) - { - outputScale = 1.0f; - outputZp = 0; - } scaleInOut = inputScale / outputScale; zpInOut = outputZp - inputZp * scaleInOut; @@ -295,4 +258,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( space2depth_internal, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/swish_cl.c b/src/tim/vx/internal/src/kernel/cl/swish_cl.c index 4a4283e..4c3f206 100644 --- a/src/tim/vx/internal/src/kernel/cl/swish_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/swish_cl.c @@ -279,10 +279,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t swish_type = vsi_nn_kernel_param_get_int32( params, "type" ); float beta = 1.0f; - float inputScale = inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 
0.0f : 1.0f / outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * inputScale; + float outputScale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); vx_float32 logE = (vx_float32)(log10(exp(1.0f)) / log10(2.0f)); vsi_bool ret = FALSE; @@ -353,7 +353,6 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_BETA] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_LOGE] ); } - } } @@ -372,4 +371,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( swish, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/upsample_cl.c b/src/tim/vx/internal/src/kernel/cl/upsample_cl.c index f2e990c..6f46988 100644 --- a/src/tim/vx/internal/src/kernel/cl/upsample_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/upsample_cl.c @@ -232,7 +232,6 @@ static vsi_status _query_kernel } return status; - } /* _query_kernel() */ @@ -254,11 +253,11 @@ static vsi_nn_kernel_node_t _setup int32_t scale_y = 0; vsi_bool image_2d = FALSE; vsi_bool is_use_u8_kernel = FALSE; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; - float outputTail = (float)outputs[0]->attr.dtype.zero_point; - float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 1.0f : inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point; - int32_t outputZp = outputs[0]->attr.dtype.zero_point; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + int32_t outputZp = vsi_nn_get_tensor_zero_point(outputs[0]); float scale_value = 1.0f; float tail_value = 0.0f; @@ -314,10 +313,8 @@ static vsi_nn_kernel_node_t _setup } return node; - } /* _setup() */ __END_DECLS REGISTER_BACKEND_CL( upsample, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cpu/cpu_backend_conv2d_cpu.c b/src/tim/vx/internal/src/kernel/cpu/cpu_backend_conv2d_cpu.c new file mode 100644 index 0000000..d273df6 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/cpu_backend_conv2d_cpu.c @@ -0,0 +1,259 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "cpu_backend/npuref_interface.h" + +__BEGIN_DECLS + +typedef enum +{ + PARAM_INPUT = 0, + PARAM_KERNEL, + PARAM_BIAS, + PARAM_OUTPUT, + PARAM_STRIDE_0, + PARAM_STRIDE_1, + PARAM_PAD_0, + PARAM_PAD_1, + PARAM_PAD_2, + PARAM_PAD_3, + PARAM_DILATION_0, + PARAM_DILATION_1, + PARAM_MULTIPLIER, + PARAM_NUM +} param_index_e; +/* + * Define kernel meta. + */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.cpu_backend_conv2d") +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +/* + * Kernel params + */ +static vx_param_description_t _cpu_backend_conv2d_kernel_param_def[] = +{ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, +}; +#define _CPU_BACKEND_CONV2D_PARAM_NUM _cnt_of_array( _cpu_backend_conv2d_kernel_param_def ) + + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[_IO_NUM] = { NULL }; + int32_t strides[2]; + int32_t pad[4]; + int32_t dilation[2]; + void * buffer[_IO_NUM] = { NULL }; + int32_t i = 0; + vsi_nn_kernel_tensor_t tensors[_IO_NUM] = { NULL }; + size_t out_elements = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[PARAM_INPUT]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[PARAM_KERNEL]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[PARAM_BIAS]; + tensors[3] = (vsi_nn_kernel_tensor_t)param[PARAM_OUTPUT]; + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + if ( param[PARAM_BIAS] ) + { + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + } + attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); + + status = vsi_nn_kernel_scalar_read_int32( 
param[PARAM_STRIDE_0], &strides[0] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_STRIDE_0], &strides[1] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_0], &pad[0] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_1], &pad[1] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_2], &pad[2] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_2], &pad[3] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_DILATION_0], &dilation[0] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_DILATION_1], &dilation[1] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create kernel buffer fail.", final ); + if ( param[PARAM_BIAS] ) + { + buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create bias buffer fail.", final ); + } + buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); + + npuref_interface_quant_conv2d(buffer[0], attr[0], + buffer[1], attr[1], buffer[2], + pad, strides, dilation, attr[3], buffer[3]); + + status = vsi_nn_kernel_tensor_write( tensors[3], attr[3], + buffer[3], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for ( i = 0; i < _IO_NUM; i ++ ) + { + if ( attr[i] ) + { + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + if ( buffer[i] ) + { + free( buffer[i] ); + } + } + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _cpu_backend_conv2d_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _cpu_backend_conv2d_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CPU_BACKEND_CONV2D_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + size_t size = 0; + int32_t* stride = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "stride", &size); + int32_t* pad = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "pad", &size); + int32_t* dilation = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "dilation", &size); + int32_t multiplier = vsi_nn_kernel_param_get_int32(params, "multiplier"); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = 
vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CPU_BACKEND_CONV2D_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &stride[0] ); + node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &stride[1] ); + node_params[6] = vsi_nn_kernel_scalar_create( graph, I32, &pad[0] ); + node_params[7] = vsi_nn_kernel_scalar_create( graph, I32, &pad[1] ); + node_params[8] = vsi_nn_kernel_scalar_create( graph, I32, &pad[2] ); + node_params[9] = vsi_nn_kernel_scalar_create( graph, I32, &pad[3] ); + node_params[10] = vsi_nn_kernel_scalar_create( graph, I32, &dilation[0] ); + node_params[11] = vsi_nn_kernel_scalar_create( graph, I32, &dilation[1] ); + node_params[12] = vsi_nn_kernel_scalar_create( graph, I32, &multiplier ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CPU_BACKEND_CONV2D_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( cpu_backend_conv2d, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/cpu_backend_deconv2d_cpu.c b/src/tim/vx/internal/src/kernel/cpu/cpu_backend_deconv2d_cpu.c new file mode 100644 index 0000000..b1502a5 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/cpu_backend_deconv2d_cpu.c @@ -0,0 +1,245 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "cpu_backend/npuref_interface.h" + +__BEGIN_DECLS + +typedef enum +{ + PARAM_INPUT = 0, + PARAM_KERNEL, + PARAM_BIAS, + PARAM_OUTPUT, + PARAM_STRIDE_0, + PARAM_STRIDE_1, + PARAM_PAD_0, + PARAM_PAD_1, + PARAM_PAD_2, + PARAM_PAD_3, + + PARAM_NUM +} param_index_e; +/* + * Define kernel meta. + */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.cpu_backend_deconv2d") + + +/* + * Kernel params + */ +static vx_param_description_t _cpu_backend_deconv2d_kernel_param_def[] = +{ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, +}; +#define _CPU_BACKEND_DECONV2D_PARAM_NUM _cnt_of_array( _cpu_backend_deconv2d_kernel_param_def ) + + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[_IO_NUM] = { NULL }; + int32_t strides[2]; + int dilation[2] = {1, 1}; + int32_t pad[4]; + void * buffer[_IO_NUM] = { NULL }; + int32_t i = 0; + vsi_nn_kernel_tensor_t tensors[_IO_NUM] = { NULL }; + size_t out_elements = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[PARAM_INPUT]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[PARAM_KERNEL]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[PARAM_BIAS]; + tensors[3] = (vsi_nn_kernel_tensor_t)param[PARAM_OUTPUT]; + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + if ( param[PARAM_BIAS] ) + { + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + } + attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); + + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_STRIDE_0], &strides[0] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_STRIDE_0], &strides[1] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_0], &pad[0] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_1], &pad[1] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( 
param[PARAM_PAD_2], &pad[2] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_2], &pad[3] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create kernel buffer fail.", final ); + if ( param[PARAM_BIAS] ) + { + buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create bias buffer fail.", final ); + } + buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); + + npuref_interface_quant_deconv2d(buffer[0], attr[0], + buffer[1], attr[1], buffer[2], + pad, strides, dilation, attr[3], buffer[3]); + + status = vsi_nn_kernel_tensor_write( tensors[3], attr[3], + buffer[3], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for ( i = 0; i < _IO_NUM; i ++ ) + { + if ( attr[i] ) + { + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + if ( buffer[i] ) + { + free( buffer[i] ); + } + } + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _cpu_backend_deconv2d_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _cpu_backend_deconv2d_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CPU_BACKEND_DECONV2D_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + size_t size = 0; + int32_t* stride = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "stride", &size); + int32_t* pad = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "pad", &size); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CPU_BACKEND_DECONV2D_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &stride[0] ); + node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &stride[1] ); + node_params[6] = vsi_nn_kernel_scalar_create( graph, I32, &pad[0] ); + node_params[7] = vsi_nn_kernel_scalar_create( graph, I32, &pad[1] ); + node_params[8] = vsi_nn_kernel_scalar_create( graph, I32, &pad[2] ); + node_params[9] = vsi_nn_kernel_scalar_create( graph, I32, &pad[3] ); + + /* Pass parameters to node. 
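   node_params[0..3] carry the input/kernel/bias/output tensors packed by
   vsi_nn_kernel_node_pack_io() above; node_params[4..5] are the stride scalars and
   node_params[6..9] the four pad scalars. The temporary scalar handles are released
   right after vsi_nn_kernel_node_pass_param() returns.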
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CPU_BACKEND_DECONV2D_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( cpu_backend_deconv2d, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/depthwise_conv1d_cpu.c b/src/tim/vx/internal/src/kernel/cpu/depthwise_conv1d_cpu.c new file mode 100644 index 0000000..46de624 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/depthwise_conv1d_cpu.c @@ -0,0 +1,275 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" +#include "cpu_backend/npuref_interface.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
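+ * param_index_e below fixes the node parameter order: the input, kernel, bias
+ * and output tensors first, then the stride, pad_front, pad_end, dilation and
+ * multiplier scalars.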
+ */ +typedef enum +{ + PARAM_INPUT = 0, + PARAM_KERNEL, + PARAM_BIAS, + PARAM_OUTPUT, + PARAM_STRIDE, + PARAM_PAD_FRONT, + PARAM_PAD_END, + PARAM_DILATION, + PARAM_MULTIPLIER, + PARAM_NUM +} param_index_e; + +#define _INPUT_NUM (PARAM_NUM) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.depthwise_conv1d") +#define _IO_NUM (4) + +/* + * Kernel params + */ +static vx_param_description_t _depthwise_conv1d_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _DEPTHWISE_CONV1D_PARAM_NUM _cnt_of_array( _depthwise_conv1d_kernel_param_def ) + + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[_IO_NUM] = { NULL }; + int32_t stride; + int32_t pad_front; + int32_t pad_end; + int32_t dilation; + int32_t multiplier; + void * buffer[_IO_NUM] = { NULL }; + int32_t i = 0; + vsi_nn_kernel_tensor_t tensors[_IO_NUM] = { NULL }; + size_t out_elements = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[PARAM_INPUT]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[PARAM_KERNEL]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[PARAM_BIAS]; + tensors[3] = (vsi_nn_kernel_tensor_t)param[PARAM_OUTPUT]; + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + if( param[PARAM_BIAS] ) + { + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + } + attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); + + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_STRIDE], &stride ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_FRONT], &pad_front ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_END], &pad_end ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_DILATION], &dilation ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_MULTIPLIER], &multiplier ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create kernel buffer fail.", final ); + if( param[PARAM_BIAS] ) + { + buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create 
bias buffer fail.", final ); + } + buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); + + + { + // Use conv2d compute + int32_t input_shape_4d[4] = {1,0,0,0}; + int32_t kernel_shape_4d[4] = {1,0,0,0}; + int32_t output_shape_4d[4] = {1,0,0,0}; + memcpy( &input_shape_4d[1], attr[0]->shape->data, 3 * sizeof(int32_t) ); + memcpy( &kernel_shape_4d[1], attr[1]->shape->data, 3 * sizeof(int32_t) ); + memcpy( &output_shape_4d[1], attr[3]->shape->data, 3 * sizeof(int32_t) ); + npuref_interface_quant_depthwise_conv2d( + buffer[0], buffer[1], buffer[2], + input_shape_4d, 4, + kernel_shape_4d, 4, + output_shape_4d, 4, + attr[0]->asymm.scale, attr[0]->asymm.zero_point, + attr[1]->asymm.scale, attr[1]->asymm.zero_point, + attr[3]->asymm.scale, attr[3]->asymm.zero_point, + pad_front, pad_end, 0, 0, + stride, 1, dilation, 1, + buffer[3] + ); + status = vsi_nn_kernel_tensor_write( tensors[3], attr[3], + buffer[3], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for( i = 0; i < _IO_NUM; i ++ ) + { + if( attr[i] ) + { + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + if( buffer[i] ) + { + free( buffer[i] ); + } + } + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _depthwise_conv1d_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _depthwise_conv1d_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_DEPTHWISE_CONV1D_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" ); + int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" ); + int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" ); + int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" ); + int32_t multiplier = vsi_nn_kernel_param_get_int32( params, "multiplier" ); + + if(!( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 + && inputs[1]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 + && outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8)) + { + //TODO: Support other types + return NULL; + } + + if( !npuref_exists() ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _DEPTHWISE_CONV1D_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[PARAM_STRIDE] = vsi_nn_kernel_scalar_create( graph, I32, &stride ); + node_params[PARAM_PAD_FRONT] = vsi_nn_kernel_scalar_create( graph, I32, &pad_front ); + node_params[PARAM_PAD_END] = vsi_nn_kernel_scalar_create( graph, I32, &pad_end ); + node_params[PARAM_DILATION] = vsi_nn_kernel_scalar_create( graph, I32, &dilation ); + node_params[PARAM_MULTIPLIER] = 
vsi_nn_kernel_scalar_create( graph, I32, &multiplier ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, + _DEPTHWISE_CONV1D_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[PARAM_STRIDE] ); + vsi_nn_kernel_scalar_release( &node_params[PARAM_PAD_FRONT] ); + vsi_nn_kernel_scalar_release( &node_params[PARAM_PAD_END] ); + vsi_nn_kernel_scalar_release( &node_params[PARAM_DILATION] ); + vsi_nn_kernel_scalar_release( &node_params[PARAM_MULTIPLIER] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( depthwise_conv1d, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c index a00cfcb..3aa63e2 100644 --- a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c @@ -52,7 +52,7 @@ typedef enum } unary_type_e; -#define _CPU_ARG_NUM (2) +#define _CPU_ARG_NUM (3) #define _CPU_INPUT_NUM (1) #define _CPU_OUTPUT_NUM (1) #define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) @@ -84,9 +84,9 @@ static float neg_eval(float data) return data * -1.0f; } -static float hsigmoid_eval(float data) +static float hsigmoid_eval(float data, float alpha, float beta) { - data = (float)(0.2 * data + 0.5); + data = (float)(alpha * data + beta); data = vsi_nn_clamp(data, 0, 1); return data; @@ -177,6 +177,7 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; int32_t i; float alpha = 0; + float beta = 0; int32_t unary_type = 0; tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; @@ -191,6 +192,8 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) CHECK_STATUS_FAIL_GOTO(status, final ); status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[3], &alpha); CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &beta); + CHECK_STATUS_FAIL_GOTO(status, final ); buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); @@ -222,7 +225,7 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) data = neg_eval(data); break; case UNARY_HSIGMOID: - data = hsigmoid_eval(data); + data = hsigmoid_eval(data, alpha, beta); break; case UNARY_MISH: data = mish_eval(data); @@ -268,10 +271,12 @@ static vx_param_description_t kernel_param_def[] = {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define INPUT_FUNC_TYPE (2) #define INPUT_SCALAR_ALPHA (3) +#define INPUT_SCALAR_BETA (4) static const vx_kernel_description_t _kernel_info = { @@ -314,6 +319,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); + float beta = vsi_nn_kernel_param_get_float32( params, "beta" ); status = _query_kernel( inputs, outputs, kernel ); if( VSI_SUCCESS == status) @@ -328,11 +334,14 @@ static vsi_nn_kernel_node_t _setup graph, I32, &unary_type ); backend_params[INPUT_SCALAR_ALPHA] = vsi_nn_kernel_scalar_create( graph, F32, &alpha ); + backend_params[INPUT_SCALAR_BETA] = vsi_nn_kernel_scalar_create( + graph, F32, &beta ); /* Pass parameters to node. 
*/ status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); vsi_nn_kernel_scalar_release( &backend_params[INPUT_FUNC_TYPE] ); vsi_nn_kernel_scalar_release( &backend_params[INPUT_SCALAR_ALPHA] ); + vsi_nn_kernel_scalar_release( &backend_params[INPUT_SCALAR_BETA] ); } else { diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c b/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c index 13d10e7..33e8b33 100644 --- a/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c @@ -100,9 +100,9 @@ DEF_KERNEL_EXECUTOR(_gather_nd_exec) } indices_num /= coord_stride; - if(coord_stride <= 3) // reshape 3D + if(coord_stride <= 4) // reshape 3D { - vsi_ssize_t stride[3] = {block_size, 0, 0}; + vsi_ssize_t stride[4] = {block_size, 0, 0, 0}; for(i = 1; i < coord_stride; ++i) { stride[i] = stride[i - 1] * attr[0]->shape->data[i]; @@ -111,7 +111,7 @@ DEF_KERNEL_EXECUTOR(_gather_nd_exec) for(i = 0; i < indices_num; i++) { vsi_size_t out_index = i * block_size; - uint32_t coord[3] = {0}; + uint32_t coord[4] = {0}; vsi_size_t in_index = 0; int32_t j = 0; @@ -119,7 +119,7 @@ DEF_KERNEL_EXECUTOR(_gather_nd_exec) { coord[j] = buffer_idx[i * coord_stride + j]; } - in_index = coord[2] * stride[2] + coord[1] * stride[1] + coord[0] * stride[0]; + in_index = coord[3] * stride[3] + coord[2] * stride[2] + coord[1] * stride[1] + coord[0] * stride[0]; memcpy(&(buffer[1][out_index]), &(buffer[0][in_index]), block_size * sizeof(float)); } } diff --git a/src/tim/vx/internal/src/kernel/cpu/generate_proposals_cpu.c b/src/tim/vx/internal/src/kernel/cpu/generate_proposals_cpu.c new file mode 100644 index 0000000..f764c18 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/generate_proposals_cpu.c @@ -0,0 +1,507 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (4) +#define _OUTPUT_NUM (3) + #define _TENSOR_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.generate_proposals") + + +typedef struct vsi_nn_box_encoding_corner_t +{ + float x1, y1, x2, y2; +}vsi_nn_box_encoding_corner; + +typedef struct vsi_nn_box_encoding_center_t +{ + float w, h, x, y; +}vsi_nn_box_encoding_center; +/* + * Kernel params + */ +static vx_param_description_t _generate_proposals_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _GENERATE_PROPOSALS_PARAM_NUM _cnt_of_array( _generate_proposals_kernel_param_def ) + + +static void _to_box_encoding_corner + ( + vsi_nn_box_encoding_center* ctr, + vsi_nn_box_encoding_corner* cnr + ) +{ + cnr->x1 = ctr->x - ctr->w / 2; + cnr->y1 = ctr->y - ctr->h / 2; + cnr->x2 = ctr->x + ctr->w / 2; + cnr->y2 = ctr->y + ctr->h / 2; +} + +static void _to_box_encoding_center + ( + vsi_nn_box_encoding_corner* cnr, + vsi_nn_box_encoding_center* ctr + ) +{ + ctr->w = cnr->x2 - cnr->x1; + ctr->h = cnr->y2 - cnr->y1; + ctr->x = (cnr->x1 + cnr->x2) / 2; + ctr->y = (cnr->y1 + cnr->y2) / 2; +} + +static void _iota + ( + int32_t * data, + uint32_t len, + int32_t value + ) +{ + uint32_t i; + for (i = 0; i < len; i++) + { + data [i] = value; + value++; + } +} + +// swap_element is implemented in vsi_nn_kernel_box_with_nms_limit.c +void swap_element + ( + uint32_t* list, + uint32_t first, + uint32_t second + ); + +// max_element is implemented in vsi_nn_kernel_box_with_nms_limit.c +uint32_t max_element + ( + float* data, + uint32_t* index_list, + uint32_t len + ); + +// getIoUAxisAligned is implemented in vsi_nn_kernel_box_with_nms_limit.c +float getIoUAxisAligned + ( + const float* roi1, + const float* roi2 + ); + +// sort_element_by_score is implemented in vsi_nn_kernel_box_with_nms_limit.c +void sort_element_by_score + ( + float* data, + uint32_t* index_list, + uint32_t len + ); + +void _filter_boxes + ( + const float* roiBase, + const float* imageInfoBase, + float minSize, + uint32_t* select, + uint32_t* len + ) +{ + const uint32_t kRoiDim = 4; + uint32_t i = 0; + uint32_t j = 0; + + for (j = 0; j < *len; j++) + { + const float* roiInfo = roiBase + select[j] * kRoiDim; + float roiWidth, roiHeight, xRoiCenter, yRoiCenter; + roiWidth = roiInfo[2] - roiInfo[0]; + roiHeight = roiInfo[3] - roiInfo[1]; + xRoiCenter = roiInfo[0] + roiWidth / 2.0f; + yRoiCenter = roiInfo[1] + roiHeight / 2.0f; + if (roiWidth > minSize && roiHeight > minSize && xRoiCenter < imageInfoBase[1] + && yRoiCenter < imageInfoBase[0]) + { + select[i] = select[j]; + i++; + } + } + *len = i; +} + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + 
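    /* Overall flow of this reference kernel, as implemented below (summary only,
     * not part of the original source):
     *   1. expand the per-cell anchors into absolute ROIs over the score map
     *      (height x width grid, stepped by heightStride/widthStride),
     *   2. apply the bbox deltas in center form and clip the result to the
     *      image size taken from the imageInfo input,
     *   3. keep the preNmsTopN highest scores, then drop boxes whose width,
     *      height or center fail the minSize / image-bounds test,
     *   4. run hard NMS with iouThreshold and emit at most postNmsTopN
     *      (score, roi, batch index) triples per batch.
     * The corner/center helpers defined above are exact inverses; e.g. the
     * corner box (1, 1, 5, 3) maps to w = 4, h = 2, x = 3, y = 2. The NMS
     * helpers (swap_element, max_element, getIoUAxisAligned,
     * sort_element_by_score) are only declared here; their definitions live
     * in the box_with_nms_limit kernel source, as noted above. */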
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + float heightStride; + float widthStride; + int32_t preNmsTopN; + int32_t postNmsTopN; + float iouThreshold; + float minSize; + + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + status = vsi_nn_kernel_scalar_read_float32( param[_TENSOR_NUM], &heightStride ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32( param[_TENSOR_NUM + 1], &widthStride ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[_TENSOR_NUM + 2], &preNmsTopN ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[_TENSOR_NUM + 3], &postNmsTopN ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32( param[_TENSOR_NUM + 4], &iouThreshold ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32( param[_TENSOR_NUM + 5], &minSize ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + { + uint32_t h, w, a, b, j; + const uint32_t kRoiDim = 4; + vsi_size_t numBatches = in_attr[0]->shape->data[3]; + vsi_size_t height = in_attr[0]->shape->data[2]; + vsi_size_t width = in_attr[0]->shape->data[1]; + vsi_size_t numAnchors = in_attr[0]->shape->data[0]; + vsi_size_t imageInfoLength = in_attr[3]->shape->data[0]; + + vsi_size_t batchSize = height * width * numAnchors; + vsi_size_t roiBufferSize = batchSize * kRoiDim; + + float * roiBuffer = (float*)malloc(roiBufferSize * sizeof(float)); + float * roiTransformedBuffer = (float*)malloc(roiBufferSize * sizeof(float)); + uint32_t* select = (uint32_t*)malloc(batchSize * sizeof(uint32_t)); + uint32_t index = 0; + vsi_size_t scores_index = 0; + vsi_size_t bboxDeltas_index = 0; + vsi_size_t imageInfo_index = 0; + uint32_t scores_out_index = 0; + uint32_t roi_out_index = 0; + + // Compute the roi region for each anchor. 
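        /* Each anchor is stored relative to a cell origin, so the absolute ROI
         * for grid cell (w, h) is the anchor shifted by
         * (w * widthStride, h * heightStride) on both corners. For example,
         * with widthStride = heightStride = 16 and anchor (-8, -8, 8, 8)
         * (assumed example values; the real strides come from the scalar
         * params), cell (w, h) = (2, 1) yields the ROI (24, 8, 40, 24). */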
+ for(h = 0; h < height; h++) + { + float hShift = h * heightStride; + for(w = 0; w < width; w++) + { + float wShift = w * widthStride; + uint32_t anchor_index = 0; + for(a = 0; a < numAnchors; a++) + { + roiBuffer[index] = f32_in_buffer[2][anchor_index] + wShift; + roiBuffer[index + 1] = f32_in_buffer[2][anchor_index + 1] + hShift; + roiBuffer[index + 2] = f32_in_buffer[2][anchor_index + 2] + wShift; + roiBuffer[index + 3] = f32_in_buffer[2][anchor_index + 3] + hShift; + + index += kRoiDim; + anchor_index += kRoiDim; + } + } + } + + for (b = 0; b < numBatches; b++) + { + const uint32_t roiLength = 4; + + vsi_size_t numRois = batchSize; + vsi_size_t roiIndex; + uint32_t select_len; + int32_t numDetections = 0; + for (roiIndex = 0; roiIndex < numRois; roiIndex++) + { + float imageHeight = f32_in_buffer[3][imageInfo_index]; + float imageWidth = f32_in_buffer[3][imageInfo_index + 1]; + vsi_nn_box_encoding_corner roi_cnr; + vsi_nn_box_encoding_center roiBefore; + roi_cnr.x1 = roiBuffer[roiIndex * roiLength]; + roi_cnr.y1 = roiBuffer[roiIndex * roiLength + 1]; + roi_cnr.x2 = roiBuffer[roiIndex * roiLength + 2]; + roi_cnr.y2 = roiBuffer[roiIndex * roiLength + 3]; + _to_box_encoding_center(&roi_cnr, &roiBefore); + { + vsi_nn_box_encoding_center roi_ctr; + vsi_nn_box_encoding_corner roiAfter; + vsi_nn_box_encoding_corner cliped; + vsi_size_t idx = bboxDeltas_index + roiIndex * roiLength; + roi_ctr.w = (float)(exp(f32_in_buffer[1][idx + 2]) * roiBefore.w); + roi_ctr.h = (float)(exp(f32_in_buffer[1][idx + 3]) * roiBefore.h); + roi_ctr.x = roiBefore.x + f32_in_buffer[1][idx] * roiBefore.w; + roi_ctr.y = roiBefore.y + f32_in_buffer[1][idx + 1] * roiBefore.h; + _to_box_encoding_corner(&roi_ctr, &roiAfter); + cliped.x1 = vsi_nn_min(vsi_nn_max(roiAfter.x1, 0.0f), imageWidth); + cliped.y1 = vsi_nn_min(vsi_nn_max(roiAfter.y1, 0.0f), imageHeight); + cliped.x2 = vsi_nn_min(vsi_nn_max(roiAfter.x2, 0.0f), imageWidth); + cliped.y2 = vsi_nn_min(vsi_nn_max(roiAfter.y2, 0.0f), imageHeight); + roiTransformedBuffer[idx] = cliped.x1; + roiTransformedBuffer[idx + 1] = cliped.y1; + roiTransformedBuffer[idx + 2] = cliped.x2; + roiTransformedBuffer[idx + 3] = cliped.y2; + } + } + + // Find the top preNmsTopN scores. + _iota((int32_t*)select, (uint32_t)batchSize, 0); + select_len = (uint32_t)batchSize; + if(preNmsTopN > 0 && preNmsTopN < (int32_t)batchSize) + { + sort_element_by_score(&(f32_in_buffer[0][scores_index]), + select, (uint32_t)batchSize); + select_len = preNmsTopN; + } + + // Filter boxes, disgard regions with height or width < minSize. + _filter_boxes(roiTransformedBuffer, &(f32_in_buffer[3][0]), + minSize, select, &select_len); + + // Apply hard NMS. + if (postNmsTopN < 0) + { + postNmsTopN = select_len; + } + + for (j = 0; (j < select_len && numDetections < postNmsTopN); j++) + { + // find max score and swap to the front. + int32_t max_index = max_element(&(f32_in_buffer[0][scores_index]), + &(select[j]), select_len - j) + j; + swap_element(select, max_index, j); + + // Calculate IoU of the rest, swap to the end (disgard) ifneeded. 
+ for (i = j + 1; i < select_len; i++) + { + int32_t roiBase0 = select[i] * kRoiDim; + int32_t roiBase1 = select[j] * kRoiDim; + float iou = getIoUAxisAligned(&(roiTransformedBuffer[roiBase0]), + &(roiTransformedBuffer[roiBase1])); + + if (iou >= iouThreshold) + { + swap_element(select, i, select_len - 1); + i--; + select_len--; + } + } + numDetections++; + } + + for (i = 0; i < select_len; i++) + { + memcpy(&(f32_out_buffer[1][roi_out_index]), + &(roiTransformedBuffer[select[i] * kRoiDim]), kRoiDim * sizeof(float)); + f32_out_buffer[0][scores_out_index] = + f32_in_buffer[0][scores_index + select[i]]; + f32_out_buffer[2][scores_out_index] = (float)b; + scores_out_index++; + roi_out_index += kRoiDim; + } + + scores_index += batchSize; + bboxDeltas_index += roiBufferSize; + imageInfo_index += imageInfoLength; + } + + vsi_nn_safe_free(roiBuffer); + vsi_nn_safe_free(roiTransformedBuffer); + vsi_nn_safe_free(select); + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _generate_proposals_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _generate_proposals_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GENERATE_PROPOSALS_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + float height_stride = vsi_nn_kernel_param_get_float32( params, "height_stride"); + float width_stride = vsi_nn_kernel_param_get_float32( params, "width_stride"); + int32_t pre_nms_top_n = vsi_nn_kernel_param_get_int32( params, "pre_nms_top_n"); + int32_t post_nms_top_n = vsi_nn_kernel_param_get_int32( params, "post_nms_top_n"); + float iou_threshold = vsi_nn_kernel_param_get_float32(params, "iou_threshold"); + float min_size = vsi_nn_kernel_param_get_float32(params, "min_size"); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GENERATE_PROPOSALS_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[_TENSOR_NUM ] = vsi_nn_kernel_scalar_create( graph, F32, &height_stride ); + node_params[_TENSOR_NUM + 1] = vsi_nn_kernel_scalar_create( graph, F32, 
&width_stride ); + node_params[_TENSOR_NUM + 2] = vsi_nn_kernel_scalar_create( graph, I32, &pre_nms_top_n ); + node_params[_TENSOR_NUM + 3] = vsi_nn_kernel_scalar_create( graph, I32, &post_nms_top_n ); + node_params[_TENSOR_NUM + 4] = vsi_nn_kernel_scalar_create( graph, F32, &iou_threshold ); + node_params[_TENSOR_NUM + 5] = vsi_nn_kernel_scalar_create( graph, F32, &min_size ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _GENERATE_PROPOSALS_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &node_params[_TENSOR_NUM ] ); + vsi_nn_kernel_scalar_release( &node_params[_TENSOR_NUM + 1] ); + vsi_nn_kernel_scalar_release( &node_params[_TENSOR_NUM + 2] ); + vsi_nn_kernel_scalar_release( &node_params[_TENSOR_NUM + 3] ); + vsi_nn_kernel_scalar_release( &node_params[_TENSOR_NUM + 4] ); + vsi_nn_kernel_scalar_release( &node_params[_TENSOR_NUM + 5] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( generate_proposals, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/grucell_activation_z_h_cpu.c b/src/tim/vx/internal/src/kernel/cpu/grucell_activation_z_h_cpu.c new file mode 100644 index 0000000..a5bd220 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/grucell_activation_z_h_cpu.c @@ -0,0 +1,261 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (GRUCELL_ACT_Z_H_IN_CNT) +#define _OUTPUT_NUM (GRUCELL_ACT_Z_H_OUT_CNT) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.grucell_activation_z_h") + + +/* + * Kernel params + */ +static vx_param_description_t _grucell_activation_z_h_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, /*activation*/ + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, /*recurrent_activation*/ +}; +#define _GRUCELL_ACTIVATION_Z_H_PARAM_NUM _cnt_of_array( _grucell_activation_z_h_kernel_param_def ) +#define SCALAR_ACTIVATION (7) +#define SCALAR_R_ACTIVATION (8) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; + vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t i, b; + int32_t activation = 0; + int32_t recurrent_activation = 0; + vsi_size_t n_batch = 0; + vsi_size_t n_cell = 0; + + /* prepare data */ + for ( i = 0; i < _INPUT_NUM; i++ ) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + if (input[i]) + { + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + } + } + + for ( i = 0; i < _OUTPUT_NUM; i++ ) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + if (output[i]) + { + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + } + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ACTIVATION], &activation ); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_R_ACTIVATION], + &recurrent_activation ); + CHECK_STATUS_FAIL_GOTO(status, final ); + n_cell = in_attr[GRUCELL_ACT_Z_H_HSTATE]->shape->data[0]; + n_batch = in_attr[GRUCELL_ACT_Z_H_HSTATE]->shape->data[1]; + + for (b = 0; b < n_batch; b ++) + { + for (i = 0; i < n_cell; i++) + { + vsi_size_t index = i + n_cell * b; + float data_z_t = 0; + float data_h_t = 0; + float hstate_in = f32_in_buffer[GRUCELL_ACT_Z_H_HSTATE][index]; + 
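            /* Per-element GRU update computed below:
             *   z_t  = recurrent_activation(i_fc_z + h_fc_z)   (update gate)
             *   h~_t = activation(i_fc_h + h_fc_h)             (candidate state)
             *   h_t  = (1 - z_t) * h~_t + z_t * h_{t-1}
             * and h_t is written to both the output and the new hidden state.
             * The two activation selectors come from the scalar params; sigmoid
             * for the gate and tanh for the candidate are the usual choices, but
             * any value understood by vsi_nn_activation() is accepted. */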
float dst = 0; + + data_z_t = f32_in_buffer[GRUCELL_ACT_Z_H_I_FC_Z][index]; + data_z_t += f32_in_buffer[GRUCELL_ACT_Z_H_H_FC_Z][index]; + data_z_t = vsi_nn_activation(data_z_t, recurrent_activation); + + data_h_t = f32_in_buffer[GRUCELL_ACT_Z_H_I_FC_H][index]; + data_h_t += f32_in_buffer[GRUCELL_ACT_Z_H_H_FC_H][index]; + data_h_t = vsi_nn_activation(data_h_t, activation); + + dst = (1 - data_z_t ) * data_h_t + data_z_t * hstate_in; + + f32_out_buffer[GRUCELL_ACT_Z_H_OUT_OUTPUT][index] = dst; + f32_out_buffer[GRUCELL_ACT_Z_H_OUT_HSTATE][index] = dst; + } + } + + /* save data */ + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (output[i]) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_SUCCESS; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _grucell_activation_z_h_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _grucell_activation_z_h_kernel_param_def ); + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_ACTIVATION_Z_H_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t activation = vsi_nn_kernel_param_get_int32( params, "activation" ); + int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_ACTIVATION_Z_H_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/ + node_params[SCALAR_ACTIVATION] = vsi_nn_kernel_scalar_create( + graph, I32, &activation ); + node_params[SCALAR_R_ACTIVATION] = vsi_nn_kernel_scalar_create( + graph, I32, &recurrent_activation ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_ACTIVATION_Z_H_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ACTIVATION] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_R_ACTIVATION] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( grucell_activation_z_h, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/grucell_h_times_activation_r_cpu.c b/src/tim/vx/internal/src/kernel/cpu/grucell_h_times_activation_r_cpu.c new file mode 100644 index 0000000..b61f92e --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/grucell_h_times_activation_r_cpu.c @@ -0,0 +1,245 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.grucell_h_times_activation_r") + + +/* + * Kernel params + */ +static vx_param_description_t _grucell_h_times_activation_r_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, /*recurrent_activation*/ + // Add kererl parameters here +}; +#define _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM _cnt_of_array( _grucell_h_times_activation_r_kernel_param_def ) +#define SCALAR_R_ACTIVATION (4) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; + vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t i, b; + int32_t recurrent_activation = 0; + vsi_size_t n_batch = 0; + vsi_size_t n_cell = 0; + + /* prepare data */ + for( i = 0; i < _INPUT_NUM; i++ ) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + if (input[i]) + { + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + } + } + + for( i = 0; i < _OUTPUT_NUM; i++ ) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + if (output[i]) + { + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + } + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_R_ACTIVATION], + &recurrent_activation ); + CHECK_STATUS_FAIL_GOTO(status, final ); + n_cell = in_attr[0]->shape->data[0]; + n_batch = in_attr[0]->shape->data[1]; + + for (b = 0; b < n_batch; b ++) + { + for (i = 0; i < n_cell; i++) + { + vsi_size_t index = i + n_cell * b; + float data_r_t = 0; + float r_times_h = 0; + float hstate_in = f32_in_buffer[0][index]; + + data_r_t = f32_in_buffer[1][index]; + data_r_t += f32_in_buffer[2][index]; + + data_r_t = vsi_nn_activation(data_r_t, recurrent_activation); + + r_times_h = hstate_in * data_r_t; + + f32_out_buffer[0][index] = r_times_h; + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (output[i]) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } 
+ } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_SUCCESS; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _grucell_h_times_activation_r_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _grucell_h_times_activation_r_kernel_param_def ); + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_R_ACTIVATION] = vsi_nn_kernel_scalar_create( + graph, I32, &recurrent_activation ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_R_ACTIVATION] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( grucell_h_times_activation_r, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/grucell_reset_after_activation_cpu.c b/src/tim/vx/internal/src/kernel/cpu/grucell_reset_after_activation_cpu.c new file mode 100644 index 0000000..cfd0eb1 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/grucell_reset_after_activation_cpu.c @@ -0,0 +1,271 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (GRUCELL_ACT_IN_CNT) +#define _OUTPUT_NUM (GRUCELL_ACT_OUT_CNT) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.grucell_reset_after_activation") + + +/* + * Kernel params + */ +static vx_param_description_t _grucell_reset_after_activation_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, /*activation*/ + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, /*recurrent_activation*/ + // Add kererl parameters here +}; +#define _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM _cnt_of_array( _grucell_reset_after_activation_kernel_param_def ) +#define SCALAR_ACTIVATION (9) +#define SCALAR_R_ACTIVATION (10) +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; + vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t i, b; + int32_t activation = 0; + int32_t recurrent_activation = 0; + vsi_size_t n_batch = 0; + vsi_size_t n_cell = 0; + + /* prepare data */ + for ( i = 0; i < _INPUT_NUM; i++ ) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + if (input[i]) + { + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + } + } + + for ( i = 0; i < _OUTPUT_NUM; i++ ) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + if (output[i]) + { + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] 
); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + } + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ACTIVATION], &activation ); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_R_ACTIVATION], + &recurrent_activation ); + CHECK_STATUS_FAIL_GOTO(status, final ); + n_cell = in_attr[GRUCELL_ACT_H_STATE]->shape->data[0]; + n_batch = in_attr[GRUCELL_ACT_H_STATE]->shape->data[1]; + + for (b = 0; b < n_batch; b ++) + { + for (i = 0; i < n_cell; i++) + { + vsi_size_t index = i + n_cell * b; + float data_z_t = 0; + float data_r_t = 0; + float data_h_t = 0; + float r_times_h = 0; + float hstate_in = f32_in_buffer[GRUCELL_ACT_H_STATE][index]; + float dst = 0; + + data_z_t = f32_in_buffer[GRUCELL_ACT_I_FC_Z][index]; + data_r_t = f32_in_buffer[GRUCELL_ACT_I_FC_R][index]; + data_h_t = f32_in_buffer[GRUCELL_ACT_I_FC_H][index]; + data_z_t += f32_in_buffer[GRUCELL_ACT_H_FC_Z][index]; + data_r_t += f32_in_buffer[GRUCELL_ACT_H_FC_R][index]; + + data_z_t = vsi_nn_activation(data_z_t, recurrent_activation); + data_r_t = vsi_nn_activation(data_r_t, recurrent_activation); + + r_times_h = f32_in_buffer[GRUCELL_ACT_H_FC_H][index] * data_r_t; + data_h_t += r_times_h; + + data_h_t = vsi_nn_activation(data_h_t, activation); + + dst = (1 - data_z_t ) * data_h_t + data_z_t * hstate_in; + + f32_out_buffer[GRUCELL_ACT_OUT_OUTPUT][index] = dst; + f32_out_buffer[GRUCELL_ACT_OUT_H_STATE][index] = dst; + } + } + + /* save data */ + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (output[i]) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_SUCCESS; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _grucell_reset_after_activation_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _grucell_reset_after_activation_kernel_param_def ); + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t activation = vsi_nn_kernel_param_get_int32( params, "activation" ); + int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); + + status = _query_kernel( kernel, 
inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_ACTIVATION] = vsi_nn_kernel_scalar_create( + graph, I32, &activation ); + node_params[SCALAR_R_ACTIVATION] = vsi_nn_kernel_scalar_create( + graph, I32, &recurrent_activation ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ACTIVATION] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_R_ACTIVATION] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( grucell_reset_after_activation, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c b/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c index 94b64d6..eff26ed 100644 --- a/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c @@ -87,8 +87,8 @@ DEF_KERNEL_EXECUTOR(_prelu_exec) attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - vsi_nn_kernel_tensor_attr_get_stride( attr[0], stride_size[0] ); - vsi_nn_kernel_tensor_attr_get_stride( attr[1], stride_size[1] ); + vsi_nn_shape_get_stride( attr[0]->shape->data, (vsi_size_t)attr[0]->shape->size, stride_size[0] ); + vsi_nn_shape_get_stride( attr[1]->shape->data, (vsi_size_t)attr[1]->shape->size, stride_size[1] ); out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_nhwc_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_nhwc_cpu.c new file mode 100644 index 0000000..adb0620 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_nhwc_cpu.c @@ -0,0 +1,307 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.resize_bilinear_nhwc") + + +/* + * Kernel params + */ +static vx_param_description_t _resize_bilinear_nhwc_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _RESIZE_BILINEAR_NHWC_PARAM_NUM _cnt_of_array( _resize_bilinear_nhwc_kernel_param_def ) + +#define SCALAR_ALIGN_CORNERS (2) +#define SCALAR_HALF_PIXEL (3) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + int32_t align_corners; + int32_t half_pixel_centers; + float width_scale; + float height_scale; + vsi_size_t input_width, output_width, input_height, output_height; + vsi_size_t b = 0, d = 0, w = 0, h = 0; + vsi_size_t output_depth, input_depth; + vsi_size_t output_batch; + vsi_size_t output_dims; + float data00 = .0f, data01 = .0f, data10 = .0f, data11 = .0f, interpolation = .0f; + vsi_size_t input_width_orig; + vsi_size_t output_width_orig; + vsi_size_t index; + + /* prepare data */ + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_HALF_PIXEL], &(half_pixel_centers)); + input_width = in_attr[0]->shape->data[1]; + input_height = in_attr[0]->shape->data[2]; + output_width = out_attr[0]->shape->data[1]; + output_height = out_attr[0]->shape->data[2]; + output_dims = 
(vsi_size_t)out_attr[0]->shape->size; + output_depth = out_attr[0]->shape->data[0]; + output_batch = output_dims > 3 ? out_attr[0]->shape->data[3] : 1; + input_depth = in_attr[0]->shape->data[0]; + input_width_orig = input_width; + output_width_orig = output_width; + + if (align_corners && output_width > 1) + { + width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(output_width - 1); + } + else + { + width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)output_width; + } + + if (align_corners && output_height > 1) + { + height_scale = ((vx_float32)(input_height - 1) * 1.0f) / (vx_float32)(output_height - 1); + } + else + { + height_scale = ((vx_float32)input_height * 1.0f) / (vx_float32)output_height; + } + + for (b = 0; b < output_batch; b ++) + { + vsi_ssize_t input_base = b * input_depth * input_width_orig * input_height; + vsi_ssize_t output_base = b * output_depth * output_width_orig * output_height; + + for (h = 0; h < output_height; h++) + { + vx_float32 input_h = h * height_scale; + vsi_size_t h0; + vsi_size_t h1; + + if (half_pixel_centers) + { + input_h = ((vx_float32)h + 0.5f) * height_scale - 0.5f; + } + else + { + input_h = h * height_scale; + } + h0 = (vsi_size_t)input_h; + h1 = input_h < 0 ? 0 : vsi_nn_min(h0 + 1, input_height - 1); + for (w = 0; w < output_width; w++) + { + vx_float32 input_w; + vsi_ssize_t w0; + vsi_ssize_t w1; + if (half_pixel_centers) + { + input_w = ((vx_float32)w + 0.5f) * width_scale - 0.5f; + } + else + { + input_w = w * width_scale; + } + w0 = (vsi_ssize_t)input_w; + w1 = input_w < 0 ? 0 : vsi_nn_min(w0 + 1, (vsi_ssize_t)(input_width - 1)); + + for (d = 0; d < output_depth; d++) + { + index = input_base + h0 * input_width_orig * input_depth + w0 * input_depth + d; + data00 = f32_in_buffer[0][index]; + index = input_base + h0 * input_width_orig * input_depth + w1 * input_depth + d; + data01 = f32_in_buffer[0][index]; + index = input_base + h1 * input_width_orig * input_depth + w0 * input_depth + d; + data10 = f32_in_buffer[0][index]; + index = input_base + h1 * input_width_orig * input_depth + w1 * input_depth + d; + data11 = f32_in_buffer[0][index]; + + interpolation = data00 * (1 - (input_h - h0)) * (1 - (input_w - w0)) + + data10 * (input_h - h0) * (1 - (input_w - w0)) + + data01 * (1 - (input_h - h0)) * (input_w - w0) + + data11 * (input_h - h0) * (input_w - w0); + index = output_base + h * output_width_orig * output_depth + w * output_depth + d; + f32_out_buffer[0][index] = interpolation; + } + } + } + } + + /* save data */ + for (i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + 
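+    /* Descriptive note (editorial): no initializer is registered for this CPU
+     * reference kernel. The executor above reads inputs as float32 via
+     * vsi_nn_kernel_tensor_create_buffer and writes results back with
+     * vsi_nn_kernel_tensor_write_from_float, so the kernel name, the executor
+     * and the fixed parameter table below are all the runtime needs here. */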
kernel->info.parameters = _resize_bilinear_nhwc_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _resize_bilinear_nhwc_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_RESIZE_BILINEAR_NHWC_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); + node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( resize_bilinear_nhwc, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c b/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c index 78c7752..303b3fb 100644 --- a/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c @@ -60,19 +60,6 @@ static vx_param_description_t _topk_kernel_param_def[] = }; #define _TOPK_PARAM_NUM _cnt_of_array( _topk_kernel_param_def ) -static uint32_t _max_comp_func(void* data, int32_t left, int32_t right) -{ - float* fdata = (float*)data; - if (fdata[left] >= fdata[right]) - { - return TRUE; - } - else - { - return FALSE; - } -} - static void _find_top_k_1d ( float* input, @@ -81,37 +68,35 @@ static void _find_top_k_1d float* value, uint32_t* indices ) -{ - int32_t low = 0; - int32_t high = input_len - 1; - int32_t j; - - for (j = 0; j < (int32_t)input_len; j++) +{ // Insertion sort + float insert_elem; + uint32_t position,index=0; + uint32_t i, j; + for (i = 0; i < input_len; i++) { - indices[j] = j; - } - - j = vsi_nn_partition(input, low, high, _max_comp_func, FALSE, indices); - - //part_sort - while (j != (int32_t)k) - { - if ((int32_t)k > j) + insert_elem = input[i]; + // Record the position of the target element, + // and start traversing from this position forward + position = i; + index = position; + // Traverse forward from position to find the insertion position of the target element + while (position > 0 && input[position - 1] < insert_elem) { - low = j + 1; + // The element at position moves one position backward, index will also move with it + input[position] = input[position - 1]; + indices[position] = indices[position - 1]; + position--; } - else + // Insert and record the final position + if (position != i) { - high = j; + input[position] = insert_elem; } - j = vsi_nn_partition(input, low, high, _max_comp_func, FALSE, indices); + 
indices[position] = index; } - //all_sort - vsi_nn_partition(input, 0, k - 1, _max_comp_func, TRUE, indices); - - for (j = 0; j < (int32_t)k; j++) + for (j = 0; j < k; j++) { - value[j] = input[indices[j]]; + value[j] = input[j]; } } @@ -138,7 +123,7 @@ DEF_KERNEL_EXECUTOR(_compute) uint32_t i = 0; int32_t j = 0; int32_t top_k = 0; - uint32_t block_num = 0; + uint32_t block_num = 1; uint32_t block_size = 0; uint32_t * indices_ptr = NULL; @@ -166,7 +151,11 @@ DEF_KERNEL_EXECUTOR(_compute) status = vsi_nn_kernel_scalar_read_int32( param[3], &top_k ); CHECK_STATUS_FAIL_GOTO(status, final ); - block_num = (uint32_t)in_attr[0]->shape->data[1]; + for(i = (uint32_t)in_attr[0]->shape->size - 1; i > 0; i--) + { + block_num = block_num * (uint32_t)in_attr[0]->shape->data[i]; + } + block_size = (uint32_t)in_attr[0]->shape->data[0]; indices_ptr = (uint32_t*)malloc(block_size * sizeof(uint32_t)); CHECK_PTR_FAIL_GOTO( indices_ptr, "Create indices buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c index 74dfc35..679a07d 100644 --- a/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c @@ -375,7 +375,6 @@ final: vsi_nn_kernel_tensor_attr_release(&output_attr); } return status; - } /* _add_mean_std_norm_initializer() */ @@ -433,7 +432,6 @@ static vsi_status _query_kernel } return status; - } /* _query_kernel() */ @@ -467,7 +465,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U8 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vsi_nn_kernel_node_set_border( node, &border ); VSI_ASSERT( status == VSI_SUCCESS ); @@ -484,10 +482,8 @@ static vsi_nn_kernel_node_t _setup } return node; - } /* _setup() */ __END_DECLS REGISTER_BACKEND_EVIS( add_mean_std_norm, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c index a52e76a..d74b7be 100644 --- a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c @@ -699,4 +699,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( conv1d_ovxlib, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c index 8888e15..feab3a0 100644 --- a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c @@ -783,7 +783,7 @@ static vsi_nn_kernel_node_t _setup if (VSI_NN_TYPE_UINT8 == inputs[0]->attr.dtype.vx_type && VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC == inputs[0]->attr.dtype.qnt_type) { - border.constant_value.U8 = (uint8_t)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } else { @@ -835,4 +835,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( depthwise_conv1d, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c index e78d9a9..1b99cb1 100644 --- a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c @@ -336,10 +336,12 @@ static vx_param_description_t 
kernel_param_def[] = {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define INPUT_FUNC_TYPE (2) #define INPUT_SCALAR_ALPHA (3) +#define INPUT_SCALAR_BETA (4) #define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) /* @@ -368,6 +370,7 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) float outputScale = 1.0f; float outputZP = 0; float alpha = 0; + float beta = 0; uint32_t pack_key; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); @@ -379,6 +382,8 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[INPUT_SCALAR_ALPHA], &alpha); CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[INPUT_SCALAR_BETA], &beta); + CHECK_STATUS_FAIL_GOTO(status, final ); out_shape = attr[1]->shape; @@ -487,6 +492,8 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); status |= vsi_nn_kernel_gpu_add_param( node, "alpha", &alpha ); + status |= vsi_nn_kernel_gpu_add_param( node, + "beta", &beta ); CHECK_STATUS_FAIL_GOTO(status, final ); } break; @@ -547,6 +554,8 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) "outputZP", &outputZP ); status |= vsi_nn_kernel_gpu_add_param( node, "alpha", &alpha ); + status |= vsi_nn_kernel_gpu_add_param( node, + "beta", &beta ); if (attr[1]->dtype == F16) { @@ -638,6 +647,7 @@ static vsi_nn_kernel_node_t _setup vsi_size_t new_rank = 0; vsi_bool ret = FALSE; float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); + float beta = vsi_nn_kernel_param_get_float32( params, "beta" ); ret = vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, @@ -670,6 +680,8 @@ static vsi_nn_kernel_node_t _setup graph, I32, &unary_type ); node_params[INPUT_SCALAR_ALPHA] = vsi_nn_kernel_scalar_create( graph, F32, &alpha ); + node_params[INPUT_SCALAR_BETA] = vsi_nn_kernel_scalar_create( + graph, F32, &beta ); /* Pass parameters to node. 
*/ status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); @@ -698,6 +710,11 @@ OnError: vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_ALPHA] ); } + if (node_params[INPUT_SCALAR_BETA]) + { + vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_BETA] ); + } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c index d49d92d..e5b12f7 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c @@ -289,7 +289,8 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) } shaderParam.global_scale[0] = 16; - if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || + attr[0]->dtype == BF16 || attr[0]->dtype == U16) { shaderParam.global_scale[0] = 8; } diff --git a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c index c206930..78e9efe 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c @@ -51,6 +51,7 @@ __BEGIN_DECLS typedef enum { + _error = -1, _1D = 0, _2D, _3D @@ -168,6 +169,10 @@ static vsi_status get_gather_nd_tensor_reshape_size sizes[0] = block_size; sizes[1] = elementCnt / block_size; } + else if(coordDim == 4) + { + newDim[0] = 3; + } status = VSI_SUCCESS; } @@ -381,7 +386,7 @@ static vsi_status _query_kernel vsi_status status = VSI_FAILURE; vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; - vsi_nn_kernel_coord_type_e coord_type = _1D; + vsi_nn_kernel_coord_type_e coord_type = _error; uint32_t key = 0; int i = 0; @@ -404,7 +409,7 @@ static vsi_status _query_kernel { coord_type = _2D; } - else if(coord_dim == 3) + else if(coord_dim == 3 || coord_dim == 4) { coord_type = _3D; } diff --git a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c index f70df19..2894f11 100644 --- a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c @@ -56,6 +56,9 @@ typedef enum #define KERNEL_SOURCE_3 "group_normalization_i16" #define KERNEL_SOURCE_4 "group_normalization_f16" #define KERNEL_SOURCE_5 "group_normalization_u8_f16" +#define KERNEL_SOURCE_6 "group_normalization_i8_scale" +#define KERNEL_SOURCE_7 "group_normalization_i16_scale" +#define KERNEL_SOURCE_8 "group_normalization_f16_scale" #define HASH_GROUPNORM_SUM_SQR_SH_KERNEL_NAME(SRC0_TYPE) \ CVIVANTE_NAMESPACE("evis.group_norm_sumsqr_"#SRC0_TYPE) @@ -72,6 +75,12 @@ typedef enum #define HASH_GROUPNORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") +#define HASH_GROUPNORM_SCALE_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE) + +#define HASH_GROUPNORM_SCALE_SH_KERNEL_2D_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_2D") + // Add kernel hashtable here // Sum Sqr #define HASH_GROUPNORM_SUM_SQR_KEY(_input0_type, _output_type, _reshape_flag) \ @@ -96,19 +105,29 @@ typedef enum SOURCE }, // normalization -#define HASH_GROUPNORM_KEY(_input0_type, _output_type, _reshape_flag) \ - ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) +#define HASH_GROUPNORM_KEY(_input0_type, _input1_type, _output_type, 
_reshape_flag) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_reshape_flag << 4)) #define TENSOR_GROUPNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_GROUPNORM_KEY(IN0_TYPE, OUT_TYPE, 0), \ + { HASH_GROUPNORM_KEY(IN0_TYPE, F16, OUT_TYPE, 0), \ HASH_GROUPNORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_GROUPNORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_GROUPNORM_KEY(IN0_TYPE, OUT_TYPE, 1), \ + { HASH_GROUPNORM_KEY(IN0_TYPE, F16, OUT_TYPE, 1), \ HASH_GROUPNORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, +#define TENSOR_GROUPNORM_SCALE_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + HASH_GROUPNORM_SCALE_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_GROUPNORM_SCALE_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + HASH_GROUPNORM_SCALE_SH_KERNEL_2D_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + typedef struct { uint32_t key; @@ -157,6 +176,26 @@ static const _kernel_map_type _groupnorm_kernel_map[] = TENSOR_GROUPNORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_4 ) TENSOR_GROUPNORM_KERNELS( F16, U8, KERNEL_SOURCE_4 ) TENSOR_GROUPNORM_KERNELS_2D( F16, U8, KERNEL_SOURCE_4 ) + + TENSOR_GROUPNORM_SCALE_KERNELS( U8, F32, U8, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( U8, F32, U8, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS( U8, F32, F16, KERNEL_SOURCE_5 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( U8, F32, F16, KERNEL_SOURCE_5 ) + + TENSOR_GROUPNORM_SCALE_KERNELS( I8, F32, I8, KERNEL_SOURCE_6 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I8, F32, I8, KERNEL_SOURCE_6 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I8, F32, F16, KERNEL_SOURCE_6 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I8, F32, F16, KERNEL_SOURCE_6 ) + + TENSOR_GROUPNORM_SCALE_KERNELS( I16, F32, I16, KERNEL_SOURCE_7 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I16, F32, I16, KERNEL_SOURCE_7 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I16, F32, F16, KERNEL_SOURCE_7 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I16, F32, F16, KERNEL_SOURCE_7 ) + + TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, U8, KERNEL_SOURCE_8 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, U8, KERNEL_SOURCE_8 ) + TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, F16, KERNEL_SOURCE_8 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, F16, KERNEL_SOURCE_8 ) }; /* @@ -483,7 +522,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) {0, 0, 0}, // localWorkSize: local group size in thread {0, 0, 0}}; // globalWorkSize: image size in thread - vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; + vsi_nn_kernel_tensor_attr_t* attr[4] = {NULL, NULL, NULL, NULL}; vsi_size_array_t * input_shape = NULL; float scaleIn = 1.0f; float scaleOut = 1.0f; @@ -501,6 +540,8 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &is2D); CHECK_STATUS_FAIL_GOTO(status, OnError ); @@ -735,8 +776,14 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype ); - status = vsi_nn_kernel_gpu_add_param(node, "height", 
&height); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); + if (attr[3]->dtype != F32) + { + status = vsi_nn_kernel_gpu_add_param(node, "height", &height); + } + if (!(attr[3]->dtype == F32 && (attr[0]->dtype == I16 || attr[0]->dtype == I8))) + { + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); + } CHECK_STATUS_FAIL_GOTO(status, OnError ); switch( pack_key ) @@ -865,6 +912,11 @@ OnError: vsi_nn_kernel_tensor_attr_release( &attr[2] ); attr[2] = NULL; } + if (attr[3]) + { + vsi_nn_kernel_tensor_attr_release( &attr[3] ); + attr[3] = NULL; + } return status; } @@ -1001,6 +1053,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t tmp_node = NULL, tmp_node1 = NULL; vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_dtype_e in0_dtype = U8; + vsi_nn_kernel_dtype_e in2_dtype = F16; vsi_nn_kernel_dtype_e out_dtype = U8; vsi_nn_tensor_attr_t attr; vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL }; @@ -1040,11 +1093,12 @@ static vsi_nn_kernel_node_t _setup } in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); hashkeys[SUM_SQR_INDEX]= HASH_GROUPNORM_SUM_SQR_KEY( in0_dtype, F32, is2D_flg ); hashkeys[MEAN_VARI_INDEX]= HASH_GROUPNORM_MEAN_VARI_KEY( F32, F32 ); - hashkey = HASH_GROUPNORM_KEY( in0_dtype, out_dtype, is2D_flg ); + hashkey = HASH_GROUPNORM_KEY( in0_dtype, in2_dtype, out_dtype, is2D_flg ); status = _query_kernel( ikernels[SUM_SQR_INDEX], hashkeys[SUM_SQR_INDEX], INTERNAL_KERNEL_SUM_SQR ); if ( VSI_SUCCESS != status ) @@ -1104,7 +1158,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U16 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -1134,7 +1188,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U16 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)tmp_node1, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -1177,7 +1231,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U16 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -1216,4 +1270,3 @@ final: __END_DECLS REGISTER_BACKEND_EVIS( group_norm, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c new file mode 100644 index 0000000..69057be --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c @@ -0,0 +1,382 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this 
software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum _grucell_nn_activation_type_e +{ + SIGMOID = VSI_NN_ACT_SIGMOID, + HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, +}grucell_nn_activation_type_e; + +#define _GRUCELL_ACTIVATION_Z_H_KERNEL_SOURCE "grucell_activation_z_h" + +// Add kernel hashtable here +#define GRUCELL_ACTIVATION_Z_H_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + (( HSTATE_DTYPE ) | ( IN_FC_DTYPE << 6 ) | ( OUT_TYPE << 12 ) | ( REC_ACT << 18 )) + +#define PACK_KERNEL_MAP( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + { GRUCELL_ACTIVATION_Z_H_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ), \ + CVIVANTE_NAMESPACE("evis.grucell_activation_z_h_"#HSTATE_DTYPE"_"#IN_FC_DTYPE"to"#OUT_TYPE"_"#REC_ACT), \ + _GRUCELL_ACTIVATION_Z_H_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _grucell_activation_z_h_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, F16, U8, SIGMOID ), + PACK_KERNEL_MAP( I8, F16, I8, SIGMOID ), + PACK_KERNEL_MAP( I16, F16, I16, SIGMOID ), + PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ), +}; + +/* + * Kernel params + */ +static vx_param_description_t _grucell_activation_z_h_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _GRUCELL_ACTIVATION_Z_H_PARAM_NUM _cnt_of_array( _grucell_activation_z_h_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + 
vsi_nn_kernel_tensor_t hstate_out = NULL; + vsi_nn_kernel_tensor_t output = NULL; + float hstate_in_scale = 1.0f; + float hstate_in_tail = 0; + float output_scale = 1.0f; + float output_zp = 0; + uint32_t i = 0; + uint32_t pack_key = 0; + vsi_nn_kernel_tensor_attr_t* input_attr[GRUCELL_ACT_Z_H_IN_CNT] = {NULL}; + vsi_nn_kernel_tensor_attr_t* output_attr[2] = {NULL}; +#define _PACK_SELECT_KEY( hstate_type, fc_type, output_type ) \ + (hstate_type | (fc_type << 8) | (output_type << 16)) + + output = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_Z_H_IN_CNT + GRUCELL_ACT_Z_H_OUT_OUTPUT]; + hstate_out = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_Z_H_IN_CNT + GRUCELL_ACT_Z_H_OUT_HSTATE]; + + for (i = 0; i < GRUCELL_ACT_Z_H_IN_CNT; i++) + { + input_attr[i] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[i] ); + CHECK_PTR_FAIL_GOTO( input_attr[i], "Create tensor attr buffer fail.", final ); + } + + output_attr[0] = vsi_nn_kernel_tensor_attr_create( output ); + CHECK_PTR_FAIL_GOTO( output_attr[0], "Create tensor attr buffer fail.", final ); + output_attr[1] = vsi_nn_kernel_tensor_attr_create( hstate_out ); + CHECK_PTR_FAIL_GOTO( output_attr[1], "Create tensor attr buffer fail.", final ); + + if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant ) + { + int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl; + if (srcFixPointPos >= 0) + hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); + else if (srcFixPointPos < 0) + hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos); + } + else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant ) + { + hstate_in_scale = input_attr[0]->asymm.scale; + hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale; + } + + if ( VSI_NN_KERNEL_QUANT_DFP == output_attr[0]->quant ) + { + int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl; + if (srcFixPointPos >= 0) + output_scale *= (vx_float32)((int64_t)1 << srcFixPointPos); + else if (srcFixPointPos < 0) + output_scale *= 1.0f / (vx_float32) ((int64_t)1 << -srcFixPointPos); + } + else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr[0]->quant ) + { + output_scale = 1.0f / output_attr[0]->asymm.scale; + output_zp = (float)output_attr[0]->asymm.zero_point; + } + + pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype); + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((output_attr[1]->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_attr[1]->shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + + switch (pack_key) + { + case _PACK_SELECT_KEY(F16, F16, F16): + { + gpu_dp_inst_t uniExtractH4_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniF16PlusF16_0_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertF16_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 
0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractH4_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniF16PlusF16_0_4x4", &uniF16PlusF16_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertF16_0_4x4", &uniConvertF16_0_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY(U8, F16, U8): + case _PACK_SELECT_KEY(I8, F16, I8): + case _PACK_SELECT_KEY(I16, F16, I16): + { + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniF16PlusF16_0_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertF16_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniF16PlusF16_0_4x4", &uniF16PlusF16_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertF16_0_4x4", &uniConvertF16_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "hstate_in_scale", &hstate_in_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "hstate_in_tail", &hstate_in_tail); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + for (i = 0; i < GRUCELL_ACT_Z_H_IN_CNT; i++) + { + if (input_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &input_attr[i] ); + } + } + if (output_attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &output_attr[0] ); + } + + if (output_attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &output_attr[1] ); + } + return status; +} /* _grucell_activation_z_h_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t recurrent_activation + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e hstate_dtype; + vsi_nn_kernel_dtype_e fc_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _grucell_activation_z_h_kernel_map; + size_t kernel_map_size = _cnt_of_array( _grucell_activation_z_h_kernel_map ); + vx_param_description_t * param_def = 
_grucell_activation_z_h_kernel_param_def; + vx_kernel_initialize_f initializer = _grucell_activation_z_h_initializer; + + uint32_t key; + uint32_t i; + + hstate_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_Z_H_HSTATE]->attr.dtype.vx_type ); + fc_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_Z_H_I_FC_Z]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]->attr.dtype.vx_type ); + + key = GRUCELL_ACTIVATION_Z_H_HASH_KEY( hstate_dtype, fc_dtype, out_dtype, recurrent_activation ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _grucell_activation_z_h_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_ACTIVATION_Z_H_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t activation = vsi_nn_kernel_param_get_int32( params, "activation" ); + int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); + + if( activation != VSI_NN_ACT_TANH ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, recurrent_activation ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_ACTIVATION_Z_H_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_ACTIVATION_Z_H_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( grucell_activation_z_h, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c new file mode 100644 index 0000000..5ba28e6 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c @@ -0,0 +1,352 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum _grucell_nn_activation_type_e +{ + SIGMOID = VSI_NN_ACT_SIGMOID, + HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, +}grucell_nn_activation_type_e; + +#define _GRUCELL_H_TIMES_ACTIVATION_R_KERNEL_SOURCE "grucell_h_times_activation_r" + +// Add kernel hashtable here +#define GRUCELL_H_TIMES_ACTIVATION_R_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + (( HSTATE_DTYPE ) | ( IN_FC_DTYPE << 6 ) | ( OUT_TYPE << 12 ) | ( REC_ACT << 18 )) +#define PACK_KERNEL_MAP( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + { GRUCELL_H_TIMES_ACTIVATION_R_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ), \ +CVIVANTE_NAMESPACE("evis.grucell_h_times_activation_r_"#HSTATE_DTYPE"_"#IN_FC_DTYPE"to"#OUT_TYPE"_"#REC_ACT), \ +_GRUCELL_H_TIMES_ACTIVATION_R_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _grucell_h_times_activation_r_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, F16, F16, SIGMOID ), + PACK_KERNEL_MAP( I8, F16, F16, SIGMOID ), + PACK_KERNEL_MAP( I16, F16, F16, SIGMOID ), + PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _grucell_h_times_activation_r_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM _cnt_of_array( _grucell_h_times_activation_r_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_grucell_h_times_activation_r_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t output = NULL; + float hstate_in_scale = 1.0f; + float hstate_in_tail = 0; + uint32_t i = 0; + uint32_t pack_key = 0; + vsi_nn_kernel_tensor_attr_t* input_attr[2] = {NULL}; + vsi_nn_kernel_tensor_attr_t* output_attr[1] = {NULL}; +#define _PACK_SELECT_KEY( hstate_type, fc_type, output_type ) \ + (hstate_type | (fc_type << 8) | (output_type << 16)) + + output = (vsi_nn_kernel_tensor_t)param[3]; + + for (i = 0; i < 2; i++) + { + input_attr[i] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[i] ); + CHECK_PTR_FAIL_GOTO( input_attr[i], "Create tensor attr buffer fail.", final ); + } 
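+    /* Descriptive note (editorial): the quantization handling below mirrors the
+     * other grucell evis kernels. For dynamic fixed point the h_state dequantize
+     * scale is 2^-fl (a negative fl multiplies by 2^|fl| instead of dividing),
+     * and for asymmetric affine quantization real = scale * (q - zero_point),
+     * which is exactly what hstate_in_scale and hstate_in_tail encode. */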
+ + output_attr[0] = vsi_nn_kernel_tensor_attr_create( output ); + CHECK_PTR_FAIL_GOTO( output_attr[0], "Create tensor attr buffer fail.", final ); + + if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant ) + { + int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl; + if (srcFixPointPos >= 0) + hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); + else if (srcFixPointPos < 0) + hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos); + } + else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant ) + { + hstate_in_scale = input_attr[0]->asymm.scale; + hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale; + } + + pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype); + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((input_attr[0]->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (input_attr[0]->shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + + switch (pack_key) + { + case _PACK_SELECT_KEY(F16, F16, F16): + { + gpu_dp_inst_t uniExtractH4_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniF16PlusF16_0_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertF16_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractH4_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniF16PlusF16_0_4x4", &uniF16PlusF16_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertF16_0_4x4", &uniConvertF16_0_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY(U8, F16, F16): + case _PACK_SELECT_KEY(I8, F16, F16): + case _PACK_SELECT_KEY(I16, F16, F16): + { + gpu_dp_inst_t uniExtractH4_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniF16PlusF16_0_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertF16_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 
0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractH4_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniF16PlusF16_0_4x4", &uniF16PlusF16_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertF16_0_4x4", &uniConvertF16_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "hstate_in_scale", &hstate_in_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "hstate_in_tail", &hstate_in_tail); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + for (i = 0; i < 2; i++) + { + if (input_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &input_attr[i] ); + } + } + + if (output_attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &output_attr[0] ); + } + + return status; +} /* _grucell_h_times_activation_r_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t recurrent_activation + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e hstate_dtype; + vsi_nn_kernel_dtype_e fc_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _grucell_h_times_activation_r_kernel_map; + size_t kernel_map_size = _cnt_of_array( _grucell_h_times_activation_r_kernel_map ); + vx_param_description_t * param_def = _grucell_h_times_activation_r_kernel_param_def; + vx_kernel_initialize_f initializer = _grucell_h_times_activation_r_initializer; + + uint32_t key; + uint32_t i; + + hstate_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_H_STATE]->attr.dtype.vx_type ); + fc_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_I_FC_Z]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dtype.vx_type ); + + key = GRUCELL_H_TIMES_ACTIVATION_R_HASH_KEY( hstate_dtype, fc_dtype, out_dtype, recurrent_activation ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _grucell_h_times_activation_r_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); 
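+    /* Descriptive note (editorial): recurrent_activation is folded into the
+     * kernel hash key, and only SIGMOID variants are registered in the map
+     * above; any other recurrent activation leaves _query_kernel at
+     * VSI_FAILURE, so _setup returns NULL and no node is created. */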
+ + status = _query_kernel( kernel, inputs, outputs, recurrent_activation ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( grucell_h_times_activation_r, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c new file mode 100644 index 0000000..0c35aea --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c @@ -0,0 +1,389 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum _grucell_nn_activation_type_e +{ + SIGMOID = VSI_NN_ACT_SIGMOID, + HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, +}grucell_nn_activation_type_e; + +#define _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE "grucell_reset_after_activation" + +// Add kernel hashtable here +#define GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + (( HSTATE_DTYPE ) | ( IN_FC_DTYPE << 6 ) | ( OUT_TYPE << 12 ) | ( REC_ACT << 18 )) +#define PACK_KERNEL_MAP( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + { GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ), \ +CVIVANTE_NAMESPACE("evis.grucell_reset_after_activation_"#HSTATE_DTYPE"_"#IN_FC_DTYPE"to"#OUT_TYPE"_"#REC_ACT), \ +_GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, F16, U8, SIGMOID ), + PACK_KERNEL_MAP( I8, F16, I8, SIGMOID ), + PACK_KERNEL_MAP( I16, F16, I16, SIGMOID ), + PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _grucell_reset_after_activation_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM _cnt_of_array( _grucell_reset_after_activation_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_grucell_reset_after_activation_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t hstate_out = NULL; + vsi_nn_kernel_tensor_t output = NULL; + float hstate_in_scale = 1.0f; + float hstate_in_tail = 0; + float output_scale = 1.0f; + float output_zp = 0; + uint32_t i = 0; + uint32_t pack_key = 0; + vsi_nn_kernel_tensor_attr_t* input_attr[GRUCELL_ACT_IN_CNT] = {NULL}; + vsi_nn_kernel_tensor_attr_t* output_attr[2] = {NULL}; +#define _PACK_SELECT_KEY( hstate_type, fc_type, output_type ) \ + (hstate_type | (fc_type << 8) | (output_type << 16)) + + + output = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_IN_CNT + GRUCELL_ACT_OUT_OUTPUT]; + hstate_out = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_IN_CNT + GRUCELL_ACT_OUT_H_STATE]; + + for (i = 0; i < GRUCELL_ACT_IN_CNT; i++) + { + input_attr[i] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[i] ); + CHECK_PTR_FAIL_GOTO( input_attr[i], "Create tensor attr buffer fail.", final ); + } + + output_attr[0] = vsi_nn_kernel_tensor_attr_create( output ); + CHECK_PTR_FAIL_GOTO( output_attr[0], "Create tensor attr buffer fail.", final ); + output_attr[1] = vsi_nn_kernel_tensor_attr_create( hstate_out ); + CHECK_PTR_FAIL_GOTO( output_attr[1], "Create tensor attr buffer fail.", final ); + + if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant ) + { 
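+        /* Descriptive note (editorial): the h_state input is dynamic fixed
+         * point here, so it is dequantized with scale 2^-fl, where a negative
+         * fl multiplies by 2^|fl| instead of dividing. */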
+ int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl; + if (srcFixPointPos >= 0) + hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); + else if (srcFixPointPos < 0) + hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos); + } + else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant ) + { + hstate_in_scale = input_attr[0]->asymm.scale; + hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale; + } + + if ( VSI_NN_KERNEL_QUANT_DFP == output_attr[0]->quant ) + { + int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl; + if (srcFixPointPos >= 0) + output_scale *= (vx_float32)((int64_t)1 << srcFixPointPos); + else if (srcFixPointPos < 0) + output_scale *= 1.0f / (vx_float32) ((int64_t)1 << -srcFixPointPos); + } + else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr[0]->quant ) + { + output_scale = 1.0f / output_attr[0]->asymm.scale; + output_zp = (float)output_attr[0]->asymm.zero_point; + } + + pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype); + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((output_attr[1]->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_attr[1]->shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + + switch (pack_key) + { + case _PACK_SELECT_KEY(F16, F16, F16): + { + gpu_dp_inst_t uniExtractH4_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniF16PlusF16_0_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertF16_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractH4_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniF16PlusF16_0_4x4", &uniF16PlusF16_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertF16_0_4x4", &uniConvertF16_0_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY(U8, F16, U8): + case _PACK_SELECT_KEY(I8, F16, I8): + case _PACK_SELECT_KEY(I16, F16, I16): + { + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniF16PlusF16_0_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 
0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertF16_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniF16PlusF16_0_4x4", &uniF16PlusF16_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertF16_0_4x4", &uniConvertF16_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "hstate_in_scale", &hstate_in_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "hstate_in_tail", &hstate_in_tail); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + for (i = 0; i < GRUCELL_ACT_IN_CNT; i++) + { + if (input_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &input_attr[i] ); + } + } + if (output_attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &output_attr[0] ); + } + + if (output_attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &output_attr[1] ); + } + return status; +} /* _grucell_reset_after_activation_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t recurrent_activation + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e hstate_dtype; + vsi_nn_kernel_dtype_e fc_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _grucell_reset_after_activation_kernel_map; + size_t kernel_map_size = _cnt_of_array( _grucell_reset_after_activation_kernel_map ); + vx_param_description_t * param_def = _grucell_reset_after_activation_kernel_param_def; + vx_kernel_initialize_f initializer = _grucell_reset_after_activation_initializer; + + uint32_t key; + uint32_t i; + + hstate_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_H_STATE]->attr.dtype.vx_type ); + fc_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_I_FC_Z]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dtype.vx_type ); + + key = GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( hstate_dtype, fc_dtype, out_dtype, recurrent_activation ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _grucell_reset_after_activation_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = 
VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t activation = vsi_nn_kernel_param_get_int32( params, "activation" ); + int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); + + if( activation != VSI_NN_ACT_TANH ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, recurrent_activation ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( grucell_reset_after_activation, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c index a01c9f4..9ddc0bf 100644 --- a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c @@ -1077,7 +1077,8 @@ static vsi_nn_kernel_node_t _setup attr.vtl = TRUE; attr.size[0] = ((shape[0] + 255) / 256) * 4; if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 - || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16) + || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 + || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16) { attr.size[0] = ((shape[0] + 127) / 128) * 4; } @@ -1137,7 +1138,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U16 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -1200,7 +1201,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U16 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -1244,4 +1245,3 @@ final: __END_DECLS REGISTER_BACKEND_EVIS( instance_norm, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c index 2943617..c7326d4 100644 --- a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c @@ -544,7 +544,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U8 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = 
vsi_nn_kernel_node_set_border( node, &border ); VSI_ASSERT( status == VSI_SUCCESS ); diff --git a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c index 1de96db..e6ecaa5 100644 --- a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c @@ -1305,7 +1305,7 @@ static vsi_nn_kernel_node_t _setup_wh border.constant_value.U16 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -1337,7 +1337,7 @@ static vsi_nn_kernel_node_t _setup_wh border.constant_value.U16 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -1500,7 +1500,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U16 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); diff --git a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c index 3a1eb37..f368c97 100644 --- a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c @@ -1141,7 +1141,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U32 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } if (K % 4 == 0 && N % 4 == 0) { diff --git a/src/tim/vx/internal/src/kernel/evis/moments_evis.c b/src/tim/vx/internal/src/kernel/evis/moments_evis.c index 2379574..cf540bc 100644 --- a/src/tim/vx/internal/src/kernel/evis/moments_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/moments_evis.c @@ -825,7 +825,7 @@ static vsi_nn_kernel_node_t _setup CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &node_params[3] ); vsi_nn_kernel_scalar_release( &node_params[4] ); - status = set_constant_border(node, inputs[0]->attr.dtype.zero_point); + status = set_constant_border(node, vsi_nn_get_tensor_zero_point(inputs[0])); CHECK_STATUS(status); } } @@ -844,4 +844,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( moments, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c index dc478f9..2201205 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c @@ -46,14 +46,20 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_PRE_PROCESS_GRAY_COPY_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_gray_copy_U8toI8") #define VX_KERNEL_NAME_PRE_PROCESS_GRAY_COPY_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_gray_copy_U8toI16") #define 
VX_KERNEL_NAME_PRE_PROCESS_GRAY_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_gray_copy_U8toF16") +#define VX_KERNEL_NAME_PRE_PROCESS_GRAY_HALF_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_gray_half_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_GRAY_FOUR_OVER_THREE_U8TOU8 \ + CVIVANTE_NAMESPACE("evis.pre_process_gray_4over3_U8toU8") #define KERNEL_SOURCE_1 "pre_process_gray", #define KERNEL_SOURCE_2 "pre_process_gray_copy" +#define KERNEL_SOURCE_3 "pre_process_gray_2" typedef enum { COPY = 0, - SCALE + SCALE, + FOUR_OVER_THREE, + HALF } vsi_nn_gray_convert_type_e; #define HASH_PRE_PROCESS_GRAY_KEY(_input0_type, _output_type, _convert_type, _image_2d) \ @@ -70,14 +76,16 @@ static const struct { const char* source_name; } pre_process_gray_map[] = { - TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) - TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1) - TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_1) - TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_1) - TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2) - TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I8, COPY, KERNEL_SOURCE_2) - TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I16, COPY, KERNEL_SOURCE_2) - TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, F16, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I8, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I16, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, F16, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, U8, FOUR_OVER_THREE, KERNEL_SOURCE_3) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, U8, HALF, KERNEL_SOURCE_3) }; static vx_param_description_t vxPreProcessGrayKernel_param_def[] = @@ -358,14 +366,150 @@ OnError: attr[0] = NULL; } return status; -} /* _pre_process_gray_copy_initializer() */ +} /* _pre_process_gray_initializer() */ + +DEF_KERNEL_INITIALIZER(_resize_gray_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + uint32_t width = 0; + uint32_t height = 0; + vsi_bool is_4_over_3 = 0; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_size_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + out_shape = attr[1]->shape; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); + + is_4_over_3 = (attr[0]->shape->data[0] * 3 == width * 4) && + (attr[0]->shape->data[1] * 3 == height * 4); + + if (is_4_over_3) + { + shaderParam.global_scale[0] = 16; + shaderParam.global_scale[1] = 4; + shaderParam.global_size[0] = 
gpu_align_p2((attr[0]->shape->data[0] + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = (attr[0]->shape->data[1] + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1]; + } + else + { + shaderParam.global_scale[0] = 16; + shaderParam.global_scale[1] = 2; + shaderParam.global_size[0] = gpu_align_p2((attr[0]->shape->data[0] + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = (attr[0]->shape->data[1] + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1]; + } + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + if (is_4_over_3) + { + gpu_dp_inst_t uniBilinear_4over3_l00_2x8 = {{ + 0x51551551, // TCfg + 0x00000000, // ASelt + 0x04322100, 0xa9087665, // ABin + 0xa2aa2aa2, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff, + 0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinear_4over3_l10_2x8 = {{ + 0x00005515, // TCfg + 0x00000000, // ASelt + 0xfeed0cba, 0x00000000, // ABin + 0x0000aa2a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinear_4over3_l01_4x4 = {{ + 0x05555505, // TCfg + 0x04505004, // ASelt + 0x21210000, 0x00443232, // ABin + 0x0aaaaa0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0x5555aaaa, 0x00000000, 0x38e471c7, 0x1c7238e4, + 0x71c738e4, 0x38e41c72, 0x5555aaaa, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinear_4over3_l11_4x4 = {{ + 0x55055555, // TCfg + 0x50045050, // ASelt + 0x76766565, 0xa9a90088, // ABin + 0xaa0aaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0x38e471c7, 0x1c7238e4, 0x71c738e4, 0x38e41c72, + 0x5555aaaa, 0x00000000, 0x38e471c7, 0x1c7238e4 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinear_4over3_l21_4x4 = {{ + 0x55550555, // TCfg + 0x50500450, // ASelt + 0x00ccbaba, 0xfefeeded, // ABin + 0xaaaa0aaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0x71c738e4, 0x38e41c72, 0x5555aaaa, 0x00000000, + 0x38e471c7, 0x1c7238e4, 0x71c738e4, 0x38e41c72 // Constant + }, GPU_DP_TYPE_16 }; + + + status = vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l00_2x8", &uniBilinear_4over3_l00_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l10_2x8", &uniBilinear_4over3_l10_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l01_4x4", &uniBilinear_4over3_l01_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l11_4x4", &uniBilinear_4over3_l11_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l21_4x4", &uniBilinear_4over3_l21_4x4); + + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + + return status; +} /* _resize_gray_initializer() */ static vsi_status _query_kernel ( vsi_nn_tensor_t* const* const inputs, vsi_nn_tensor_t* const* const outputs, vsi_nn_kernel_t* kernel, 
- const vsi_nn_kernel_param_t * params + const vsi_nn_kernel_param_t * params, + vsi_bool is_no_range_change, + int32_t width, + int32_t height ) { vsi_nn_kernel_dtype_e input0_dtype = U8; @@ -373,40 +517,61 @@ static vsi_status _query_kernel vsi_nn_gray_convert_type_e convert_type = SCALE; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; - vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); + int32_t i = 0; + vsi_bool is_4_over_3 = FALSE; + vsi_bool is_half_scale = FALSE; + vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); + is_4_over_3 = (width * 3 == (int32_t)outputs[0]->attr.size[0] * 4) && + (height * 3 == (int32_t)outputs[0]->attr.size[1] * 4); + is_half_scale = (width == (int32_t)outputs[0]->attr.size[0] * 2) && + (height == (int32_t)outputs[0]->attr.size[1] * 2); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if(enable_copy) + if (enable_copy) { convert_type = COPY; } else { - convert_type = SCALE; + if (is_no_range_change && is_4_over_3) + { + convert_type = FOUR_OVER_THREE; + } + else if (is_no_range_change && is_half_scale) + { + convert_type = HALF; + } + else + { + convert_type = SCALE; + } } key = HASH_PRE_PROCESS_GRAY_KEY( input0_dtype, output_dtype, convert_type, 0 ); - for( i = 0; i < _cnt_of_array(pre_process_gray_map); i ++ ) + for ( i = 0; i < _cnt_of_array(pre_process_gray_map); i ++ ) { - if( pre_process_gray_map[i].key == key ) + if ( pre_process_gray_map[i].key == key ) { break; } } - if( i < _cnt_of_array(pre_process_gray_map) ) + if ( i < _cnt_of_array(pre_process_gray_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_gray_map[i].function_name ); kernel->info.parameters = vxPreProcessGrayKernel_param_def; kernel->info.numParams = _cnt_of_array( vxPreProcessGrayKernel_param_def ); - if(enable_copy) + if (enable_copy) { kernel->info.initialize = _pre_process_gray_copy_initializer; } + else if (convert_type == FOUR_OVER_THREE || convert_type == HALF) + { + kernel->info.initialize = _resize_gray_initializer; + } else { kernel->info.initialize = _pre_process_gray_initializer; @@ -435,6 +600,11 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_GRAY_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; + int32_t width = vsi_nn_kernel_param_get_int32( params, "width" ); + int32_t height = vsi_nn_kernel_param_get_int32( params, "height" ); + float mean = vsi_nn_kernel_param_get_float32( params, "mean" ); + float scale = vsi_nn_kernel_param_get_float32( params, "scale" ); + vsi_bool is_no_range_change = FALSE; if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) @@ -442,7 +612,16 @@ static vsi_nn_kernel_node_t _setup return NULL; } - status = _query_kernel( inputs, outputs, kernel, params ); + if (width == (int32_t)inputs[0]->attr.size[0] && height == (int32_t)inputs[0]->attr.size[1] && + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 && + outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC && + (float)outputs[0]->attr.dtype.zero_point == mean && + vsi_nn_abs(outputs[0]->attr.dtype.scale - scale) < 1e-8 ) + { + is_no_range_change = TRUE; + } + + status = _query_kernel( inputs, outputs, kernel, params, is_no_range_change, width, height ); if( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -453,8 +632,6 @@ static 
vsi_nn_kernel_node_t _setup int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); - float mean = vsi_nn_kernel_param_get_float32( params, "mean" ); - float scale = vsi_nn_kernel_param_get_float32( params, "scale" ); /* Pass parameters to node. */ vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_GRAY_PARAM_NUM, @@ -481,4 +658,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( pre_process_gray, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c index 39b9649..e70b58a 100644 --- a/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c @@ -277,6 +277,15 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer) status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); CHECK_STATUS_FAIL_GOTO(status, final ); } + else + { + inputScale = 1.0f; + input_offset_asymmetric = 0; + + status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) { @@ -300,6 +309,14 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer) status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); CHECK_STATUS_FAIL_GOTO(status, final ); } + else + { + outputScale = 1.0f; + output_offset_asymmetric = 0; + status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } status = vsi_nn_kernel_gpu_add_param( node, "axisSize", &axisSize ); CHECK_STATUS_FAIL_GOTO(status, final ); status = vsi_nn_kernel_gpu_config( node, &gpu_param ); diff --git a/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c index 7ec74d5..b1149fd 100644 --- a/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c @@ -279,6 +279,15 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer) status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); CHECK_STATUS_FAIL_GOTO(status, final ); } + else + { + inputScale = 1.0f; + input_offset_asymmetric = 0; + + status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) { @@ -302,6 +311,15 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer) status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); CHECK_STATUS_FAIL_GOTO(status, final ); } + else + { + outputScale = 1.0f; + output_offset_asymmetric = 0; + status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + status = vsi_nn_kernel_gpu_add_param( node, 
"axisSize", &axisSize ); CHECK_STATUS_FAIL_GOTO(status, final ); status = vsi_nn_kernel_gpu_config( node, &gpu_param ); @@ -426,4 +444,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( reducemin_internal, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c index bbdf29e..6fd1b7d 100644 --- a/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c @@ -368,6 +368,15 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer) status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); CHECK_STATUS_FAIL_GOTO(status, final ); } + else + { + inputScale = 1.0f; + input_offset_asymmetric = 0; + + status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) { @@ -391,6 +400,15 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer) status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); CHECK_STATUS_FAIL_GOTO(status, final ); } + else + { + outputScale = 1.0f; + output_offset_asymmetric = 0; + status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); final: @@ -508,4 +526,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( reduceprod_internal, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/repeat_evis.c b/src/tim/vx/internal/src/kernel/evis/repeat_evis.c index 35d2b63..ac72b9f 100644 --- a/src/tim/vx/internal/src/kernel/evis/repeat_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/repeat_evis.c @@ -525,7 +525,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.S32 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -606,4 +606,3 @@ final: __END_DECLS REGISTER_BACKEND_EVIS( repeat, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c new file mode 100644 index 0000000..6e2e6bd --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c @@ -0,0 +1,520 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the 
Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" + +__BEGIN_DECLS + +#define STR(a) #a +// Add kernel hashtable here +#define RESIZE_BILINEAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, H_PIXEL_CENTERS, ALIGN_CORNERS, UP_SCALE ) \ + (( IN_DTYPE ) | ( OUT_DTYPE << 8) | (H_PIXEL_CENTERS << 16) | (ALIGN_CORNERS << 17) | (UP_SCALE << 18)) + +#define BILINEAR_NHWC_PACK_KERNEL_MAP_UP_SCALE( IN_DTYPE, OUT_DTYPE, H_PIXEL_CENTERS, ALIGN_CORNERS, UP_SCALE ) \ + { RESIZE_BILINEAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, H_PIXEL_CENTERS, ALIGN_CORNERS, UP_SCALE ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_nhwc_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_"STR(UP_SCALE)"x_upsample_half_pixel_centers"), \ + "resize_bilinear_nhwc" } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _resize_bilinear_nhwc_kernel_map[] = +{ + BILINEAR_NHWC_PACK_KERNEL_MAP_UP_SCALE(U8, U8, 1, 0, 2), + BILINEAR_NHWC_PACK_KERNEL_MAP_UP_SCALE(U8, U8, 1, 0, 3), + BILINEAR_NHWC_PACK_KERNEL_MAP_UP_SCALE(U8, U8, 1, 0, 4), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _resize_bilinear_nhwc_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _RESIZE_BILINEAR_NHWC_PARAM_NUM _cnt_of_array( _resize_bilinear_nhwc_kernel_param_def ) + +#define SCALAR_ALIGN_CORNERS (2) +#define SCALAR_HALF_PIXEL (3) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_resize_bilinear_nhwc_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_size_array_t * out_shape = NULL; + vsi_size_array_t * in_shape = NULL; + int32_t align_corners = 0; + int32_t half_pixel_centers = 0; + uint32_t in_width; + uint32_t in_height; + uint32_t out_width; + uint32_t out_height; + vsi_bool is_half_pixel_centers = FALSE; + vsi_bool is_2x_up_kernel = FALSE; + vsi_bool is_3x_up_kernel = FALSE; + vsi_bool is_4x_up_kernel = FALSE; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + 
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &align_corners); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &half_pixel_centers); + CHECK_STATUS_FAIL_GOTO(status, final ); + + out_shape = output_attr->shape; + in_shape = input_attr->shape; + + in_width = (uint32_t)(in_shape->data[0]); + in_height = (uint32_t)(in_shape->data[1]); + out_width = (uint32_t)(out_shape->data[0]); + out_height = (uint32_t)(out_shape->data[1]); + + is_half_pixel_centers = (!align_corners) && (half_pixel_centers); + + if (is_half_pixel_centers) + { + is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height); + is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height); + is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height); + } + + if (is_2x_up_kernel) + { + gpu_param.global_scale[0] = 16; + gpu_param.global_scale[1] = 4; + } + else if (is_4x_up_kernel) + { + gpu_param.global_scale[0] = 16; + gpu_param.global_scale[1] = 8; + } + else if (is_3x_up_kernel) + { + gpu_param.global_scale[0] = 30; + gpu_param.global_scale[1] = 6; + } + else + { + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + } + + if (is_2x_up_kernel) + { + gpu_dp_inst_t uniResize_x2_nhwc2_0_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x46194040, 0x3a48829c, 0x4882acca, 0xc4acca3a, 0xbd4e5b50, // BinSelect + 0x00000704, // AccumType, ConstantType, and PostShift + 0x09030301, 0x09030301, 0x03090103, 0x03090103, + 0x09030301, 0x09030301, 0x03090103, 0x03090103 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x2_nhwc2_1_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x4e5b50c4, 0x7c5906bd, 0x5906cdd2, 0x48cdd27c, 0xde569d61, // BinSelect + 0x00000704, // AccumType, ConstantType, and PostShift + 0x09030301, 0x09030301, 0x03090103, 0x03090103, + 0x09030301, 0x09030301, 0x03090103, 0x03090103 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x2_nhwc2_0_4x8", &uniResize_x2_nhwc2_0_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x2_nhwc2_1_4x8", &uniResize_x2_nhwc2_1_4x8); + //status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (is_3x_up_kernel) + { + gpu_dp_inst_t uniResize_x3_nhwc2_l10_4x4 = {{ + 0x05055555, // TCfg + 0x04045050, // ASelt + 0x31312020, 0x00330022, // ABin + 0x0a0aaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x38e41c72, 0x1c720e39, 0x38e41c72, 0x1c720e39, + 0x2aab5556, 0x00000000, 0x2aab5556, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x3_nhwc2_l11_4x4 = {{ + 0x55555555, // TCfg + 0x50505050, // ASelt + 0x53534242, 0x53534242, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x1c7238e4, 0x0e391c72, 0x1c7238e4, 0x0e391c72, + 0x38e41c72, 0x1c720e39, 0x38e41c72, 0x1c720e39 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x3_nhwc2_l12_4x4 = {{ + 0x55550505, // TCfg + 0x50500404, // ASelt + 0x00550044, 0x75756464, // ABin + 0xaaaa0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x2aab5556, 0x00000000, 0x2aab5556, 0x00000000, + 0x1c7238e4, 0x0e391c72, 0x1c7238e4, 0x0e391c72 // Constant + }, GPU_DP_TYPE_16}; + 
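+ /* The Constant words in these uniResize_x3_nhwc2_* patterns appear to hold the
+  * bilinear taps in Q15 (PostShift 0x0f divides the accumulated sum by 2^15):
+  * 0x5556 ~ 2/3, 0x2aab ~ 1/3, 0x38e4 ~ 4/9, 0x1c72 ~ 2/9, 0x0e39 ~ 1/9, i.e. the
+  * 1/3- and 2/3-offset weights of the 3x half-pixel-centers upsample. */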
gpu_dp_inst_t uniResize_x3_nhwc2_l13_4x4 = {{ + 0x05055555, // TCfg + 0x04045050, // ASelt + 0x75756464, 0x00770066, // ABin + 0x0a0aaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x38e41c72, 0x1c720e39, 0x38e41c72, 0x1c720e39, + 0x2aab5556, 0x00000000, 0x2aab5556, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x3_nhwc2_l14_4x4 = {{ + 0x55555555, // TCfg + 0x50505050, // ASelt + 0x97978686, 0x97978686, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x1c7238e4, 0x0e391c72, 0x1c7238e4, 0x0e391c72, + 0x38e41c72, 0x1c720e39, 0x38e41c72, 0x1c720e39 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x3_nhwc2_l15_4x4 = {{ + 0x55550505, // TCfg + 0x50500404, // ASelt + 0x00990088, 0xb9b9a8a8, // ABin + 0xaaaa0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x2aab5556, 0x00000000, 0x2aab5556, 0x00000000, + 0x1c7238e4, 0x0e391c72, 0x1c7238e4, 0x0e391c72 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x3_nhwc2_l16_4x4 = {{ + 0x05055555, // TCfg + 0x04045050, // ASelt + 0xb9b9a8a8, 0x00bb00aa, // ABin + 0x0a0aaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x38e41c72, 0x1c720e39, 0x38e41c72, 0x1c720e39, + 0x2aab5556, 0x00000000, 0x2aab5556, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x3_nhwc2_l17_4x4 = {{ + 0x55555555, // TCfg + 0x50505050, // ASelt + 0xdbdbcaca, 0xdbdbcaca, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x1c7238e4, 0x0e391c72, 0x1c7238e4, 0x0e391c72, + 0x38e41c72, 0x1c720e39, 0x38e41c72, 0x1c720e39 // Constant + }, GPU_DP_TYPE_16}; + + + gpu_dp_inst_t uniResize_x3_nhwc2_l00_2x8 = {{ + 0x55551155, // TCfg + 0x00000000, // ASelt + 0x03023120, 0x53425342, // ABin + 0xaaaa22aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0xaaaa5555, 0xaaaa5555, 0x0000ffff, 0x0000ffff, + 0x5555aaaa, 0x5555aaaa, 0xaaaa5555, 0xaaaa5555 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x3_nhwc2_l01_2x8 = {{ + 0x11555511, // TCfg + 0x00000000, // ASelt + 0x75640504, 0x07067564, // ABin + 0x22aaaa22, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0x0000ffff, 0x0000ffff, 0x5555aaaa, 0x5555aaaa, + 0xaaaa5555, 0xaaaa5555, 0x0000ffff, 0x0000ffff // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x3_nhwc2_l02_2x8 = {{ + 0x55115555, // TCfg + 0x00000000, // ASelt + 0x97869786, 0xb9a80908, // ABin + 0xaa22aaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0x5555aaaa, 0x5555aaaa, 0xaaaa5555, 0xaaaa5555, + 0x0000ffff, 0x0000ffff, 0x5555aaaa, 0x5555aaaa // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x3_nhwc2_l03_2x8 = {{ + 0x00551155, // TCfg + 0x00000000, // ASelt + 0x0b0ab9a8, 0x0000dbca, // ABin + 0x00aa22aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0xaaaa5555, 0xaaaa5555, 0x0000ffff, 0x0000ffff, + 0x5555aaaa, 0x5555aaaa, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l00_2x8", &uniResize_x3_nhwc2_l00_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l01_2x8", 
&uniResize_x3_nhwc2_l01_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l02_2x8", &uniResize_x3_nhwc2_l02_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l03_2x8", &uniResize_x3_nhwc2_l03_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l10_4x4", &uniResize_x3_nhwc2_l10_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l11_4x4", &uniResize_x3_nhwc2_l11_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l12_4x4", &uniResize_x3_nhwc2_l12_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l13_4x4", &uniResize_x3_nhwc2_l13_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l14_4x4", &uniResize_x3_nhwc2_l14_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l15_4x4", &uniResize_x3_nhwc2_l15_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l16_4x4", &uniResize_x3_nhwc2_l16_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l17_4x4", &uniResize_x3_nhwc2_l17_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (is_4x_up_kernel) + { + gpu_dp_inst_t uniResize_x4_nhwc2_l00_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x46194040, 0x1940409c, 0x48829c46, 0x82acca3a, 0xacca3a48, // BinSelect + 0x00000706, // AccumType, ConstantType, and PostShift + 0x190f0f09, 0x190f0f09, 0x23051503, 0x23051503, + 0x05230315, 0x05230315, 0x0f19090f, 0x0f19090f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x4_nhwc2_l01_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0xca3a4882, 0x3a4882ac, 0x50c4acca, 0xc4bd4e5b, 0xbd4e5b50, // BinSelect + 0x00000706, // AccumType, ConstantType, and PostShift + 0x190f0f09, 0x190f0f09, 0x23051503, 0x23051503, + 0x05230315, 0x05230315, 0x0f19090f, 0x0f19090f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x4_nhwc2_l10_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x46194040, 0x1940409c, 0x48829c46, 0x82acca3a, 0xacca3a48, // BinSelect + 0x00000706, // AccumType, ConstantType, and PostShift + 0x23150503, 0x23150503, 0x31070701, 0x31070701, + 0x07310107, 0x07310107, 0x15230305, 0x15230305 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x4_nhwc2_l11_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0xca3a4882, 0x3a4882ac, 0x50c4acca, 0xc4bd4e5b, 0xbd4e5b50, // BinSelect + 0x00000706, // AccumType, ConstantType, and PostShift + 0x23150503, 0x23150503, 0x31070701, 0x31070701, + 0x07310107, 0x07310107, 0x15230305, 0x15230305 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x4_nhwc2_l00_4x8", &uniResize_x4_nhwc2_l00_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x4_nhwc2_l01_4x8", &uniResize_x4_nhwc2_l01_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x4_nhwc2_l10_4x8", &uniResize_x4_nhwc2_l10_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x4_nhwc2_l11_4x8", &uniResize_x4_nhwc2_l11_4x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + VSILOGE("input or output's format is not support"); + status = VSI_FAILURE; + goto final; + } + + gpu_param.global_size[0] = gpu_align_p2((out_width + \ + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_height + \ + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]; + gpu_param.dim = 2; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) 
vsi_nn_kernel_tensor_attr_release( &output_attr ); + + return status; +} /* _resize_bilinear_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t align_corners, + int32_t half_pixel_centers, + uint32_t up_scale + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _resize_bilinear_nhwc_kernel_map; + size_t kernel_map_size = _cnt_of_array( _resize_bilinear_nhwc_kernel_map ); + vx_param_description_t * param_def = _resize_bilinear_nhwc_kernel_param_def; + size_t param_def_size = _cnt_of_array( _resize_bilinear_nhwc_kernel_param_def ); + vx_kernel_initialize_f initializer = _resize_bilinear_nhwc_initializer; + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + in_dtype = in_dtype == I8 ? U8 : in_dtype; + out_dtype = out_dtype == I8 ? U8 : out_dtype; + + key = RESIZE_BILINEAR_NHWC_HASH_KEY( in_dtype, out_dtype, half_pixel_centers, align_corners, up_scale ); + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + + if ( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_RESIZE_BILINEAR_NHWC_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + vsi_bool is_same_type = vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype); + vsi_size_t depth = inputs[0]->attr.size[0]; + float scale_x = (float)outputs[0]->attr.size[1] / (float)inputs[0]->attr.size[1]; + float scale_y = (float)outputs[0]->attr.size[2] / (float)inputs[0]->attr.size[2]; + float up_scale = scale_x == scale_y ? 
scale_x : 0; + uint32_t rank = inputs[0]->attr.dim_num; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + + if (!is_same_type || depth != 2 || rank < 3 || + (up_scale != 2.0f && up_scale != 3.0f && up_scale != 4.0f)) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, + align_corners, half_pixel_centers, (uint32_t)up_scale); + + shapes[0][0] = depth * inputs[0]->attr.size[1]; + shapes[0][1] = inputs[0]->attr.size[2]; + shapes[0][2] = 1; + shapes[0][3] = inputs[0]->attr.size[3]; + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], rank ); + + shapes[1][0] = depth * outputs[0]->attr.size[1]; + shapes[1][1] = outputs[0]->attr.size[2]; + shapes[1][2] = 1; + shapes[1][3] = outputs[0]->attr.size[3]; + + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[1], rank ); + + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[1], output_num ); + node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); + node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + } + } + + vsi_safe_release_tensor(reshape_tensors[0]); + vsi_safe_release_tensor(reshape_tensors[1]); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( resize_bilinear_nhwc, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c index e4a497a..55af6c0 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c @@ -98,7 +98,9 @@ static const _kernel_map_type scatter_nd_update_map[] = TENSOR_SCATTER_ND_UPDATE_KERNELS(U8, I32, U8, F16, KERNEL_SOURCE_1) TENSOR_SCATTER_ND_UPDATE_KERNELS(I8, I32, I8, F16, KERNEL_SOURCE_1) TENSOR_SCATTER_ND_UPDATE_KERNELS(I16, I32, I16, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_KERNELS(F16, I32, F16, U8, KERNEL_SOURCE_1) TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(F16, I32, F16, U8, KERNEL_SOURCE_2) }; static const _kernel_map_type scatter_nd_update_reset_map[] = @@ -766,7 +768,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer) status = vsi_nn_kernel_gpu_add_param( node, "uniAccumulateSum_2x8", &uniAccumulateSum_2x8 ); - if (attr[2]->quant != VSI_NN_KERNEL_QUANT_NONE) + if (attr[3]->quant != VSI_NN_KERNEL_QUANT_NONE) { status |= vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); diff --git a/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c index 2b79fd8..2b9d53e 100644 --- a/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c @@ -350,7 +350,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U16 = 0; if (inputs[0]->attr.dtype.vx_type == 
VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -363,4 +363,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( space2depth_internal, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index d954dc0..b266a99 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -34,6 +34,7 @@ #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_math.h" +#include "vsi_nn_tensor_util.h" #include "libnnext/vsi_nn_libnnext_resource.h" #if VSI_USE_VXC_BINARY @@ -669,7 +670,6 @@ vsi_nn_kernel_node_t vsi_nn_kernel_create_node return (vsi_nn_kernel_node_t)node; } /* vsi_nn_kernel_create_node() */ - vsi_status vsi_nn_kernel_node_set_border (vsi_nn_kernel_node_t node, vx_border_t* border) @@ -709,11 +709,8 @@ vsi_nn_kernel_tensor_t vsi_nn_kernel_tensor_reshape vsi_size_t rank ) { -#ifdef VSI_40BIT_VA_SUPPORT - return (vsi_nn_kernel_tensor_t)vxReshapeTensor((vx_tensor)tensor, shape, rank); -#else - return (vsi_nn_kernel_tensor_t)vxReshapeTensor((vx_tensor)tensor, (vx_int32*)shape, (vx_uint32)rank); -#endif + return (vsi_nn_kernel_tensor_t)vsi_nn_safe_reshape_tensor((vx_tensor)tensor, + (void*)shape, (vsi_size_t)rank, sizeof(shape[0])); } /* vsi_nn_kernel_tensor_reshape() */ void vsi_nn_kernel_tensor_release @@ -925,6 +922,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector else { vsi_nn_kernel_pirority_t default_pirority[] = { + { VSI_NN_KERNEL_TYPE_SP, 5 }, { VSI_NN_KERNEL_TYPE_EVIS, 4 }, { VSI_NN_KERNEL_TYPE_CL, 3 }, { VSI_NN_KERNEL_TYPE_VX, 2 }, @@ -945,20 +943,28 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector { type = selector.pirority[i].kernel_type; - //Skip evis and cl when disable shader + /* Skip evis and cl when disable shader */ if ( (type == VSI_NN_KERNEL_TYPE_EVIS || type == VSI_NN_KERNEL_TYPE_CL) && _check_shader_support(graph) == FALSE) { continue; } - // Skip evis if not support + /* Skip evis if not support */ if( type == VSI_NN_KERNEL_TYPE_EVIS && graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_NONE ) { continue; } + + /* Skip StreamProcesor if not support */ + if( type == VSI_NN_KERNEL_TYPE_SP + && !graph->ctx->config.support_stream_processor ) + { + continue; + } + kernel_func = backend->setup[type]; - // Skip no kernel func + /* Skip no kernel func */ if( NULL == kernel_func ) { continue; @@ -967,7 +973,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector kernel->unique_id = KERNEL_ID_OVXLIB_START + backend->unique_id; node = kernel_func( graph, inputs, input_num, outputs, output_num, params, kernel ); - // If node created, break the loop + /* If node created, break the loop */ if( node ) { VSILOGD("Instance %s node with kernel \"%s\" ", diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c new file mode 100644 index 0000000..b5dfa9e --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c @@ -0,0 +1,303 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software 
without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include +#include "vsi_nn_context.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include +#include "vsi_nn_error.h" +#include "utils/vsi_nn_dtype_util_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_lut.h" +#include "utils/vsi_nn_dtype_util.h" + +static int32_t _comparator(const void *pa, const void *pb) +{ + vsi_nn_kernel_lut_t a = *(vsi_nn_kernel_lut_t *)pa; + vsi_nn_kernel_lut_t b = *(vsi_nn_kernel_lut_t *)pb; + float diff = a.index - b.index; + + if ( diff > 0) + { + return 1; + } + else if ( diff < 0) + { + return -1; + } + + return 0; +} + +static float exp_eval(float val) +{ + return expf(val); +} + +static float log_eval(float data) +{ + return logf(data); +} + +static float elu_eval(float data, vsi_nn_kernel_lut_params *lut_param) +{ + float alpha = lut_param->params[0]; + return data >=0 ? 
data : expf(data) * alpha - alpha; +} + +static float neg_eval(float data) +{ + return data * -1.0f; +} + +static float hsigmoid_eval(float data, vsi_nn_kernel_lut_params *lut_param) +{ + float alpha = lut_param->params[0]; + float beta = lut_param->params[1]; + + data = (float)(alpha * data + beta); + data = vsi_nn_clamp(data, 0, 1); + + return data; +} + +static float soft_plus_eval(float data) +{ + return log_eval(exp_eval(data) + 1); +} + +static float mish_eval(float data) +{ + data = (float)(data * tanh(soft_plus_eval(data))); + + return data; +} + +static float erf_eval(float x) +{ + float res = 0; + float tmp = x; + float factorial = 1; /*n!*/ + float x_pow = x; + int32_t one = 1; + int32_t n = 1; + + if (x <= -3) + { + return -1; + } + else if (x >= 3) + { + return 1; + } + + while (vsi_abs(tmp) > 1e-5) + { + res += tmp; + + factorial *= n; + one *= -1; + x_pow *= x * x; + tmp = one / factorial * x_pow / ( 2 * n + 1); + + n ++; + } +#define VSI_MUL2_RSQRTPI (1.1283791670955126f) + + res *= VSI_MUL2_RSQRTPI; + + return res; +} + +static float gelu_eval(float data) +{ + data = (float)(0.5f * data * (1 + erf_eval(data / (float)sqrt(2.0f)))); + + return data; +} + +#define VSI_SQRT_2_RCP_PI 0.7978845834732056f +static float hgelu_eval(float data) +{ + float cdf = (float)(0.5f * (1.0f + tanh((VSI_SQRT_2_RCP_PI * + (data + 0.044715f * data * data * data))))); + + return data * cdf; +} + +static float relu_keras_eval(float val, vsi_nn_kernel_lut_params *lut_param) +{ + float alpha = lut_param->params[0]; + float max = lut_param->params[1]; + float threshold = lut_param->params[2]; + + val = vsi_nn_min(val, max); + val = val < threshold ? alpha * (val - threshold) : val; + return val; +} + +static float clip_eval(float val, vsi_nn_kernel_lut_params *lut_param) +{ + float min = lut_param->params[0]; + float max = lut_param->params[1]; + + return vsi_nn_clamp(val, min, max); +} + +static float square_eval(float x) +{ + return x * x; +} + +static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *lut_param) +{ + float result = 0; + + switch (lut_param->act_type) + { + case VSI_NN_KERNEL_LUT_MISH: + result = mish_eval(data); + break; + case VSI_NN_KERNEL_LUT_LOG: + result = log_eval(data); + break; + break; + case VSI_NN_KERNEL_LUT_EXP: + result = exp_eval(data); + break; + break; + case VSI_NN_KERNEL_LUT_ELU: + result = elu_eval(data, lut_param); + break; + break; + case VSI_NN_KERNEL_LUT_NEG: + result = neg_eval(data); + break; + break; + case VSI_NN_KERNEL_LUT_HSIGMOID: + result = hsigmoid_eval(data, lut_param); + break; + break; + case VSI_NN_KERNEL_LUT_SOFT_PLUS: + result = soft_plus_eval(data); + break; + break; + case VSI_NN_KERNEL_LUT_ERF: + result = erf_eval(data); + break; + break; + case VSI_NN_KERNEL_LUT_GELU: + result = gelu_eval(data); + break; + break; + case VSI_NN_KERNEL_LUT_HGELU: + result = hgelu_eval(data); + break; + case VSI_NN_KERNEL_LUT_RELU_KERAS: + result = relu_keras_eval(data, lut_param); + break; + case VSI_NN_KERNEL_LUT_CLIP: + result = clip_eval(data, lut_param); + break; + case VSI_NN_KERNEL_LUT_SQUARE: + result = square_eval(data); + break; + default: + VSILOGE( "unsupported activation function:%d", lut_param->act_type ); + break; + } + + return result; +} + +vsi_status vsi_nn_kernel_lut + ( + vx_lut index_lut, + vx_lut output_lut, + vsi_nn_kernel_lut_params *param + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_lut_t *lut = NULL; + uint32_t i = 0; + float index[VSI_NN_KERNEL_LUT_MAX_SIZE] = {0}; + float 
value[VSI_NN_KERNEL_LUT_MAX_SIZE] = {0}; + + if (index_lut == NULL || output_lut == NULL || param == NULL) + { + return VSI_FAILURE; + } + + lut = (vsi_nn_kernel_lut_t *)calloc(VSI_NN_KERNEL_LUT_MAX_SIZE, sizeof(vsi_nn_kernel_lut_t)); + CHECK_PTR_FAIL_GOTO( lut, "Create LUT buffer fail.", final ); + + for ( i = 0; i < VSI_NN_KERNEL_LUT_MAX_SIZE; i++) + { + int16_t val = (int16_t)(i << 6); + lut[i].index = fp16_to_fp32(val); + lut[i].val = vsi_nn_kernel_lut_activation(lut[i].index, param); + } + + for (i = 0x0; i < 0x10; i++) + { + lut[i].index = 0; + lut[i].val = vsi_nn_kernel_lut_activation(lut[i].index, param); + } + + for (i = 0x1F0; i < 0x200; i++) + { + lut[i].index = VSI_NN_KERNEL_LUT_FP16_MAX; + lut[i].val = vsi_nn_kernel_lut_activation(lut[i].index, param); + } + + for (i = 0x3F0; i < 0x400; i++) + { + lut[i].index = VSI_NN_KERNEL_LUT_FP16_MIN; + lut[i].val = vsi_nn_kernel_lut_activation(lut[i].index, param); + } + + qsort(lut, VSI_NN_KERNEL_LUT_MAX_SIZE, sizeof(vsi_nn_kernel_lut_t), _comparator); + + for ( i = 0; i < VSI_NN_KERNEL_LUT_MAX_SIZE; i++) + { + index[i] = lut[i].index; + value[i] = lut[i].val; + } + + status = vxCopyLUT(index_lut, (void*)&index, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + status |= vxCopyLUT(output_lut, (void*)&value, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); +final: + vsi_nn_safe_free(lut); + + return status; +} diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c index 2447239..e3f454a 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c @@ -62,10 +62,11 @@ KERNEL_SELECTOR( depthwise_conv1d ) vsi_size_t real_kernel = 0; int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" ); vsi_nn_kernel_pirority_t pirority[] = { - { VSI_NN_KERNEL_TYPE_VX, 0 }, - { VSI_NN_KERNEL_TYPE_EVIS, 3 }, - { VSI_NN_KERNEL_TYPE_CL, 2 }, - { VSI_NN_KERNEL_TYPE_CPU, 1 }, + { VSI_NN_KERNEL_TYPE_VX, 1 }, + { VSI_NN_KERNEL_TYPE_SP, 0 }, + { VSI_NN_KERNEL_TYPE_EVIS, 4 }, + { VSI_NN_KERNEL_TYPE_CL, 3 }, + { VSI_NN_KERNEL_TYPE_CPU, 2 }, }; dilation = dilation == 0 ? 
0 : dilation - 1; real_kernel = (kernel - 1) * dilation + kernel; @@ -94,6 +95,7 @@ static vsi_status _select ) { vsi_nn_kernel_pirority_t pirority[] = { + { VSI_NN_KERNEL_TYPE_SP, 4 }, { VSI_NN_KERNEL_TYPE_VX, 3 }, { VSI_NN_KERNEL_TYPE_EVIS, 2 }, { VSI_NN_KERNEL_TYPE_CL, 1 }, diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c index 15de948..a7cc925 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c @@ -37,81 +37,157 @@ typedef enum MEMORY_ACCESSOR_WRITE_ONLY = 1, } mem_accessor_e; -vsi_status _copy_tensor +vsi_status vsi_nn_kernel_copy_tensor_veiw_patch ( - vsi_nn_kernel_tensor_t tensor, + vx_tensor tensor, const vsi_nn_kernel_tensor_attr_t * attr, - mem_accessor_e accessor, - void * buffer, - size_t buffer_size + void *user_ptr, + vsi_size_t *start, + vsi_size_t *end, + vsi_size_t *stride, + vsi_enum usage, + vsi_enum user_memory_type ) { +#define USE_OPENVX_1_2 + size_t dim,i; + size_t vstart[VSI_NN_MAX_DIM_NUM],vend[VSI_NN_MAX_DIM_NUM],vstride[VSI_NN_MAX_DIM_NUM]; vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_attr_t * internal_attr = NULL; - size_t rank; - size_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; - size_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; - size_t stride[VSI_NN_MAX_DIM_NUM] = { 0 }; - vsi_size_t stride2[VSI_NN_MAX_DIM_NUM] = { 0 }; - size_t type_bytes; - size_t total_bytes; - uint32_t i; - - if( !tensor || !buffer || !buffer_size ) + if (NULL == tensor || NULL == user_ptr || NULL == start || NULL == end || NULL == stride) { VSILOGE("Invalid parameter"); return status; } - if( !attr ) + dim = (size_t)attr->shape->size; + for (i = 0; i < dim; i++) { - internal_attr = vsi_nn_kernel_tensor_attr_create( tensor ); - CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr fail.", final ); - attr = internal_attr; + vstart[i] = (size_t)start[i]; + vend[i] = (size_t)end[i]; + vstride[i] = (size_t)stride[i]; } - total_bytes = vsi_nn_kernel_tensor_attr_get_bytes( attr ); - if( total_bytes != (vsi_size_t)buffer_size ) +#ifdef USE_OPENVX_1_2 + +#ifdef VX_TENSOR_STRIDE_X_BITS_SUPPORT { - VSILOGE("Read buffer size mismatch %"VSI_SIZE_T_SPECIFIER" vs %"VSI_SIZE_T_SPECIFIER"", - total_bytes, (vsi_size_t)buffer_size); - goto final; + vx_trensor_addressing addr = NULL; + vx_size dim_sizes[VSI_NN_MAX_DIM_NUM], strides[VSI_NN_MAX_DIM_NUM]; + addr = (vx_trensor_addressing)malloc(sizeof(vx_tensorpatch_addressing_t)); + addr->num_of_dims = (vx_uint32)attr->shape->size; + + for (i = 0; i < dim; i++) + { + strides[i] = (vx_size)vstride[i]; + dim_sizes[i] = (vx_size)attr->shape->data[i]; + } + addr->strides = strides; + addr->dim_sizes = dim_sizes; + if ( attr->dtype == I4 || attr->dtype == U4 ) + { + addr->strides[0] = 0; + addr->stride_x_bits = 4; + } + status = vxCopyTensorPatch2(tensor, dim, vstart, vend, addr,sizeof(vx_tensorpatch_addressing_t), + user_ptr, usage, user_memory_type); + if(addr) + { + free(addr); + addr = NULL; + } + } +#else + status = vxCopyTensorPatch(tensor, dim, vstart, vend, vstride, user_ptr, usage, user_memory_type); +#endif +#else + { + vx_context context = NULL; + vx_tensor_addressing addr = NULL; + size_t stride_size[VSI_NN_MAX_DIM_NUM]; + vsi_nn_tensor_attr_t t; + + memset(vstart, 0, sizeof(size_t) * VSI_NN_MAX_DIM_NUM); + memset(vend, 0, sizeof(size_t) * VSI_NN_MAX_DIM_NUM); + memset(vstride, 0, sizeof(size_t) * VSI_NN_MAX_DIM_NUM); + status = vsi_nn_vxGetTensorAttr(tensor, &t); + vsi_nn_kernel_tensor_attr_get_stride( attr, stride_size ); + 
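The stride array filled by vsi_nn_kernel_tensor_attr_get_stride here is in elements; the new vsi_nn_kernel_copy_tensor_patch further below multiplies it by the element size before handing it to the view-patch helper, except for the packed I4/U4 types, where the OpenVX 1.2 path advertises stride_x_bits = 4 instead. A minimal sketch of that dense, row-major stride derivation; the names shape, rank and elem_bytes are illustrative only, not part of this patch:

/* Illustrative only: element strides of a dense tensor, scaled to bytes.
 * This mirrors what vsi_nn_kernel_copy_tensor_patch does for dtypes wider
 * than 4 bits; it is not the library implementation. */
static void dense_byte_strides(const size_t *shape, size_t rank,
                               size_t elem_bytes, size_t *stride_bytes)
{
    size_t i;
    size_t elems = 1;
    for (i = 0; i < rank; i++)
    {
        stride_bytes[i] = elems * elem_bytes; /* bytes to step one unit in dim i */
        elems *= shape[i];
    }
}
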
context = vxGetContext((vx_reference)tensor); + if( NULL == context ) + { + VSILOGE("Call vxGetContext fail"); + return status; + } + addr = vxCreateTensorAddressing( context, attr->shape->data, + (vx_uint32*)stride_size, attr->shape->size ); + if( NULL == addr ) + { + VSILOGE("Call vxCreateTensorAddressing fail"); + return status; + } + status = vxCopyTensorPatch_11( tensor, + NULL, + addr, + user_ptr, + usage, + user_memory_type + ); + vxReleaseTensorAddressing( &addr ); + if( VSI_SUCCESS != status ) + { + VSILOGE("Call vxCopyTensorPatch_11 fail"); + return status; + } + } +#endif + return status; +} /* vsi_nn_kernel_copy_tensor_veiw_patch() */ + +vsi_status vsi_nn_kernel_copy_tensor_patch + ( + vsi_nn_kernel_tensor_t tensor, + const vsi_nn_kernel_tensor_attr_t * attr, + mem_accessor_e accessor, + void * user_ptr, + size_t buffer_size + ) +{ + vsi_size_t start[VSI_NN_MAX_DIM_NUM],end[VSI_NN_MAX_DIM_NUM],stride[VSI_NN_MAX_DIM_NUM]; + vsi_status status = VSI_FAILURE; + uint32_t i; + if (NULL == tensor || NULL == user_ptr) + { + VSILOGE("Invalid parameter"); + return status; } - vsi_nn_shape_get_stride( attr->shape->data, (vsi_size_t)attr->shape->size, stride2 ); - for( i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + vsi_nn_kernel_tensor_attr_get_stride( attr, stride ); + memset(start, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { - stride[i] = stride2[i]; - } - type_bytes = vsi_nn_kernel_dtype_get_bytes( attr->dtype ); - rank = attr->shape->size; - for( i = 0; i < rank; i++ ) - { - start[i] = 0; - end[i] = attr->shape->data[i]; - stride[i] = stride[i] * type_bytes; + end[i] = attr->shape->data[i]; + if ( attr->dtype != I4 && attr->dtype != U4 ) + { + size_t type_bytes = vsi_nn_kernel_dtype_get_bytes( attr->dtype ); + stride[i] = stride[i] * (vsi_size_t)type_bytes; + } } + switch( accessor ) { case MEMORY_ACCESSOR_READ_ONLY: - status = vxCopyTensorPatch( (vx_tensor)tensor, rank, - start, end, stride, buffer, VX_READ_ONLY, 0); + status = vsi_nn_kernel_copy_tensor_veiw_patch( (vx_tensor)tensor, attr, + user_ptr, start, end, stride, VX_READ_ONLY, 0); break; case MEMORY_ACCESSOR_WRITE_ONLY: - status = vxCopyTensorPatch( (vx_tensor)tensor, rank, - start, end, stride, buffer, VX_WRITE_ONLY, 0); + status = vsi_nn_kernel_copy_tensor_veiw_patch( (vx_tensor)tensor, attr, + user_ptr, start, end, stride, VX_WRITE_ONLY, 0); break; default: VSI_ASSERT( FALSE ); break; } -final: - if( internal_attr ) - { - vsi_nn_kernel_tensor_attr_release( &internal_attr ); - } return status; -} /* _copy_tensor() */ +} /* vsi_nn_kernel_copy_tensor_patch() */ void * vsi_nn_kernel_tensor_create_buffer ( @@ -123,49 +199,76 @@ void * vsi_nn_kernel_tensor_create_buffer vsi_status status = VSI_FAILURE; void * buffer = NULL; void * out_buffer = NULL; + void * tensor_buffer = NULL; + void * new_data = NULL; size_t bytes; size_t float_bytes; size_t tensor_size = 0; vsi_nn_kernel_tensor_attr_t * internal_attr = NULL; - if( !tensor ) + if ( !tensor ) { return NULL; } - if( !attr ) + if ( !attr ) { internal_attr = vsi_nn_kernel_tensor_attr_create( tensor ); CHECK_PTR_FAIL_GOTO( internal_attr, "Create tensor attr fail.", final ); attr = internal_attr; } bytes = vsi_nn_kernel_tensor_attr_get_bytes( attr ); - out_buffer = malloc( bytes ); - CHECK_PTR_FAIL_GOTO( out_buffer, "Out of memory, create buffer fail.", final ); + tensor_buffer = malloc( bytes ); + CHECK_PTR_FAIL_GOTO( tensor_buffer, "Out of memory, create buffer fail.", final ); - status = vsi_nn_kernel_tensor_read( tensor, attr, out_buffer, 
bytes ); - if( status != VSI_SUCCESS ) + status = vsi_nn_kernel_tensor_read( tensor, attr, tensor_buffer, bytes ); + if ( status != VSI_SUCCESS ) { VSILOGE("Read tensor fail with error \"%s\".", vsi_nn_DescribeStatus(status)); - free( out_buffer ); - out_buffer = NULL; + vsi_nn_safe_free( tensor_buffer ); goto final; } - if( convert_to_float && F32 != attr->dtype ) + if ( attr->dtype == I4 || attr->dtype == U4 ) + { + vsi_size_t dest_size = vsi_nn_kernel_tensor_attr_get_size( attr ); + new_data = (uint8_t*)malloc(dest_size); + if ( !new_data ) + { + VSILOGE("Out of memory, create buffer fail"); + vsi_nn_safe_free( tensor_buffer ); + goto final; + } + CHECK_PTR_FAIL_GOTO( new_data, "Out of memory, create buffer fail.", final ); + status = vsi_nn_kernel_unpack_4bit_data(attr, (uint8_t *)tensor_buffer, (uint8_t *)new_data, attr->dtype); + if ( status != VSI_SUCCESS ) + { + VSILOGE("Read tensor fail with error \"%s\".", vsi_nn_DescribeStatus(status)); + vsi_nn_safe_free( tensor_buffer ); + vsi_nn_safe_free( new_data ); + goto final; + } + vsi_nn_safe_free( tensor_buffer ); + out_buffer = new_data; + } + else + { + out_buffer = tensor_buffer; + } + + if ( convert_to_float && F32 != attr->dtype ) { buffer = out_buffer; tensor_size = vsi_nn_kernel_tensor_attr_get_size( attr ); float_bytes = tensor_size * sizeof(float); out_buffer = malloc( float_bytes ); - if( !out_buffer ) + if ( !out_buffer ) { VSILOGE("Out of memory, create float buffer fail."); - free( buffer ); - buffer = NULL; + vsi_nn_safe_free( buffer ); goto final; } - if( vsi_nn_kernel_tensor_attr_is_quantized( attr ) ) + if ( vsi_nn_kernel_tensor_attr_is_quantized( attr ) ) { switch( attr->quant ) { @@ -202,14 +305,15 @@ void * vsi_nn_kernel_tensor_create_buffer vsi_nn_dtype_convert_dtype_to_float( buffer, tensor_size, attr->dtype, (float*)out_buffer ); } - free( buffer ); + vsi_nn_safe_free( buffer ); } final: - if( internal_attr ) + if ( internal_attr ) { vsi_nn_kernel_tensor_attr_release( &internal_attr ); } + return out_buffer; } /* vsi_nn_kernel_tensor_create_buffer() */ @@ -221,7 +325,7 @@ vsi_status vsi_nn_kernel_tensor_read size_t out_buffer_size ) { - return _copy_tensor( tensor, attr, MEMORY_ACCESSOR_READ_ONLY, + return vsi_nn_kernel_copy_tensor_patch( tensor, attr, MEMORY_ACCESSOR_READ_ONLY, out_buffer, out_buffer_size ); } /* vsi_nn_kernel_tensor_read() */ @@ -235,7 +339,7 @@ vsi_status vsi_nn_kernel_tensor_write { // NOTE: openvx api vxCopyTensorPatch access non-const buffer pointer, // so here we convert const to non-const ptr. 
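The I4/U4 branch above widens packed nibble data into one byte per element via vsi_nn_kernel_unpack_4bit_data before any float conversion, and vsi_nn_kernel_pack_4bit_data reverses this on the write path. A rough sketch of what such an unpack amounts to; the low-nibble-first ordering and the names are assumptions for illustration, not the library's definition:

#include <stdint.h>
#include <stddef.h>

/* Sketch only: expand two INT4 values per byte into int8_t, sign-extending
 * each nibble. Nibble order (low nibble = even element) is an assumption. */
static void unpack_int4_sketch(const uint8_t *packed, int8_t *out, size_t count)
{
    size_t i;
    for (i = 0; i < count; i++)
    {
        uint8_t nib = (i & 1) ? (uint8_t)(packed[i / 2] >> 4)
                              : (uint8_t)(packed[i / 2] & 0x0F);
        out[i] = (int8_t)((int8_t)(nib << 4) >> 4); /* sign-extend 4 -> 8 bits */
    }
}
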
- return _copy_tensor( tensor, attr, MEMORY_ACCESSOR_WRITE_ONLY, + return vsi_nn_kernel_copy_tensor_patch( tensor, attr, MEMORY_ACCESSOR_WRITE_ONLY, (void*)buffer, size ); } /* vsi_nn_kernel_tensor_write() */ @@ -252,8 +356,9 @@ vsi_status vsi_nn_kernel_tensor_write_from_float size_t bytes; const void * buffer = NULL; void * internal_buffer = NULL; + void * internal_buffer0 = NULL; size_t tensor_size = 0; - if( !attr ) + if ( !attr ) { internal_attr = vsi_nn_kernel_tensor_attr_create( tensor ); CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr fail.", final ); @@ -261,30 +366,41 @@ vsi_status vsi_nn_kernel_tensor_write_from_float } bytes = vsi_nn_kernel_tensor_attr_get_bytes( attr ); tensor_size = vsi_nn_kernel_tensor_attr_get_size( attr ); - if( tensor_size != size ) + if ( tensor_size != size ) { VSILOGE("Tensor and buffer size mismatch %d vs %d", tensor_size, size); goto final; } + if ( attr->dtype == I4 || attr->dtype == U4 ) + { + vsi_size_t sz = 0; + sz = vsi_nn_kernel_tensor_attr_get_size( attr ); + internal_buffer0 = malloc( sz ); + } + else + { + internal_buffer0 = malloc( bytes ); + internal_buffer = internal_buffer0; + } + if( attr->dtype != F32 ) { - internal_buffer = malloc( bytes ); - CHECK_PTR_FAIL_GOTO( internal_buffer, "Create buffer fail.", final ); - if( vsi_nn_kernel_tensor_attr_is_quantized( attr ) ) + CHECK_PTR_FAIL_GOTO( internal_buffer0, "Create buffer fail.", final ); + if ( vsi_nn_kernel_tensor_attr_is_quantized( attr ) ) { switch( attr->quant ) { case VSI_NN_KERNEL_QUANT_DFP: vsi_nn_dtype_convert_float_to_quantize_dfp( float_buffer, size, attr->dtype, - attr->dfp.fl, internal_buffer ); + attr->dfp.fl, internal_buffer0 ); break; case VSI_NN_KERNEL_QUANT_ASYMM: vsi_nn_dtype_convert_float_to_quantize_asymm( float_buffer, size, attr->dtype, attr->asymm.scale, attr->asymm.zero_point, - internal_buffer ); + internal_buffer0 ); break; case VSI_NN_KERNEL_QUANT_SYMM_PERCHANNEL: vsi_nn_dtype_convert_float_to_quantize_symm_perchannel( @@ -295,13 +411,19 @@ vsi_status vsi_nn_kernel_tensor_write_from_float attr->asymm_v.zero_point->data, attr->asymm_v.zero_point->size, attr->asymm_v.channel_dim, - internal_buffer ); + internal_buffer0 ); break; default: VSILOGE("Donot support quantize type %d", attr->quant); VSI_ASSERT( FALSE ); break; } + + if ( attr->dtype == I4 || attr->dtype == U4 ) + { + internal_buffer = malloc( bytes ); + status = vsi_nn_kernel_pack_4bit_data(attr, (uint8_t*)internal_buffer0, (uint8_t*)internal_buffer); + } } else { @@ -316,14 +438,16 @@ vsi_status vsi_nn_kernel_tensor_write_from_float } status = vsi_nn_kernel_tensor_write( tensor, attr, buffer, bytes ); final: - if( internal_attr ) + if ( internal_attr ) { vsi_nn_kernel_tensor_attr_release( &internal_attr ); } - if( internal_buffer ) + if ( attr->dtype == I4 || attr->dtype == U4 ) { - free( internal_buffer ); + vsi_nn_safe_free(internal_buffer0); } + vsi_nn_safe_free(internal_buffer); + return status; } /* vsi_nn_kernel_tensor_write_from_float() */ @@ -381,6 +505,9 @@ vsi_status vsi_nn_kernel_scalar_get_dtype return status; \ } +DEF_KERNEL_SCALAR_FUNC( vsi_nn_kernel_scalar_read_int4, + vsi_nn_kernel_scalar_write_int4, + int8_t, I4 ) DEF_KERNEL_SCALAR_FUNC( vsi_nn_kernel_scalar_read_int8, vsi_nn_kernel_scalar_write_int8, int8_t, I8 ) @@ -413,7 +540,6 @@ static void _convert_tensor_attr_to_vx_tensor_param memset( p, 0, sizeof( vx_tensor_create_params_t ) ); p->num_of_dims = (uint32_t)attr->shape->size; - p->sizes = attr->shape->data; #define MAP_TYPE( var, src_type, dst_type ) \ case src_type: \ var = 
dst_type; \ @@ -421,6 +547,8 @@ static void _convert_tensor_attr_to_vx_tensor_param switch( attr->dtype ) { + MAP_TYPE( p->data_format, U4, VSI_NN_TYPE_UINT4 ); + MAP_TYPE( p->data_format, I4, VSI_NN_TYPE_INT4 ); MAP_TYPE( p->data_format, I8, VSI_NN_TYPE_INT8 ); MAP_TYPE( p->data_format, I16, VSI_NN_TYPE_INT16 ); MAP_TYPE( p->data_format, I32, VSI_NN_TYPE_INT32 ); @@ -479,8 +607,27 @@ vsi_nn_kernel_tensor_t vsi_nn_kernel_tensor_create { vsi_nn_kernel_tensor_t tensor = NULL; vx_tensor_create_params_t params; + vx_size size_vxsize[VSI_NN_MAX_DIM_NUM] = {0}; + vx_uint32 size_u32[VSI_NN_MAX_DIM_NUM] = {0}; + size_t i = 0; _convert_tensor_attr_to_vx_tensor_param( ¶ms, attr ); + //convert attr->shape->data to correct data type + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_vxsize[i] = -1 == attr->shape->data[i] ? -1 : (vx_size)attr->shape->data[i]; + } + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_u32[i] = -1 == attr->shape->data[i] ? -1 : (vx_uint32)attr->shape->data[i]; + } +#ifdef VSI_40BIT_VA_SUPPORT + params.sizes = size_vxsize; + (void)size_u32; +#else + params.sizes = size_u32; + (void)size_vxsize; +#endif if( is_virtual ) { tensor = (vsi_nn_kernel_tensor_t)vxCreateVirtualTensor2( diff --git a/src/tim/vx/internal/src/kernel/vx/clip_vx.c b/src/tim/vx/internal/src/kernel/vx/clip_vx.c deleted file mode 100644 index 3c4ab45..0000000 --- a/src/tim/vx/internal/src/kernel/vx/clip_vx.c +++ /dev/null @@ -1,196 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_node.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include -#include "utils/vsi_nn_dtype_util_prv.h" -#include "vsi_nn_tensor_util.h" -#include "kernel/vsi_nn_kernel.h" - -typedef struct _sort_lut_s -{ - float index; - float val; -} sort_lut; - -static float clip_eval(float val, float min, float max) -{ - return vsi_nn_clamp(val, min, max); -} - -#ifdef VX_USER_LOOKUP_TABLE_SUPPORT -static int32_t _lut_comparator(const void *pa, const void *pb) -{ - sort_lut a = *(sort_lut *)pa; - sort_lut b = *(sort_lut *)pb; - float diff = a.index - b.index; - if ( diff > 0 ) - { - return 1; - } - else if ( diff < 0 ) - { - return -1; - } - - return 0; -} - -static void _set_table_lookup(float func(float, float, float), float *index, float *value, float min, float max) -{ -#define VSI_NN_MAX_LUT_SIZE (1024) -#define FLT16_MAX (57344) -#define FLT16_MIN (-57344) - uint32_t i = 0; - sort_lut *lut = (sort_lut *)calloc(VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut)); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - int16_t val = (int16_t)(i << 6); - lut[i].index = fp16_to_fp32(val); - lut[i].val = func(lut[i].index, min, max); - } - - for (i = 0x0; i < 0x10; i++) - { - lut[i].index = 0; - lut[i].val = func(lut[i].index, min, max); - } - - for (i = 0x1F0; i < 0x200; i++) - { - lut[i].index = FLT16_MAX; - lut[i].val = func(lut[i].index, min, max); - } - - for (i = 0x3F0; i < 0x400; i++) - { - lut[i].index = FLT16_MIN; - lut[i].val = func(lut[i].index, min, max); - } - - qsort(lut, VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut), _lut_comparator); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - index[i] = lut[i].index; - value[i] = lut[i].val; - } - - vsi_nn_safe_free(lut); - -#undef VSI_NN_MAX_LUT_SIZE -#undef FLT16_MIN -#undef FLT16_MAX -} -#endif - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel, - float func(float, float, float) - ) -{ -#ifdef VX_USER_LOOKUP_TABLE_SUPPORT - vx_lut lut1 = NULL; - vx_lut lut2 = NULL; - vx_node node = NULL; - float min = vsi_nn_kernel_param_get_float32( params, "min_value" ); - float max = vsi_nn_kernel_param_get_float32( params, "max_value" ); - float index[1024] = {0}; - float value[1024] = {0}; - - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) - { - return NULL; - } - - _set_table_lookup(func, index, value, min, max); - - lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); - lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); - if( NULL == lut1 || NULL == lut2 ) - { - VSILOGE("create lut object fail."); - goto OnError; - } - - vxCopyLUT(lut1, (void*)&index, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyLUT(lut2, (void*)&value, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); - - node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); - if( NULL == node ) - { - VSILOGE("Call vxTensorTableLookupLayer fail."); - goto OnError; - } - -OnError: - if (lut1) - { - vxReleaseLUT(&lut1); - lut1 = NULL; - } - if (lut2) - { - vxReleaseLUT(&lut2); - lut2 = NULL; - } - return (vsi_nn_kernel_node_t)node; -#else - return NULL; -#endif -} /* _setup() */ - -#define REGISTER_CLIP_OPENVX_KERNEL(KERNEL_NAME, UNARY_FUNC) \ - static vsi_nn_kernel_node_t 
_##KERNEL_NAME##_setup \ - ( \ - vsi_nn_graph_t * graph, \ - vsi_nn_tensor_t ** inputs, \ - size_t input_num, \ - vsi_nn_tensor_t ** outputs, \ - size_t output_num, \ - const vsi_nn_kernel_param_t * params, \ - vsi_nn_kernel_t * kernel \ - ) \ - { \ - return _setup(graph, inputs, input_num, outputs, output_num, \ - params, kernel, UNARY_FUNC); \ - } \ - REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) - -REGISTER_CLIP_OPENVX_KERNEL( clip, clip_eval ) - -#undef REGISTER_CLIP_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/convolutional.c b/src/tim/vx/internal/src/kernel/vx/convolutional.c index 2bb2248..8cc0794 100644 --- a/src/tim/vx/internal/src/kernel/vx/convolutional.c +++ b/src/tim/vx/internal/src/kernel/vx/convolutional.c @@ -120,6 +120,67 @@ static vsi_bool _build_vx_deconv2d_param return TRUE; } /* _build_vx_deconv2d_param() */ +#if VX_CONV_3D_API_SUPPORT +static vsi_bool _build_vx_conv3d_param + ( + vx_nn_convolution_3d_params_t * param, + int32_t stride_d, int32_t stride_h, int32_t stride_w, + int32_t pad_d_front, int32_t pad_d_end, + int32_t pad_h_front, int32_t pad_h_end, + int32_t pad_w_front, int32_t pad_w_end, + int32_t dilation_d, int32_t dilation_h, int32_t dilation_w, + int32_t multiplier, + vsi_enum overflow_policy, vsi_enum rounding_policy, + vsi_enum down_scale_size_rounding + ) +{ + VSI_ASSERT( stride_d > 0 ); + VSI_ASSERT( stride_h > 0 ); + VSI_ASSERT( stride_w > 0 ); + VSI_ASSERT( pad_d_front >= 0 ); + VSI_ASSERT( pad_d_end >= 0 ); + VSI_ASSERT( pad_h_front >= 0 ); + VSI_ASSERT( pad_h_end >= 0 ); + VSI_ASSERT( pad_w_front >= 0 ); + VSI_ASSERT( pad_w_end >= 0 ); + VSI_ASSERT( dilation_d >= 0 ); + VSI_ASSERT( dilation_h >= 0 ); + VSI_ASSERT( dilation_w >= 0 ); + VSI_ASSERT( multiplier >= 0 ); + + param->padding_d_front = (uint32_t)pad_d_front; + param->padding_d_rear = (uint32_t)pad_d_end; + param->padding_h_top = (uint32_t)pad_h_front; + param->padding_h_bottom = (uint32_t)pad_h_end; + param->padding_w_left = (uint32_t)pad_w_front; + param->padding_w_right = (uint32_t)pad_w_end; + + if( dilation_d > 0 ) + { + param->dilation_d = (uint32_t)(dilation_d - 1); + } + if( dilation_h > 0 ) + { + param->dilation_h = (uint32_t)(dilation_h - 1); + } + if( dilation_w > 0 ) + { + param->dilation_w = (uint32_t)(dilation_w - 1); + } + + param->overflow_policy = (vx_enum)overflow_policy; + param->rounding_policy = (vx_enum)rounding_policy; + param->down_scale_size_rounding = (vx_enum)down_scale_size_rounding; + param->depth_multiplier = multiplier; + + param->stride_w = (uint32_t)stride_w; + param->stride_h = (uint32_t)stride_h; + param->stride_d = (uint32_t)stride_d; + + return TRUE; +} /* _build_vx_conv2d_param() */ +#endif + static vx_tensor _expand_tensor_dim ( vx_tensor tensor, vsi_ssize_t * shape, size_t rank, vsi_ssize_t expand_dim ) { @@ -149,12 +210,7 @@ static vx_tensor _expand_tensor_dim { new_shape[cnt] = 1; } -#ifdef VSI_40BIT_VA_SUPPORT - return vxReshapeTensor( tensor, (vsi_size_t*)new_shape, rank + 1 ); -#else - return vxReshapeTensor( tensor, (int32_t*)new_shape, (uint32_t)(rank + 1) ); -#endif - + return vsi_nn_safe_reshape_tensor( tensor, (void*)new_shape, (vsi_size_t)(rank + 1) , sizeof(new_shape[0])); } /* _expand_tensor_dim() */ @@ -181,7 +237,6 @@ static vx_tensor _expand_tensor_dim vsi_nn_kernel_t * kernel \ ) - REGISTER_CONV_OPENVX_KERNEL( conv1d ) { vx_node node = NULL; @@ -191,11 +246,11 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) _build_vx_conv2d_param( &vxparam, - vsi_nn_kernel_param_get_int32(params, "stride"), 1, + 1, 
vsi_nn_kernel_param_get_int32(params, "stride"), + 0, 0, vsi_nn_kernel_param_get_int32(params, "pad_front"), vsi_nn_kernel_param_get_int32(params, "pad_end"), - 0,0, - vsi_nn_kernel_param_get_int32(params, "dilation"), 1, + 1, vsi_nn_kernel_param_get_int32(params, "dilation"), 0, vsi_nn_kernel_param_get_int32(params, "overflow_policy"), vsi_nn_kernel_param_get_int32(params, "rounding_policy"), @@ -203,12 +258,12 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) ); temp_tensors[0] = _expand_tensor_dim( inputs[0]->t, - (vsi_ssize_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 ); + (vsi_ssize_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 1 ); CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final ); if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) { temp_tensors[1] = _expand_tensor_dim( inputs[1]->t, - (vsi_ssize_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 ); + (vsi_ssize_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 1 ); CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand kernel dim fail.", final ); } else @@ -222,8 +277,9 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) memcpy(&attr, &inputs[1]->attr, sizeof(vsi_nn_tensor_attr_t)); - attr.size[0] = 1; - for (i = 1; i <= inputs[1]->attr.dim_num; i++) + attr.size[0] = inputs[1]->attr.size[0]; + attr.size[1] = 1; + for (i = 2; i <= inputs[1]->attr.dim_num; i++) { attr.size[i] = inputs[1]->attr.size[i - 1]; } @@ -235,7 +291,7 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) } temp_tensors[2] = _expand_tensor_dim( outputs[0]->t, - (vsi_ssize_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 0 ); + (vsi_ssize_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 1 ); CHECK_PTR_FAIL_GOTO( temp_tensors[2], "Expand output dim fail.", final ); node = vxConvolutionLayer( graph->g, @@ -266,11 +322,11 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) _build_vx_conv2d_param( &vxparam, - vsi_nn_kernel_param_get_int32(params, "stride"), 1, + 1, vsi_nn_kernel_param_get_int32(params, "stride"), + 0, 0, vsi_nn_kernel_param_get_int32(params, "pad_front"), vsi_nn_kernel_param_get_int32(params, "pad_end"), - 0,0, - vsi_nn_kernel_param_get_int32(params, "dilation"), 1, + 1, vsi_nn_kernel_param_get_int32(params, "dilation"), vsi_nn_kernel_param_get_int32(params, "multiplier"), vsi_nn_kernel_param_get_int32(params, "overflow_policy"), vsi_nn_kernel_param_get_int32(params, "rounding_policy"), @@ -278,26 +334,23 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) ); temp_tensors[0] = _expand_tensor_dim( inputs[0]->t, - (vsi_ssize_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 ); + (vsi_ssize_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 1 ); CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final ); if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) { vsi_size_t new_w_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; uint32_t new_w_rank = 4; - new_w_shape[0] = 1; - new_w_shape[1] = inputs[1]->attr.size[0]; + new_w_shape[0] = inputs[1]->attr.size[0]; + new_w_shape[1] = 1; new_w_shape[2] = 1; for (i = 1; i < (int32_t)(inputs[1]->attr.dim_num); i++) { new_w_shape[2] *= inputs[1]->attr.size[i]; } new_w_shape[3] = 1; -#ifdef VSI_40BIT_VA_SUPPORT - temp_tensors[1] = vxReshapeTensor( inputs[1]->t, new_w_shape, new_w_rank ); -#else - temp_tensors[1] = vxReshapeTensor( inputs[1]->t, (vx_int32*)new_w_shape, (vx_uint32)new_w_rank ); -#endif + temp_tensors[1] = vsi_nn_safe_reshape_tensor( inputs[1]->t, + (void*)new_w_shape, (vsi_size_t)new_w_rank, sizeof(new_w_shape[0]) ); CHECK_PTR_FAIL_GOTO( 
temp_tensors[1], "Expand kernel dim fail.", final ); } @@ -312,8 +365,8 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) memcpy(&attr, &inputs[1]->attr, sizeof(vsi_nn_tensor_attr_t)); - attr.size[0] = 1; - attr.size[1] = inputs[1]->attr.size[0]; + attr.size[0] = inputs[1]->attr.size[0]; + attr.size[1] = 1; attr.size[2] = 1; for (i = 1; i < inputs[1]->attr.dim_num; i++) { @@ -329,7 +382,7 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) } temp_tensors[2] = _expand_tensor_dim( outputs[0]->t, - (vsi_ssize_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 0 ); + (vsi_ssize_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 1 ); CHECK_PTR_FAIL_GOTO( temp_tensors[2], "Expand output dim fail.", final ); if( need_explicit_padding ) @@ -404,7 +457,7 @@ REGISTER_CONV_OPENVX_KERNEL( conv2d ) inputs[0]->t, inputs[1]->t, inputs[2] ? inputs[2]->t : NULL, (vx_nn_convolution_params_t *)&vxparam, sizeof( vx_nn_convolution_params_ext2_t ), - outputs[2]->t + outputs[0]->t ); return (vsi_nn_kernel_node_t)node; @@ -435,7 +488,7 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv2d ) inputs[0]->t, inputs[1]->t, inputs[2] ? inputs[2]->t : NULL, (vx_nn_convolution_params_t *)&vxparam, sizeof( vx_nn_convolution_params_ext2_t ), - outputs[2]->t + outputs[0]->t ); return (vsi_nn_kernel_node_t)node; @@ -486,4 +539,41 @@ final: return (vsi_nn_kernel_node_t)node; } /* deconvolution1d*/ +REGISTER_CONV_OPENVX_KERNEL( conv3d ) +{ + vx_node node = NULL; +#if VX_CONV_3D_API_SUPPORT + vx_nn_convolution_3d_params_t vxparam; + memset(&vxparam, 0, sizeof(vxparam)); + + _build_vx_conv3d_param( + &vxparam, + vsi_nn_kernel_param_get_int32(params, "stride_d"), + vsi_nn_kernel_param_get_int32(params, "stride_h"), + vsi_nn_kernel_param_get_int32(params, "stride_w"), + vsi_nn_kernel_param_get_int32(params, "pad_front"), + vsi_nn_kernel_param_get_int32(params, "pad_end"), + vsi_nn_kernel_param_get_int32(params, "pad_top"), + vsi_nn_kernel_param_get_int32(params, "pad_bottom"), + vsi_nn_kernel_param_get_int32(params, "pad_left"), + vsi_nn_kernel_param_get_int32(params, "pad_right"), + vsi_nn_kernel_param_get_int32(params, "dilation_d"), + vsi_nn_kernel_param_get_int32(params, "dilation_h"), + vsi_nn_kernel_param_get_int32(params, "dilation_w"), + vsi_nn_kernel_param_get_int32(params, "depth_multiplier"), + vsi_nn_kernel_param_get_int32(params, "overflow_policy"), + vsi_nn_kernel_param_get_int32(params, "rounding_policy"), + vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding") + ); + + node = vxConv3dLayer( graph->g, + inputs[0]->t, inputs[1]->t, inputs[2] ? 
inputs[2]->t : NULL, + &vxparam, + sizeof( vxparam), + outputs[0]->t + ); +#endif + return (vsi_nn_kernel_node_t)node; +} /* depthwise_conv2d*/ + #undef REGISTER_CONV_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c index 492f8f7..30b1257 100644 --- a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c @@ -1,6 +1,6 @@ /**************************************************************************** * -* Copyright (c) 2020 Vivante Corporation +* Copyright (c) 2021 Vivante Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -30,173 +30,9 @@ #include #include "utils/vsi_nn_dtype_util_prv.h" #include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" - -typedef struct _sort_lut_s -{ - float index; - float val; -} sort_lut; - -static float exp_eval(float val, float alpha) -{ - return expf(val); -} - -static float log_eval(float data, float alpha) -{ - return logf(data); -} - -static float elu_eval(float data, float alpha) -{ - return data >=0 ? data : expf(data) * alpha - alpha; -} - -static float neg_eval(float data, float alpha) -{ - return data * -1.0f; -} - -static float hsigmoid_eval(float data, float alpha) -{ - data = (float)(0.2 * data + 0.5); - data = vsi_nn_clamp(data, 0, 1); - - return data; -} - -static float soft_plus_eval(float data, float alpha) -{ - return log_eval(exp_eval(data, alpha) + 1, alpha); -} - -static float mish_eval(float data, float alpha) -{ - data = (float)(data * tanh(soft_plus_eval(data, alpha))); - - return data; -} - -static float erf_eval(float x) -{ - float res = 0; - float tmp = x; - float factorial = 1; /*n!*/ - float x_pow = x; - int32_t one = 1; - int32_t n = 1; - - if (x <= -3) - { - return -1; - } - else if (x >= 3) - { - return 1; - } - - while (vsi_abs(tmp) > 1e-5) - { - res += tmp; - - factorial *= n; - one *= -1; - x_pow *= x * x; - tmp = one / factorial * x_pow / ( 2 * n + 1); - - n ++; - } -#define VSI_MUL2_RSQRTPI (1.1283791670955126f) - - res *= VSI_MUL2_RSQRTPI; - - return res; -} - -static float gelu_eval(float data, float alpha) -{ - data = (float)(0.5f * data * (1 + erf_eval(data / (float)sqrt(2.0f)))); - - return data; -} - - -#define VSI_SQRT_2_RCP_PI 0.7978845834732056f -static float hgelu_eval(float data, float alpha) -{ - float cdf = (float)(0.5f * (1.0f + tanh((VSI_SQRT_2_RCP_PI * - (data + 0.044715f * data * data * data))))); - - return data * cdf; -} - -#ifdef VX_USER_LOOKUP_TABLE_SUPPORT -static int32_t _lut_comparator(const void *pa, const void *pb) -{ - sort_lut a = *(sort_lut *)pa; - sort_lut b = *(sort_lut *)pb; - float diff = a.index - b.index; - if ( diff > 0 ) - { - return 1; - } - else if ( diff < 0 ) - { - return -1; - } - - return 0; -} - -static void _set_unary_table_lookup(float func(float, float), float *index, float *value, float alpha) -{ -#define VSI_NN_MAX_LUT_SIZE (1024) -#define FLT16_MAX (57344) -#define FLT16_MIN (-57344) - uint32_t i = 0; - sort_lut *lut = (sort_lut *)calloc(VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut)); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - int16_t val = (int16_t)(i << 6); - lut[i].index = fp16_to_fp32(val); - lut[i].val = func(lut[i].index, alpha); - } - - for (i = 0x0; i < 0x10; i++) - { - lut[i].index = 0; - lut[i].val = func(lut[i].index, alpha); - } - - for (i = 0x1F0; i < 0x200; i++) - { - 
lut[i].index = FLT16_MAX; - lut[i].val = func(lut[i].index, alpha); - } - - for (i = 0x3F0; i < 0x400; i++) - { - lut[i].index = FLT16_MIN; - lut[i].val = func(lut[i].index, alpha); - } - - qsort(lut, VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut), _lut_comparator); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - index[i] = lut[i].index; - value[i] = lut[i].val; - } - - vsi_nn_safe_free(lut); - -#undef VSI_NN_MAX_LUT_SIZE -#undef FLT16_MIN -#undef FLT16_MAX -} -#endif +#include "kernel/vsi_nn_kernel_lut.h" static vsi_nn_kernel_node_t _setup ( @@ -207,16 +43,33 @@ static vsi_nn_kernel_node_t _setup size_t output_num, const vsi_nn_kernel_param_t * params, vsi_nn_kernel_t * kernel, - float func(float, float) + vsi_enum lut_type ) { #ifdef VX_USER_LOOKUP_TABLE_SUPPORT vx_lut lut1 = NULL; vx_lut lut2 = NULL; vx_node node = NULL; - float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); - float index[1024] = {0}; - float value[1024] = {0}; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_lut_params lut_param; + + lut_param.act_type = lut_type; + if (lut_type == VSI_NN_KERNEL_LUT_RELU_KERAS) + { + lut_param.params[0] = vsi_nn_kernel_param_get_float32( params, "alpha" ); + lut_param.params[1] = vsi_nn_kernel_param_get_float32( params, "max_value" ); + lut_param.params[2] = vsi_nn_kernel_param_get_float32( params, "threshold" ); + } + else if (lut_type == VSI_NN_KERNEL_LUT_CLIP) + { + lut_param.params[0] = vsi_nn_kernel_param_get_float32( params, "min_value" ); + lut_param.params[1] = vsi_nn_kernel_param_get_float32( params, "max_value" ); + } + else if (lut_type == VSI_NN_KERNEL_LUT_ELU || lut_type == VSI_NN_KERNEL_LUT_HSIGMOID) + { + lut_param.params[0] = vsi_nn_kernel_param_get_float32( params, "alpha" ); + lut_param.params[1] = vsi_nn_kernel_param_get_float32( params, "beta" ); + } if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) @@ -224,27 +77,25 @@ static vsi_nn_kernel_node_t _setup return NULL; } - _set_unary_table_lookup(func, index, value, alpha); - - lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); - lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); + lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_KERNEL_LUT_MAX_SIZE); + lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_KERNEL_LUT_MAX_SIZE); if( NULL == lut1 || NULL == lut2 ) { VSILOGE("create lut object fail."); - goto OnError; + goto final; } - vxCopyLUT(lut1, (void*)&index, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyLUT(lut2, (void*)&value, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + status = vsi_nn_kernel_lut(lut1, lut2, &lut_param); + CHECK_STATUS_FAIL_GOTO(status, final); node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); - if( NULL == node ) + if ( NULL == node ) { VSILOGW("Call vxTensorTableLookupLayer fail."); - goto OnError; + goto final; } -OnError: +final: if (lut1) { vxReleaseLUT(&lut1); @@ -262,7 +113,7 @@ OnError: #endif } /* _setup() */ -#define REGISTER_ELTWISE_UNARY_OPENVX_KERNEL(KERNEL_NAME, UNARY_FUNC) \ +#define REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL(KERNEL_NAME, UNARY_FUNC) \ static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ ( \ vsi_nn_graph_t * graph, \ @@ -279,14 +130,136 @@ OnError: } \ REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) -REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( mish, mish_eval ) -//REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( exp, exp_eval ) -REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( log, log_eval ) -REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( elu, elu_eval ) 
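Both the per-file table builders being removed in this hunk and their replacement, vsi_nn_kernel_lut, index the 1024-entry table by the top ten bits of an fp16 pattern (i << 6 keeps the sign, the exponent and the four high mantissa bits), then overwrite the subnormal bins with 0 and the Inf/NaN bins with the fp16 max/min. A self-contained decoder for such bit patterns, offered as an illustration rather than the library's fp16_to_fp32:

#include <math.h>
#include <stdint.h>

/* Sketch of IEEE fp16 -> float decoding for the LUT index patterns. */
static float fp16_bits_to_float_sketch(uint16_t h)
{
    uint32_t sign = (h >> 15) & 0x1;
    uint32_t exp  = (h >> 10) & 0x1F;
    uint32_t man  = h & 0x3FF;
    float val;

    if (exp == 0)        /* zero and subnormals: man * 2^-24 */
        val = ldexpf((float)man, -24);
    else if (exp == 31)  /* Inf/NaN: these bins are clamped by the LUT code */
        val = man ? NAN : INFINITY;
    else                 /* normal: (1024 + man) * 2^(exp - 25) */
        val = ldexpf((float)(man | 0x400), (int)exp - 25);

    return sign ? -val : val;
}
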
-REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( neg, neg_eval ) -REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( hard_sigmoid, hsigmoid_eval ) -REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( gelu, gelu_eval ) -REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( hard_gelu, hgelu_eval ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( mish, VSI_NN_KERNEL_LUT_MISH ) +//REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( exp, VSI_NN_KERNEL_LUT_EXP ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( log, VSI_NN_KERNEL_LUT_LOG ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( elu, VSI_NN_KERNEL_LUT_ELU ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( neg, VSI_NN_KERNEL_LUT_NEG ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( hard_sigmoid, VSI_NN_KERNEL_LUT_HSIGMOID ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( gelu, VSI_NN_KERNEL_LUT_GELU ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( hard_gelu, VSI_NN_KERNEL_LUT_HGELU ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( erf, VSI_NN_KERNEL_LUT_ERF ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( relu_keras, VSI_NN_KERNEL_LUT_RELU_KERAS ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( clip, VSI_NN_KERNEL_LUT_CLIP ) + +#undef REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL + +#define REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( abs ) +{ + vx_node node = NULL; + vsi_size_t input_size[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t dims = 0; + vx_tensor input = NULL, input0 = NULL; + vx_tensor output = NULL, output0 = NULL; + + if (inputs[0]->attr.dim_num > 4) + { + input_size[0] = vsi_nn_GetElementNum(inputs[0]) / + inputs[0]->attr.size[inputs[0]->attr.dim_num - 1]; + input_size[1] = inputs[0]->attr.size[inputs[0]->attr.dim_num - 1]; + dims = 2; +#ifdef VSI_40BIT_VA_SUPPORT + input = vxReshapeTensor(inputs[0]->t, input_size, dims); + output = vxReshapeTensor(outputs[0]->t, input_size, dims); +#else + input = vxReshapeTensor(inputs[0]->t, (vx_int32*)input_size, (vx_uint32)dims); + output = vxReshapeTensor(outputs[0]->t, (vx_int32*)input_size, (vx_uint32)dims); +#endif + input0 = input; + output0 = output; + } + else + { + input0 = inputs[0]->t; + output0 = outputs[0]->t; + } + + node = vxLeakyReluLayer( + graph->g, + input0, + -1, + output0 + ); + + if (input) vxReleaseTensor(&input); + if (output) vxReleaseTensor(&output); + + return (vsi_nn_kernel_node_t)node; +} /* abs() */ + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( linear ) +{ + vx_node node = NULL; + float a_v = vsi_nn_kernel_param_get_float32( params, "a_v" ); + float b_v = vsi_nn_kernel_param_get_float32( params, "b_v" ); + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LINEAR, + a_v, + b_v, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* linear() */ + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( sigmoid ) +{ + vx_node node = NULL; + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LOGISTIC, + 0, + 0, + outputs[0]->t + ); + + return 
(vsi_nn_kernel_node_t)node; +} /* sigmoid() */ + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( tanh ) +{ + vx_node node = NULL; + float scale_a = vsi_nn_kernel_param_get_float32( params, "scale_a" ); + float scale_b = vsi_nn_kernel_param_get_float32( params, "scale_b" ); + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_HYPERBOLIC_TAN, + scale_a, + scale_b, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* tanh() */ #undef REGISTER_ELTWISE_UNARY_OPENVX_KERNEL - diff --git a/src/tim/vx/internal/src/kernel/vx/erf_vx.c b/src/tim/vx/internal/src/kernel/vx/erf_vx.c deleted file mode 100644 index f33fa23..0000000 --- a/src/tim/vx/internal/src/kernel/vx/erf_vx.c +++ /dev/null @@ -1,217 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_node.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include -#include "utils/vsi_nn_dtype_util_prv.h" -#include "vsi_nn_tensor_util.h" -#include "kernel/vsi_nn_kernel.h" - -typedef struct _sort_lut_s -{ - float index; - float val; -} sort_lut; - -static float erf_eval(float _x) -{ - float x = vsi_clamp(_x, -2, 2); - float res = 0; - float tmp = x; - float factorial = 1; /*n!*/ - float x_pow = x; - int32_t one = 1; - int32_t n = 1; - - while (vsi_abs(tmp) > 1e-5) - { - res += tmp; - - factorial *= n; - one *= -1; - x_pow *= x * x; - tmp = one / factorial * x_pow / ( 2 * n + 1); - - n ++; - } -#define MUL2_RSQRTPI (1.1283791670955126f) - - res *= MUL2_RSQRTPI; - - return res; -} - -#ifdef VX_USER_LOOKUP_TABLE_SUPPORT -static int32_t _lut_comparator(const void *pa, const void *pb) -{ - sort_lut a = *(sort_lut *)pa; - sort_lut b = *(sort_lut *)pb; - float diff = a.index - b.index; - if ( diff > 0 ) - { - return 1; - } - else if ( diff < 0 ) - { - return -1; - } - - return 0; -} - -static void _set_table_lookup(float func(float), float *index, float *value) -{ -#define VSI_NN_MAX_LUT_SIZE (1024) -#define FLT16_MAX (57344) -#define FLT16_MIN (-57344) - uint32_t i = 0; - sort_lut *lut = (sort_lut *)calloc(VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut)); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - int16_t val = (int16_t)(i << 6); - lut[i].index = fp16_to_fp32(val); - lut[i].val = func(lut[i].index); - } - - for (i = 0x0; i < 0x10; i++) - { - lut[i].index = 0; - lut[i].val = func(lut[i].index); - } - - for (i = 0x1F0; i < 0x200; i++) - { - lut[i].index = FLT16_MAX; - lut[i].val = func(lut[i].index); - } - - for (i = 0x3F0; i < 0x400; i++) - { - lut[i].index = FLT16_MIN; - lut[i].val = func(lut[i].index); - } - - qsort(lut, VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut), _lut_comparator); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - index[i] = lut[i].index; - value[i] = lut[i].val; - } - - vsi_nn_safe_free(lut); - -#undef VSI_NN_MAX_LUT_SIZE -#undef FLT16_MIN -#undef FLT16_MAX -} -#endif - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel, - float func(float) - ) -{ -#ifdef VX_USER_LOOKUP_TABLE_SUPPORT - vx_lut lut1 = NULL; - vx_lut lut2 = NULL; - vx_node node = NULL; - float index[1024] = {0}; - float value[1024] = {0}; - - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) - { - return NULL; - } - - _set_table_lookup(func, index, value); - - lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); - lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); - if( NULL == lut1 || NULL == lut2 ) - { - VSILOGE("create lut object fail."); - goto OnError; - } - - vxCopyLUT(lut1, (void*)&index, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyLUT(lut2, (void*)&value, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); - - node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); - if( NULL == node ) - { - VSILOGE("Call vxTensorTableLookupLayer fail."); - goto OnError; - } - -OnError: - if (lut1) - { - vxReleaseLUT(&lut1); - lut1 = NULL; - } - if (lut2) - { - vxReleaseLUT(&lut2); - lut2 = NULL; - } - return (vsi_nn_kernel_node_t)node; -#else - return NULL; -#endif -} /* _setup() */ - 
-#define REGISTER_CLIP_OPENVX_KERNEL(KERNEL_NAME, UNARY_FUNC) \ - static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ - ( \ - vsi_nn_graph_t * graph, \ - vsi_nn_tensor_t ** inputs, \ - size_t input_num, \ - vsi_nn_tensor_t ** outputs, \ - size_t output_num, \ - const vsi_nn_kernel_param_t * params, \ - vsi_nn_kernel_t * kernel \ - ) \ - { \ - return _setup(graph, inputs, input_num, outputs, output_num, \ - params, kernel, UNARY_FUNC); \ - } \ - REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) - -REGISTER_CLIP_OPENVX_KERNEL( erf, erf_eval ) - -#undef REGISTER_CLIP_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/l2_normalize_vx.c b/src/tim/vx/internal/src/kernel/vx/l2_normalize_vx.c new file mode 100644 index 0000000..5133dab --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/l2_normalize_vx.c @@ -0,0 +1,127 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#define REGISTER_L2_NORMALIZE_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_L2_NORMALIZE_OPENVX_KERNEL( l2_norm ) +{ + vx_node node = NULL; + int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis"); +#ifdef VX_L2NORM_AXIS_PARAMETER_SUPPORT + vx_nn_l2norm_params_t param; + + param.axis = axis; + + node = vxL2NormalizeLayer2( + graph->g, + inputs[0]->t, + ¶m, + sizeof(vx_nn_l2norm_params_t), + outputs[0]->t + ); +#else + uint32_t i = 0; + uint32_t sizes[VSI_NN_MAX_DIM_NUM] = {1}; + uint32_t innerSize = 1; + uint32_t outerSize = 1; + uint32_t axisSize = 1; + vx_tensor vx_input = NULL; + vx_tensor vx_output = NULL; + vx_tensor input = inputs[0]->t; + vx_tensor output = outputs[0]->t; + + if (axis != 2) + { + axisSize = inputs[0]->attr.size[axis]; + + for (i = 0; i < (uint32_t)axis; i++) + { + innerSize *= inputs[0]->attr.size[i]; + } + + for (i = (uint32_t)(axis + 1); i < inputs[0]->attr.dim_num; i++) + { + outerSize *= inputs[0]->attr.size[i]; + } + + sizes[0] = innerSize; + sizes[1] = 1; + sizes[2] = axisSize; + sizes[3] = outerSize; + + vx_input = vxReshapeTensor(inputs[0]->t, (int32_t *)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); + vx_output = vxReshapeTensor(outputs[0]->t, (int32_t *)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); + + input = vx_input; + output = vx_output; + } + + node = vxL2NormalizeLayer( + graph->g, + input, + output + ); + + if (vx_input) vxReleaseTensor(&vx_input); + if (vx_output) vxReleaseTensor(&vx_output); +#endif + + if( NULL == node ) + { + VSILOGE("Call vxSoftmaxLayer2 fail.(softmax)"); + } + + return (vsi_nn_kernel_node_t)node; +} /* l2_norm() */ + +#undef REGISTER_L2_NORMALIZE_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c b/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c deleted file mode 100644 index 9c5b0cb..0000000 --- a/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c +++ /dev/null @@ -1,200 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_node.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include -#include "utils/vsi_nn_dtype_util_prv.h" -#include "vsi_nn_tensor_util.h" -#include "kernel/vsi_nn_kernel.h" - -typedef struct _sort_lut_s -{ - float index; - float val; -} sort_lut; - -static float relu_keras_eval(float val, float alpha, float threshold, float max) -{ - val = vsi_nn_min(val, max); - val = val < threshold ? alpha * (val - threshold) : val; - return val; -} - -#ifdef VX_USER_LOOKUP_TABLE_SUPPORT -static int32_t _lut_comparator(const void *pa, const void *pb) -{ - sort_lut a = *(sort_lut *)pa; - sort_lut b = *(sort_lut *)pb; - float diff = a.index - b.index; - if ( diff > 0 ) - { - return 1; - } - else if ( diff < 0 ) - { - return -1; - } - - return 0; -} - -static void _set_table_lookup(float func(float, float, float, float), - float *index, float *value, float alpha, float threshold, float max) -{ -#define VSI_NN_MAX_LUT_SIZE (1024) -#define FLT16_MAX (57344) -#define FLT16_MIN (-57344) - uint32_t i = 0; - sort_lut *lut = (sort_lut *)calloc(VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut)); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - int16_t val = (int16_t)(i << 6); - lut[i].index = fp16_to_fp32(val); - lut[i].val = func(lut[i].index, alpha, threshold, max); - } - - for (i = 0x0; i < 0x10; i++) - { - lut[i].index = 0; - lut[i].val = func(lut[i].index, alpha, threshold, max); - } - - for (i = 0x1F0; i < 0x200; i++) - { - lut[i].index = FLT16_MAX; - lut[i].val = func(lut[i].index, alpha, threshold, max); - } - - for (i = 0x3F0; i < 0x400; i++) - { - lut[i].index = FLT16_MIN; - lut[i].val = func(lut[i].index, alpha, threshold, max); - } - - qsort(lut, VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut), _lut_comparator); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - index[i] = lut[i].index; - value[i] = lut[i].val; - } - - vsi_nn_safe_free(lut); - -#undef VSI_NN_MAX_LUT_SIZE -#undef FLT16_MIN -#undef FLT16_MAX -} -#endif - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel, - float func(float, float, float, float) - ) -{ -#ifdef VX_USER_LOOKUP_TABLE_SUPPORT - vx_lut lut1 = NULL; - vx_lut lut2 = NULL; - vx_node node = NULL; - float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); - float max = vsi_nn_kernel_param_get_float32( params, "max_value" ); - float threshold = vsi_nn_kernel_param_get_float32( params, "threshold" ); - float index[1024] = {0}; - float value[1024] = {0}; - - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) - { - return NULL; - } - - _set_table_lookup(func, index, value, alpha, threshold, max); - - lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); - lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 
1024); - if( NULL == lut1 || NULL == lut2 ) - { - VSILOGE("create lut object fail."); - goto OnError; - } - - vxCopyLUT(lut1, (void*)&index, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyLUT(lut2, (void*)&value, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); - - node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); - if( NULL == node ) - { - VSILOGW("Call vxTensorTableLookupLayer fail."); - goto OnError; - } - -OnError: - if (lut1) - { - vxReleaseLUT(&lut1); - lut1 = NULL; - } - if (lut2) - { - vxReleaseLUT(&lut2); - lut2 = NULL; - } - return (vsi_nn_kernel_node_t)node; -#else - return NULL; -#endif -} /* _setup() */ - -#define REGISTER_KERAS_RELU_OPENVX_KERNEL(KERNEL_NAME, UNARY_FUNC) \ - static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ - ( \ - vsi_nn_graph_t * graph, \ - vsi_nn_tensor_t ** inputs, \ - size_t input_num, \ - vsi_nn_tensor_t ** outputs, \ - size_t output_num, \ - const vsi_nn_kernel_param_t * params, \ - vsi_nn_kernel_t * kernel \ - ) \ - { \ - return _setup(graph, inputs, input_num, outputs, output_num, \ - params, kernel, UNARY_FUNC); \ - } \ - REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) - -REGISTER_KERAS_RELU_OPENVX_KERNEL( relu_keras, relu_keras_eval ) - -#undef REGISTER_KERAS_RELU_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/softmax_vx.c b/src/tim/vx/internal/src/kernel/vx/softmax_vx.c new file mode 100644 index 0000000..f097fbb --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/softmax_vx.c @@ -0,0 +1,122 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +#define REGISTER_SOFTMAX_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_SOFTMAX_OPENVX_KERNEL( softmax ) +{ + vx_node node = NULL; + float beta = vsi_nn_kernel_param_get_float32(params, "beta"); + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t rank_in = 0; + int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis"); + int32_t new_axis = 0; + size_t size = sizeof(vx_nn_softmax_params_t); +#ifdef VX_SOFTMAX_AXIS_PARAMETER_SUPPORT + vx_nn_softmax_params_ext_t paramExt; + vx_nn_softmax_params_t *param = (vx_nn_softmax_params_t *)&paramExt; + paramExt.base.beta = beta; + paramExt.axis = axis; + size = sizeof(vx_nn_softmax_params_ext_t); +#else + vx_nn_softmax_params_t base; + vx_nn_softmax_params_t *param = &base; + + memset(&base, 0, sizeof(vx_nn_softmax_params_t)); + base.beta = beta; +#endif + + vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rank_in, &new_axis); + + if (new_axis == 1) + { + int32_t i = 0; + new_axis ++; + rank_in ++; + for (i = rank_in - 1; i > 1; i--) + { + shapes[0][i] = shapes[0][i - 1]; + } + shapes[0][1] = 1; + } + +#ifdef VX_SOFTMAX_AXIS_PARAMETER_SUPPORT + paramExt.axis = new_axis; +#endif + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], rank_in ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[0], rank_in ); + + node = vxSoftmaxLayer2( graph->g, + reshape_tensors[0]->t, + param, + size, + reshape_tensors[1]->t); + if( NULL == node ) + { + VSILOGE("Call vxSoftmaxLayer2 fail.(softmax)"); + } + + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + + return (vsi_nn_kernel_node_t)node; +} /* softmax() */ + +#undef REGISTER_SOFTMAX_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/square_vx.c b/src/tim/vx/internal/src/kernel/vx/square_vx.c index 839890b..572737c 100644 --- a/src/tim/vx/internal/src/kernel/vx/square_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/square_vx.c @@ -1,6 +1,6 @@ /**************************************************************************** * -* Copyright (c) 2020 Vivante Corporation +* Copyright (c) 2021 Vivante Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -30,85 +30,9 @@ #include #include "utils/vsi_nn_dtype_util_prv.h" #include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" - -typedef struct _sort_lut_s -{ - float index; - float 
val; -} sort_lut; - -static float square_eval(float x) -{ - return x * x; -} - -#ifdef VX_USER_LOOKUP_TABLE_SUPPORT -static int32_t _lut_comparator(const void *pa, const void *pb) -{ - sort_lut a = *(sort_lut *)pa; - sort_lut b = *(sort_lut *)pb; - float diff = a.index - b.index; - if ( diff > 0 ) - { - return 1; - } - else if ( diff < 0 ) - { - return -1; - } - - return 0; -} - -static void _set_table_lookup(float func(float), float *index, float *value) -{ -#define VSI_NN_MAX_LUT_SIZE (1024) -#define FLT16_MAX (57344) -#define FLT16_MIN (-57344) - uint32_t i = 0; - sort_lut *lut = (sort_lut *)calloc(VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut)); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - int16_t val = (int16_t)(i << 6); - lut[i].index = fp16_to_fp32(val); - lut[i].val = func(lut[i].index); - } - - for (i = 0x0; i < 0x10; i++) - { - lut[i].index = 0; - lut[i].val = func(lut[i].index); - } - - for (i = 0x1F0; i < 0x200; i++) - { - lut[i].index = FLT16_MAX; - lut[i].val = func(lut[i].index); - } - - for (i = 0x3F0; i < 0x400; i++) - { - lut[i].index = FLT16_MIN; - lut[i].val = func(lut[i].index); - } - - qsort(lut, VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut), _lut_comparator); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - index[i] = lut[i].index; - value[i] = lut[i].val; - } - - vsi_nn_safe_free(lut); - -#undef VSI_NN_MAX_LUT_SIZE -#undef FLT16_MIN -#undef FLT16_MAX -} -#endif +#include "kernel/vsi_nn_kernel_lut.h" static vsi_nn_kernel_node_t _setup ( @@ -118,16 +42,15 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t ** outputs, size_t output_num, const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel, - float func(float) + vsi_nn_kernel_t * kernel ) { vx_node node = NULL; #ifdef VX_USER_LOOKUP_TABLE_SUPPORT vx_lut lut1 = NULL; vx_lut lut2 = NULL; - float index[1024] = {0}; - float value[1024] = {0}; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_lut_params lut_param; if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) @@ -135,21 +58,21 @@ static vsi_nn_kernel_node_t _setup return NULL; } - _set_table_lookup(func, index, value); + lut_param.act_type = VSI_NN_KERNEL_LUT_SQUARE; - lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); - lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); + lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_KERNEL_LUT_MAX_SIZE); + lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_KERNEL_LUT_MAX_SIZE); if( NULL == lut1 || NULL == lut2 ) { VSILOGE("create lut object fail."); - goto OnError; + goto final; } - vxCopyLUT(lut1, (void*)&index, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyLUT(lut2, (void*)&value, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + status = vsi_nn_kernel_lut(lut1, lut2, &lut_param); + CHECK_STATUS_FAIL_GOTO(status, final); node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); - if( NULL == node ) + if ( NULL == node ) { node = vxActivationLayer( graph->g, @@ -161,7 +84,7 @@ static vsi_nn_kernel_node_t _setup ); } -OnError: +final: if (lut1) { vxReleaseLUT(&lut1); @@ -187,7 +110,7 @@ OnError: #endif } /* _setup() */ -#define REGISTER_SQUARE_OPENVX_KERNEL(KERNEL_NAME, ACT_FUNC) \ +#define REGISTER_SQUARE_OPENVX_KERNEL(KERNEL_NAME) \ static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ ( \ vsi_nn_graph_t * graph, \ @@ -200,10 +123,10 @@ OnError: ) \ { \ return _setup(graph, inputs, input_num, outputs, output_num, \ - params, kernel, ACT_FUNC); \ + params, kernel); \ } \ REGISTER_BACKEND_OPENVX( KERNEL_NAME, 
_##KERNEL_NAME##_setup ) -REGISTER_SQUARE_OPENVX_KERNEL( square, square_eval ) +REGISTER_SQUARE_OPENVX_KERNEL( square ) #undef REGISTER_SQUARE_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl index 34668c1..2177669 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl @@ -56,13 +56,14 @@ inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride { int8 desc; int2 strides; + _viv_asm(COPY, desc, input, sizeof(desc)); + #if (USE_40BITS_VA==0) strides.x = desc.s1; strides.y = desc.s4; #else _viv_asm(GET_IMAGE_STRIDE, strides, input); #endif - _viv_asm(COPY, desc, input, sizeof(desc)); uint address = as_uint(desc.s0); Tensor t = diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl index bb31c02..5b90eb1 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl @@ -1,12 +1,11 @@ - -float eltwise_unary_sin(float x, float alpha) +float eltwise_unary_sin(float x, float alpha, float beta) { return native_sin(x); } #define logE (1.44269502f) #define twoLogE (logE * 2.0f) -float eltwise_unary_exp(float x, float alpha) +float eltwise_unary_exp(float x, float alpha, float beta) { x *= logE; x = exp2(x); @@ -14,13 +13,13 @@ float eltwise_unary_exp(float x, float alpha) } #define rlogE (0.693147182f) -float eltwise_unary_log(float x, float alpha) +float eltwise_unary_log(float x, float alpha, float beta) { x = log2(x); return x * rlogE; } -float eltwise_unary_elu(float val, float alpha) +float eltwise_unary_elu(float val, float alpha, float beta) { float x = val * logE; x = exp2(x) * alpha - alpha; @@ -28,14 +27,14 @@ float eltwise_unary_elu(float val, float alpha) return val < 0 ? 
x : val; } -float eltwise_unary_neg(float x, float alpha) +float eltwise_unary_neg(float x, float alpha, float beta) { return x * -1; } -float eltwise_unary_hard_sigmoid(float x, float alpha) +float eltwise_unary_hard_sigmoid(float x, float alpha, float beta) { - x = 0.2 * x + 0.5; + x = alpha * x + beta; x = clamp(x, 0, 1); return x; } @@ -57,14 +56,14 @@ float _tanh(float x, float alpha) return (2 * x - 1); } -float eltwise_unary_mish(float x, float alpha) +float eltwise_unary_mish(float x, float alpha, float beta) { float y = _softrelu(x, alpha); x = x * _tanh(y, alpha); return x; } -float eltwise_unary_round(float x, float alpha) +float eltwise_unary_round(float x, float alpha, float beta) { return convert_float(convert_int_rte(x)); } @@ -98,7 +97,7 @@ float erf_eval(float x) return res * MUL2_RSQRTPI; } #define RSQRT2 (0.70710678118654752440084436210485f) -float eltwise_unary_gelu(float x, float alpha) +float eltwise_unary_gelu(float x, float alpha, float beta) { x = 0.5f * x * (1 + erf_eval(x * RSQRT2)); @@ -106,7 +105,7 @@ float eltwise_unary_gelu(float x, float alpha) } #define SQRT_2_RCP_PI 0.7978845834732056f -float eltwise_unary_hard_gelu(float x, float alpha) +float eltwise_unary_hard_gelu(float x, float alpha, float beta) { float cdf = 0.5f + 0.5f * _tanh(SQRT_2_RCP_PI * (x + 0.044715f * x * x * x), 0); @@ -122,7 +121,8 @@ __kernel void func_name##_F32toF32 \ float inputTail, \ float outputScale, \ float outputZP, \ - float alpha \ + float alpha, \ + float beta \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ @@ -130,7 +130,7 @@ __kernel void func_name##_F32toF32 \ float4 src = read_imagef(input, coord); \ \ float4 dst = 0; \ - dst.x = eltwise_unary_##func_name(src.x, alpha); \ + dst.x = eltwise_unary_##func_name(src.x, alpha, beta); \ \ write_imagef(output, coord, dst.xxxx); \ } @@ -154,7 +154,8 @@ __kernel void func_name##_F32toF32_2D \ float inputTail, \ float outputScale, \ float outputZP, \ - float alpha \ + float alpha, \ + float beta \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ @@ -162,7 +163,7 @@ __kernel void func_name##_F32toF32_2D \ float4 src = read_imagef(input, coord); \ \ float4 dst = 0; \ - dst.x = eltwise_unary_##func_name(src.x, alpha); \ + dst.x = eltwise_unary_##func_name(src.x, alpha, beta); \ \ write_imagef(output, coord, dst.xxxx); \ } @@ -186,7 +187,8 @@ __kernel void func_name##_U8toU8 \ float inputTail, \ float outputScale, \ float outputZP, \ - float alpha \ + float alpha, \ + float beta \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ @@ -194,7 +196,7 @@ __kernel void func_name##_U8toU8 \ uint4 src = read_imageui(input, coord); \ float4 data = convert_float4(src) * inputScale - inputTail; \ \ - data.x = eltwise_unary_##func_name(data.x, alpha); \ + data.x = eltwise_unary_##func_name(data.x, alpha, beta); \ uint4 dst = convert_uint4(data * outputScale + outputZP); \ \ write_imageui(output, coord, dst); \ @@ -219,7 +221,8 @@ __kernel void func_name##_U8toU8_2D \ float inputTail, \ float outputScale, \ float outputZP, \ - float alpha \ + float alpha, \ + float beta \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ @@ -227,7 +230,7 @@ __kernel void func_name##_U8toU8_2D \ uint4 src = read_imageui(input, coord); \ float4 data = convert_float4(src) * inputScale - inputTail; \ \ - data.x = eltwise_unary_##func_name(data.x, alpha); \ + data.x = eltwise_unary_##func_name(data.x, alpha, beta); \ uint4 dst = convert_uint4(data * 
outputScale + outputZP); \ \ write_imageui(output, coord, dst); \ @@ -251,7 +254,8 @@ __kernel void neg_I32toI32 float inputTail, float outputScale, float outputZP, - float alpha + float alpha, + float beta ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); @@ -270,7 +274,8 @@ __kernel void neg_I32toI32_2D float inputTail, float outputScale, float outputZP, - float alpha + float alpha, + float beta ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl b/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl index 746a06e..64f6775 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl @@ -66,7 +66,10 @@ __kernel void floordiv_I32I32toU8( int4 src1; READ_IMAGEI_2DARRAY(src0, input, coord); READ_IMAGEI_2DARRAY(src1, input1, coord); - uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail); + float4 in0 = convert_float4(src0) * input0Scale + input0Tail; + float4 in1 = convert_float4(src1) * input1Scale + input1Tail; + float4 out = floor(in0 / in1) * outputScale + outputTail; + uint4 dst = convert_uint4(out); write_imageui(output, coord, dst); } @@ -84,7 +87,10 @@ __kernel void floordiv_I32I32toU8_2D( int2 coord = (int2)(get_global_id(0), get_global_id(1)); int4 src0 = read_imagei(input, coord); int4 src1 = read_imagei(input1, coord); - uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail); + float4 in0 = convert_float4(src0) * input0Scale + input0Tail; + float4 in1 = convert_float4(src1) * input1Scale + input1Tail; + float4 out = floor(in0 / in1) * outputScale + outputTail; + uint4 dst = convert_uint4(out); write_imageui(output, coord, dst); } diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation_z_h.cl b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation_z_h.cl new file mode 100644 index 0000000..dd2e562 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation_z_h.cl @@ -0,0 +1,123 @@ +#define logE (1.44269502f) +#define twoLogE (logE * 2.0f) + +float sigmoid(float x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float hard_sigmoid(float x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float tanh_func(float x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define GRUCELL_ACTIVATION_U8_F32_U8(act_name, act_func) \ +__kernel void grucell_activation_z_h_U8_F32toU8_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out, \ + float input_scale, float input_tail, float output_scale, float output_zp) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 h_tm = convert_float4(read_imageui(hstate_in, coord_in.xy)); \ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \ + \ + h_tm = h_tm * input_scale + input_tail; \ + float4 h = h0 + h1; \ + float4 z = z0 + z1; \ + z.x = act_func(z.x); \ + h = tanh_func(h.x); \ + float4 
dst = (1 - z ) * h + z * h_tm; \ + dst = dst * output_scale + output_zp; \ + uint4 result = convert_uint4_sat_rte(dst); \ + write_imageui(output, coord_in.xy, result); \ + write_imageui(hstate_out, coord_in.xy, result); \ +} +GRUCELL_ACTIVATION_U8_F32_U8(SIGMOID, sigmoid) +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) + +#define GRUCELL_ACTIVATION_F32_F32_F32(act_name, act_func) \ +__kernel void grucell_activation_z_h_F32_F32toF32_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out, \ + float input_scale, float input_tail, float output_scale, float output_zp) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \ + float4 h_tm = read_imagef(hstate_in, coord_in.xy); \ + \ + float4 h = h0 + h1; \ + float4 z = z0 + z1; \ + z.x = act_func(z.x); \ + h = tanh_func(h.x); \ + float4 dst = (1 - z ) * h + z * h_tm; \ + write_imagef(output, coord_in.xy, dst); \ + write_imagef(hstate_out, coord_in.xy, dst); \ +} + +GRUCELL_ACTIVATION_F32_F32_F32(SIGMOID, sigmoid) +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) + +#define GRUCELL_ACTIVATION_I32_F32_I32(act_name, act_func) \ +__kernel void grucell_activation_z_h_I32_F32toI32_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out, \ + float input_scale, float input_tail, float output_scale, float output_zp) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 h_tm = convert_float4(read_imagei(hstate_in, coord_in.xy)); \ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \ + \ + h_tm = h_tm * input_scale + input_tail; \ + float4 h = h0 + h1; \ + float4 z = z0 + z1; \ + z.x = act_func(z.x); \ + h = tanh_func(h.x); \ + float4 dst = (1 - z ) * h + z * h_tm; \ + dst = dst * output_scale + output_zp; \ + int4 result = convert_int4_sat_rte(dst); \ + write_imagei(output, coord_in.xy, result); \ + write_imagei(hstate_out, coord_in.xy, result); \ +} +GRUCELL_ACTIVATION_I32_F32_I32(SIGMOID, sigmoid) +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/grucell_h_times_activation_r.cl b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_h_times_activation_r.cl new file mode 100644 index 0000000..e36024f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_h_times_activation_r.cl @@ -0,0 +1,87 @@ +#define logE (1.44269502f) +#define twoLogE (logE * 2.0f) + +float sigmoid(float x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float hard_sigmoid(float x) +{ + x = 
0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} + +#define GRUCELL_H_TIMES_R_U8_F32_F32(act_name, act_func) \ +__kernel void grucell_h_times_activation_r_U8_F32toF32_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t hstate_r_conv, \ + __write_only image2d_t output, \ + float input_scale, float input_tail) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \ + float4 h_tm = convert_float4(read_imageui(hstate_in, coord_in.xy)); \ + \ + float4 r = r0 + r1; \ + r.x = act_func(r.x); \ + h_tm = h_tm * input_scale + input_tail; \ + float4 r_times_h = r * h_tm; \ + write_imagef(output, coord_in.xy, r_times_h); \ +} +GRUCELL_H_TIMES_R_U8_F32_F32(SIGMOID, sigmoid) +//GRUCELL_H_TIMES_R_U8_F32_F32(HARD_SIGMOID, hard_sigmoid) + +#define GRUCELL_H_TIMES_R_F32_F32_F32(act_name, act_func) \ +__kernel void grucell_h_times_activation_r_F32_F32toF32_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t hstate_r_conv, \ + __write_only image2d_t output, \ + float input_scale, float input_tail) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \ + float4 h_tm = read_imagef(hstate_in, coord_in.xy); \ + \ + float4 r = r0 + r1; \ + r.x = act_func(r.x); \ + float4 r_times_h = r * h_tm; \ + write_imagef(output, coord_in.xy, r_times_h); \ +} + +GRUCELL_H_TIMES_R_F32_F32_F32(SIGMOID, sigmoid) +//GRUCELL_H_TIMES_R_F32_F32_F32(HARD_SIGMOID, hard_sigmoid) + +#define GRUCELL_H_TIMES_R_I32_F32_F32(act_name, act_func) \ +__kernel void grucell_h_times_activation_r_I32_F32toI32_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t hstate_r_conv, \ + __write_only image2d_t output, \ + float input_scale, float input_tail) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \ + float4 h_tm = convert_float4(read_imagei(hstate_in, coord_in.xy)); \ + \ + float4 r = r0 + r1; \ + r.x = act_func(r.x); \ + h_tm = h_tm * input_scale + input_tail; \ + float4 r_times_h = r * h_tm; \ + write_imagef(output, coord_in.xy, r_times_h); \ +} +GRUCELL_H_TIMES_R_I32_F32_F32(SIGMOID, sigmoid) +//GRUCELL_H_TIMES_R_I32_F32_F32(HARD_SIGMOID, hard_sigmoid) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/grucell_reset_after_activation.cl b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_reset_after_activation.cl new file mode 100644 index 0000000..a47b32d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_reset_after_activation.cl @@ -0,0 +1,144 @@ +#define logE (1.44269502f) +#define twoLogE (logE * 2.0f) + +float sigmoid(float x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float hard_sigmoid(float x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float tanh_func(float x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / 
x; + return 2 * x - 1; +} + + +#define GRUCELL_ACTIVATION_U8_F32_U8(act_name, act_func) \ +__kernel void grucell_reset_after_activation_U8_F32toU8_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_r_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out, \ + float input_scale, float input_tail, float output_scale, float output_zp) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \ + float4 h_tm = convert_float4(read_imageui(hstate_in, coord_in.xy)); \ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \ + \ + float4 r = r0 + r1; \ + r.x = act_func(r.x); \ + h_tm = h_tm * input_scale + input_tail; \ + float4 r_times_h = r * h1; \ + float4 h = h0 + r_times_h; \ + float4 z = z0 + z1; \ + z.x = act_func(z.x); \ + h = tanh_func(h.x); \ + float4 dst = (1 - z ) * h + z * h_tm; \ + dst = dst * output_scale + output_zp; \ + uint4 result = convert_uint4_sat_rte(dst); \ + write_imageui(output, coord_in.xy, result); \ + write_imageui(hstate_out, coord_in.xy, result); \ +} +GRUCELL_ACTIVATION_U8_F32_U8(SIGMOID, sigmoid) +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) + +#define GRUCELL_ACTIVATION_F32_F32_F32(act_name, act_func) \ +__kernel void grucell_reset_after_activation_F32_F32toF32_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_r_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out, \ + float input_scale, float input_tail, float output_scale, float output_zp) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \ + float4 h_tm = read_imagef(hstate_in, coord_in.xy); \ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \ + \ + float4 r = r0 + r1; \ + r.x = act_func(r.x); \ + float4 r_times_h = r * h1; \ + float4 h = h0 + r_times_h; \ + float4 z = z0 + z1; \ + z.x = act_func(z.x); \ + h = tanh_func(h.x); \ + float4 dst = (1 - z ) * h + z * h_tm; \ + write_imagef(output, coord_in.xy, dst); \ + write_imagef(hstate_out, coord_in.xy, dst); \ +} + +GRUCELL_ACTIVATION_F32_F32_F32(SIGMOID, sigmoid) +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) + +#define GRUCELL_ACTIVATION_I32_F32_I32(act_name, act_func) \ +__kernel void grucell_reset_after_activation_I32_F32toI32_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t 
input_r_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_r_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out, \ + float input_scale, float input_tail, float output_scale, float output_zp) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \ + float4 h_tm = convert_float4(read_imagei(hstate_in, coord_in.xy)); \ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \ + \ + float4 r = r0 + r1; \ + r.x = act_func(r.x); \ + h_tm = h_tm * input_scale + input_tail; \ + float4 r_times_h = r * h1; \ + float4 h = h0 + r_times_h; \ + float4 z = z0 + z1; \ + z.x = act_func(z.x); \ + h = tanh_func(h.x); \ + float4 dst = (1 - z ) * h + z * h_tm; \ + dst = dst * output_scale + output_zp; \ + int4 result = convert_int4_sat_rte(dst); \ + write_imagei(output, coord_in.xy, result); \ + write_imagei(hstate_out, coord_in.xy, result); \ +} +GRUCELL_ACTIVATION_I32_F32_I32(SIGMOID, sigmoid) +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx index dea29d2..8a56bb3 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx @@ -1,6 +1,7 @@ #include "cl_viv_vx_ext.h" _viv_uniform float alpha; +_viv_uniform float beta; float4 eltwise_unary_sin(float4 x) { @@ -38,7 +39,7 @@ float4 eltwise_unary_neg(float4 x) float4 eltwise_unary_hard_sigmoid(float4 x) { - x = 0.2 * x + 0.5; + x = alpha * x + beta; x = clamp(x, 0, 1); return x; } @@ -136,7 +137,8 @@ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4; __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ int type, \ - float _alpha \ + float _alpha, \ + float _beta \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ @@ -285,7 +287,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8; __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ int type, \ - float _alpha \ + float _alpha, \ + float _beta \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx index 6da7605..3faa1f5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx @@ -1,6 +1,7 @@ #include "cl_viv_vx_ext.h" _viv_uniform float alpha; +_viv_uniform float beta; float4 eltwise_unary_sin(float4 x) { @@ -38,7 +39,7 @@ float4 eltwise_unary_neg(float4 x) float4 eltwise_unary_hard_sigmoid(float4 x) { - x = 0.2 * x + 0.5; + x = alpha * x + beta; x = clamp(x, 0, 1); return x; } @@ -136,7 +137,8 @@ __kernel void func_name##_##src_type_name##to##dst_type_name( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ int type, \ - float _alpha \ + float _alpha, \ + float _beta \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), 
get_global_id(2), 0); \ @@ -284,7 +286,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8; __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ int type, \ - float _alpha \ + float _alpha, \ + float _beta \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/extra_ending.vx b/src/tim/vx/internal/src/libnnext/ops/vx/extra_ending.vx new file mode 100644 index 0000000..52f51e5 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/extra_ending.vx @@ -0,0 +1,65 @@ +#include "cl_viv_vx_ext.h" + +__kernel void extra_ending_I16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 data; + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void extra_ending_F16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 data; + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void extra_ending_I8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_char8 data; + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void extra_ending_U8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_uchar8 data; + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16_scale.vx new file mode 100644 index 0000000..cb00ac9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16_scale.vx @@ -0,0 +1,174 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4; + +_viv_uniform float outputScale; +_viv_uniform int output_ZP; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toF16( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, + float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + 
gidy * pStride), gidz, 0, 0); + vxc_short8 src0; + vxc_half8 in_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + + vxc_float4 norm; + norm = scale_vari * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_vari * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toF16_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, + float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_short8 src0; + vxc_half8 in_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + vxc_float4 norm; + norm = scale_vari * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_vari * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toU8( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, + float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_short8 src0; + vxc_half8 in_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, 
coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_uchar16 outval; + vxc_int4 tmpVal0, tmpVal1; + float alpha = outputScale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = alpha * tmpData1 + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toU8_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, + float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_short8 src0; + vxc_half8 in_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_uchar16 outval; + vxc_int4 tmpVal0, tmpVal1; + float alpha = outputScale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = alpha * tmpData1 + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16_scale.vx new file mode 100644 index 0000000..397a5f8 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16_scale.vx @@ -0,0 +1,191 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; + +_viv_uniform float input_fl_scale; +_viv_uniform float inOut_fl_scale; +_viv_uniform float output_fl_scale; + +_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4; +_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8; + + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toF16( + image2d_array_t input, + image2d_t bias, + image2d_t scale, + image2d_t 
meanVari, + image2d_array_t output, + float eps, + int is2D, + float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_short8 src0; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toF16_2D( + image2d_array_t input, + image2d_t bias, + image2d_t scale, + image2d_t meanVari, + image2d_array_t output, + float eps, + int is2D, + float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_short8 src0; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toI16( + image2d_array_t input, + image2d_t bias, + image2d_t scale, + image2d_t meanVari, + image2d_array_t output, + float eps, + int is2D, + float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 
coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_short8 src0, src2; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + vxc_float4 norm; + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toInt16_2x8); + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toI16_2D( + image2d_array_t input, + image2d_t bias, + image2d_t scale, + image2d_t meanVari, + image2d_array_t output, + float eps, + int is2D, + float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_short8 src0, src2; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + vxc_float4 norm; + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toInt16_2x8); + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8_scale.vx new file mode 100644 index 0000000..350e425 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8_scale.vx @@ -0,0 +1,186 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform float input_fl_scale; + +_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertTrdInt8Fp32_4x4; 
+_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4; + +_viv_uniform float inOut_fl_scale; +_viv_uniform float output_fl_scale; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toF16( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_char16 src0; + vxc_short8 outval; + vxc_half8 dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); + + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toF16_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_char16 src0; + vxc_short8 outval; + vxc_half8 dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + 
VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toI8( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_char16 src0, src2; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toI8_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = 
(int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_char16 src0, src2; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx index af20584..c08a996 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx @@ -243,6 +243,87 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8_ float alpha = scale_inOut * scale_vari; bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toU8( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int 
gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_uchar16 src0, src2; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + float alpha = scale_inOut * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toU8_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_uchar16 src0, src2; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + float alpha = scale_inOut * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx index 3c1b892..a1f4ce0 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx @@ -112,3 +112,96 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16 _viv_asm(COPY, outval, dst, 16); VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); } + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toF16( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_uchar16 src0; + vxc_short8 outval; + vxc_half8 dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 tmpVal0, tmpVal1; + float alpha = input_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toF16_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_uchar16 src0; + vxc_short8 outval; + vxc_half8 dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 
tmpVal0, tmpVal1; + float alpha = input_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx new file mode 100644 index 0000000..c1266fc --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx @@ -0,0 +1,126 @@ +#include "cl_viv_vx_ext.h" + +#define logE (1.44269502f) +#define twoLogE (logE * 2.0f) + +float4 sigmoid_func(float4 x) +{ + x *= -logE; + x = 1.0f + exp2(x); + return 1.0f / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tanh_func(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1.0f / x; + return 2 * x - 1; +} + +_viv_uniform VXC_512Bits uniF16PlusF16_0_4x4; +_viv_uniform VXC_512Bits uniF16PlusF16_1_4x4; +_viv_uniform VXC_512Bits uniConvertF16_0_4x4; +_viv_uniform VXC_512Bits uniConvertF16_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +#define GRUCELL_F16_F16TOF16(act_name, act_func) \ +__kernel void grucell_activation_z_h_F16_F16toF16_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out \ + ) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \ + vxc_half8 src0, src1, src2, src3, src4, src5, src6; \ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, v2, 16); \ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, v4, 16); \ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src5, v5, 16); \ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src6, v6, 16); \ + VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, v3, 16); \ + \ + float4 h; \ + VXC_DP4x4(h, src2, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniF16PlusF16_0_4x4); \ + float4 z; \ + VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ + z = act_func(z); \ + h = tanh_func(h); \ + float4 h_tm; \ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + float4 result = (1 - z) * h + z * h_tm; \ + half4 dst0; \ + _viv_asm(CONV_RTE, dst0, result); \ + vxc_half4 dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + vxc_short4 dst; \ + _viv_asm(COPY, dst, dst1, 8); \ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func) + +_viv_uniform float hstate_in_scale; +_viv_uniform float hstate_in_tail; +_viv_uniform float output_scale; +_viv_uniform float output_zp; +#define GRUCELL_QNT_F16TO_QNT(name0, name1, act_name, act_func, src0_type, dst_type) \ +__kernel void grucell_activation_z_h_##name0##_F16to##name1##_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out \ + ) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + src0_type src3; \ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \ + vxc_half8 src0, src1, src2, src4, src5, src6; \ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, v2, 16); \ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, v4, 16); \ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src5, v5, 16); \ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src6, v6, 16); \ + VXC_ReadImage(src3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 h; \ + VXC_DP4x4(h, src2, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ + float4 z; \ + VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ + z = act_func(z); \ + h = tanh_func(h); \ + float4 h_tm; \ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + h_tm = h_tm * hstate_in_scale + hstate_in_tail; \ + float4 result = (1 - z) * h + z * h_tm; \ + result = result * output_scale + output_zp; \ + int4 dst0; \ + _viv_asm(CONV_RTE, dst0, result); \ + dst_type dst; \ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8) +GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8) +GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_h_times_activation_r.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_h_times_activation_r.vx new file mode 100644 index 0000000..a9c8d44 --- /dev/null +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_h_times_activation_r.vx @@ -0,0 +1,96 @@ +#include "cl_viv_vx_ext.h" + +#define logE (1.44269502f) +#define twoLogE (logE * 2.0f) + +float4 sigmoid_func(float4 x) +{ + x *= -logE; + x = 1.0f + exp2(x); + return 1.0f / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} + +_viv_uniform VXC_512Bits uniF16PlusF16_0_4x4; +_viv_uniform VXC_512Bits uniF16PlusF16_1_4x4; +_viv_uniform VXC_512Bits uniConvertF16_0_4x4; +_viv_uniform VXC_512Bits uniConvertF16_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +#define GRUCELL_F16_F16TOF16(act_name, act_func) \ +__kernel void grucell_h_times_activation_r_F16_F16toF16_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t hstate_r_conv, \ + __write_only image2d_t output \ + ) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \ + vxc_half8 src0, src1, src2, src3, src4, src5, src6; \ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, v0, 16); \ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, v1, 16); \ + VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, v3, 16); \ + \ + float4 r; \ + VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ + r = act_func(r); \ + float4 h_tm; \ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + float4 result = r * h_tm; \ + half4 dst0; \ + _viv_asm(CONV_RTE, dst0, result); \ + vxc_half4 dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + vxc_short4 dst; \ + _viv_asm(COPY, dst, dst1, 8); \ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func) + +_viv_uniform float hstate_in_scale; +_viv_uniform float hstate_in_tail; +#define GRUCELL_QNT_F16TO_F16(name0, act_name, act_func, src0_type) \ +__kernel void grucell_h_times_activation_r_##name0##_F16toF16_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t hstate_r_conv, \ + __write_only image2d_t output \ + ) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + src0_type src3; \ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \ + vxc_half8 src0, src1, src2, src4, src5, src6; \ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, v0, 16); \ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, v1, 16); \ + VXC_ReadImage(src3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 r; \ + VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ + r = act_func(r); \ + float4 h_tm; \ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + h_tm = h_tm * hstate_in_scale + hstate_in_tail; \ + float4 result = r * h_tm; \ + half4 dst0; \ + _viv_asm(CONV_RTE, dst0, result); \ + vxc_half8 dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + vxc_short4 dst; \ + _viv_asm(COPY, dst, dst1, 
8); \ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GRUCELL_QNT_F16TO_F16(U8, SIGMOID, sigmoid_func, vxc_uchar8) +GRUCELL_QNT_F16TO_F16(I8, SIGMOID, sigmoid_func, vxc_char8) +GRUCELL_QNT_F16TO_F16(I16, SIGMOID, sigmoid_func, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx new file mode 100644 index 0000000..77fdcc9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx @@ -0,0 +1,148 @@ +#include "cl_viv_vx_ext.h" + +#define logE (1.44269502f) +#define twoLogE (logE * 2.0f) + +float4 sigmoid_func(float4 x) +{ + x *= -logE; + x = 1.0f + exp2(x); + return 1.0f / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tanh_func(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1.0f / x; + return 2 * x - 1; +} + +_viv_uniform VXC_512Bits uniF16PlusF16_0_4x4; +_viv_uniform VXC_512Bits uniF16PlusF16_1_4x4; +_viv_uniform VXC_512Bits uniConvertF16_0_4x4; +_viv_uniform VXC_512Bits uniConvertF16_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +#define GRUCELL_F16_F16TOF16(act_name, act_func) \ +__kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_r_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out \ + ) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \ + vxc_half8 src0, src1, src2, src3, src4, src5, src6; \ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, v0, 16); \ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, v1, 16); \ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, v2, 16); \ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, v4, 16); \ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src5, v5, 16); \ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src6, v6, 16); \ + VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, v3, 16); \ + \ + float4 r; \ + VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ + r = act_func(r); \ + float4 h0, h1; \ + VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + float4 h = h0 + r * h1; \ + float4 z; \ + VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ + z = act_func(z); \ + h = tanh_func(h); \ + float4 h_tm; \ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + float4 result = (1 - z) * h + z * h_tm; \ + half4 dst0; \ + _viv_asm(CONV_RTE, dst0, result); \ + vxc_half4 dst1; \ + 
VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + vxc_short4 dst; \ + _viv_asm(COPY, dst, dst1, 8); \ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func) + +_viv_uniform float hstate_in_scale; +_viv_uniform float hstate_in_tail; +_viv_uniform float output_scale; +_viv_uniform float output_zp; +#define GRUCELL_QNT_F16TO_QNT(name0, name1, act_name, act_func, src0_type, dst_type) \ +__kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_r_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out \ + ) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + src0_type src3; \ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \ + vxc_half8 src0, src1, src2, src4, src5, src6; \ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, v0, 16); \ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, v1, 16); \ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, v2, 16); \ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, v4, 16); \ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src5, v5, 16); \ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src6, v6, 16); \ + VXC_ReadImage(src3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 r; \ + VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ + r = act_func(r); \ + float4 h0, h1; \ + VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + float4 h = h0 + r * h1; \ + float4 z; \ + VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ + z = act_func(z); \ + h = tanh_func(h); \ + float4 h_tm; \ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + h_tm = h_tm * hstate_in_scale + hstate_in_tail; \ + float4 result = (1 - z) * h + z * h_tm; \ + result = result * output_scale + output_zp; \ + int4 dst0; \ + _viv_asm(CONV_RTE, dst0, result); \ + dst_type dst; \ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8) +GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8) +GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8) diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx index a90f1ff..c358585 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx @@ -254,9 +254,9 @@ L2NORMSCALE_AXIS0_2D(I8, F16, I8, char, vxc_char8, vxc_char8, r_inputScale, __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \ void l2normalizescale_axis0_U8_##in1_name##to##out_name##_2D \ (\ - __read_only image2d_array_t input,\ - __read_only image2d_array_t scale,\ - __write_only image2d_array_t output,\ + __read_only image2d_t input,\ + __read_only image2d_t scale,\ + __write_only image2d_t output,\ int axis\ )\ { \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray_2.vx new file mode 100644 index 0000000..3396163 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray_2.vx @@ -0,0 +1,69 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8; +_viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8; +_viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4; +_viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4; +_viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4; + +__kernel void pre_process_gray_4over3_U8toU8 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float mean, + float f32Var + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_uchar16 src0, src1, src2, src3; + + VXC_ReadImage(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_in.xy = (coord_in.xy >> 2) * 3; + coord_in.zw = coord_in.yy + (int2)(1, 2); + + vxc_uchar16 dst0, dst1, dst2; + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8); + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); + + VXC_WriteImage(output, coord_in.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_in.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_in.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pre_process_gray_half_U8toU8 + ( + __read_only image2d_array_t input, + __write_only 
image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float mean, + float f32Var + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_uchar16 src0; + + VXC_ReadImage(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_in.xy = coord_in.xy >> 1; + + VXC_WriteImage(output, coord_in.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_nhwc.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_nhwc.vx new file mode 100644 index 0000000..f0b3417 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_nhwc.vx @@ -0,0 +1,204 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniResize_x2_nhwc2_0_4x8; +_viv_uniform VXC_512Bits uniResize_x2_nhwc2_1_4x8; +_viv_uniform int out_height; + +__kernel void resize_bilinear_nhwc_U8toU8_2x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), 0); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), 0); + coord_in.x = ((coord_out.x * 2 - 1) >> 2) - 1; + coord_in.y = ((coord_out.y * 2 - 1) >> 2); + coord_in.x = coord_out.x == 0 ? -2 : coord_in.x; + coord_in.y = coord_out.y == 0 ? -1 : coord_in.y; + + vxc_uchar16 in0, in1, in2, in3, result; + + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in3, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8); + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_1_4x8); + VXC_WriteImage(output, coord_out, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(result, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8); + VXC_DP4x8(result, in2, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_1_4x8); + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(result, in1, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8); + VXC_DP4x8(result, in1, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_1_4x8); + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(result, in3, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8); + VXC_DP4x8(result, in3, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_1_4x8); + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l00_2x8; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l01_2x8; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l02_2x8; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l03_2x8; +_viv_uniform 
VXC_512Bits uniResize_x3_nhwc2_l10_4x4; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l11_4x4; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l12_4x4; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l13_4x4; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l14_4x4; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l15_4x4; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l16_4x4; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l17_4x4; +__kernel void resize_bilinear_nhwc_U8toU8_3x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0); + coord_in.x = (short)(coord_out.x - 1) / (short)6 * 2; + coord_in.x = coord_out.x == 0 ? -2 : coord_in.x; + coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6; + coord_in.y = coord_out.y == 0 ? -1 : coord_in.y; + + vxc_uchar16 in0, in1, in2, in3, dst0, dst1, dst2; + + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in3, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_out.zw = coord_out.xy + (int2)(16, 1); + + VXC_DP4x4(dst0, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4); + VXC_DP4x4(dst0, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l11_4x4); + VXC_DP4x4(dst0, in1, in0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l12_4x4); + VXC_DP4x4(dst0, in1, in0, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l13_4x4); + VXC_DP4x4(dst1, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l14_4x4); + VXC_DP4x4(dst1, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l15_4x4); + VXC_DP4x4(dst1, in1, in0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l16_4x4); + VXC_DP4x4(dst1, in1, in0, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l17_4x4); + + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.zy, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst0, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l00_2x8); + VXC_DP2x8(dst0, in1, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l01_2x8); + VXC_DP2x8(dst1, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l02_2x8); + VXC_DP2x8(dst1, in1, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l03_2x8); + VXC_WriteImage(output, coord_out.xw, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.zw, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0)); + coord_out.yw += 2; + + VXC_DP4x4(dst0, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4); + VXC_DP4x4(dst0, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l11_4x4); + VXC_DP4x4(dst0, in1, in2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l12_4x4); + 
VXC_DP4x4(dst0, in1, in2, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l13_4x4); + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l14_4x4); + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l15_4x4); + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l16_4x4); + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l17_4x4); + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.zy, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0)); + + VXC_DP4x4(dst0, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4); + VXC_DP4x4(dst0, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l11_4x4); + VXC_DP4x4(dst0, in2, in1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l12_4x4); + VXC_DP4x4(dst0, in2, in1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l13_4x4); + VXC_DP4x4(dst1, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l14_4x4); + VXC_DP4x4(dst1, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l15_4x4); + VXC_DP4x4(dst1, in2, in1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l16_4x4); + VXC_DP4x4(dst1, in2, in1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l17_4x4); + VXC_WriteImage(output, coord_out.xw, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.zw, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0)); + coord_out.yw += 2; + + VXC_DP2x8(dst0, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l00_2x8); + VXC_DP2x8(dst0, in2, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l01_2x8); + VXC_DP2x8(dst1, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l02_2x8); + VXC_DP2x8(dst1, in2, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l03_2x8); + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.zy, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0)); + + VXC_DP4x4(dst0, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4); + VXC_DP4x4(dst0, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l11_4x4); + VXC_DP4x4(dst0, in2, in3, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l12_4x4); + VXC_DP4x4(dst0, in2, in3, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l13_4x4); + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l14_4x4); + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l15_4x4); + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l16_4x4); + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l17_4x4); + VXC_WriteImage(output, coord_out.xw, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.zw, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l00_4x8; +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l01_4x8; +_viv_uniform VXC_512Bits 
uniResize_x4_nhwc2_l10_4x8; +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l11_4x8; +__kernel void resize_bilinear_nhwc_U8toU8_4x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), -1, 0, 0); + coord_in.x = ((coord_out.x - 3) >> 3) * 2; + coord_in.y = (coord_out.y * 2 - 3) >> 3; + coord_in.x = coord_out.x == 0 ? -2 : coord_in.x; + coord_in.y = coord_out.y == 0 ? -1 : coord_in.y; + + vxc_uchar16 in0, in1, in2, in3, dst0, dst1; + + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in3, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_out.zw = coord_out.yy + (int2)(1, 2); + + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8); + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l01_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l11_4x8); + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + + coord_out.yz = coord_out.yz + (int2)(3, 3); + + VXC_DP4x8(dst0, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8); + VXC_DP4x8(dst0, in2, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l11_4x8); + VXC_DP4x8(dst1, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8); + VXC_DP4x8(dst1, in2, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l01_4x8); + VXC_WriteImage(output, coord_out.xw, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xy, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + + coord_out.yw = coord_out.yw + (int2)(3, 3); + + VXC_DP4x8(dst0, in1, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8); + VXC_DP4x8(dst0, in1, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l01_4x8); + VXC_DP4x8(dst1, in1, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8); + VXC_DP4x8(dst1, in1, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l11_4x8); + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + + coord_out.zw = coord_out.zw + (int2)(3, 3); + + VXC_DP4x8(dst0, in3, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8); + VXC_DP4x8(dst0, in3, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l11_4x8); + VXC_DP4x8(dst1, in3, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8); + VXC_DP4x8(dst1, in3, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), 
uniResize_x4_nhwc2_l01_4x8); + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update.vx index 8532ae0..13cee71 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update.vx @@ -60,6 +60,63 @@ __kernel void scatter_nd_update_F16F16toF16( VXC_WriteImage(output, coord, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); } +__kernel void scatter_nd_update_F16F16toU8( + __read_only image2d_t input0, + __read_only image2d_t input1, + __read_only image2d_t input2, + image2d_array_t output, + int width, + int area, + int vol, + int coord_dim + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int cnt = 0; + + vxc_short8 tmpVal = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_half8 sum; + _viv_asm(COPY, sum, tmpVal, 16); + Image img1 = create_image_from_image2d(input1, 4); + __global int* index_ptr = (__global int*)img1.ptr; + for(int i = 0; i < index_num; i++) + { + //int4 indice = read_imagei(input1, (int2)(0, i)); + int4 indice = vload4(0, index_ptr + offset_idx); + index_ptr += coord_dim; + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; + if(gidy == idx) + { + vxc_half8 src; + VXC_ReadImage(tmpVal, input2, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + cnt++; + _viv_asm(COPY, src, tmpVal, 16); + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8); + } + } + int2 coord = (int2)(gidx, gidy); + vxc_ushort8 ms0; + vxc_uchar8 dst; + if(cnt == 0) + { + vxc_half8 src; + VXC_ReadImage(tmpVal, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, ms0, multAndoutZP0, 16); + _viv_asm(COPY, src, tmpVal, 16); + VXC_DP2x8(dst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniU8MulAndPostShift_0_Lo_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + else + { + _viv_asm(COPY, ms0, multAndoutZP1, 16); + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniU8MulAndPostShift_1_Lo_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + #define SCATTER_ND_UPDATE_QINT(src0_type_name, src2_type_name, out_type_name, data_type) \ __kernel void scatter_nd_update_##src0_type_name##src2_type_name##to##out_type_name##( \ __read_only image2d_t input0, \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_big.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_big.vx index 122fddb..5a0c5ce 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_big.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_big.vx @@ -11,6 +11,11 @@ _viv_uniform int offsetZ; _viv_uniform int offsetW; _viv_uniform int offset_idx; +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift_1_Lo_2x8; +_viv_uniform int2 multAndoutZP0; +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp + __kernel void scatter_nd_update_F16F16toF16_big( __read_only image2d_t input0, __read_only image2d_t input1, @@ -62,3 +67,67 @@ __kernel void scatter_nd_update_F16F16toF16_big( } output_ptr[loc] = dst; } + +__kernel void 
scatter_nd_update_F16F16toU8_big( + __read_only image2d_t input0, + __read_only image2d_t input1, + __read_only image2d_t input2, + image2d_t output, + int width, + int area, + int vol, + int coord_dim + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int cnt = 0; + + vxc_short8 tmpVal = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_half8 sum; + _viv_asm(COPY, sum, tmpVal, 16); + Image img1 = create_image_from_image2d(input1, 4); + Image img2 = create_image_from_image2d(input2, 2); + Image img3 = create_image_from_image2d(output, 1); + + __global int* index_ptr = (__global int*)img1.ptr; + __global short* update_ptr = (__global short*)img2.ptr; + __global uchar* output_ptr = (__global uchar*)img3.ptr; + for(int i = 0; i < index_num; i++) + { + int4 indice = vload4(0, index_ptr + offset_idx); + index_ptr += coord_dim; + + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; + if(gidy == idx) + { + vxc_half8 src; + short tmpData = update_ptr[i * update_width + gidx]; + cnt++; + _viv_asm(COPY, src, tmpData, 4); + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8); + } + } + short dst; + vxc_ushort8 ms0; + int loc = gidy * output_width+ gidx; + if(cnt == 0) + { + vxc_half8 src; + Image img0 = create_image_from_image2d(input0, 2); + __global short* ref_ptr = (__global short*)img0.ptr; + short tmpData = ref_ptr[loc]; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + _viv_asm(COPY, src, tmpData, 4); + VXC_DP2x8(dst, src, ms0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), + uniU8MulAndPostShift_0_Lo_2x8); + output_ptr[loc] = dst; + } + else + { + _viv_asm(COPY, ms0, multAndoutZP1, 16); + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), + uniU8MulAndPostShift_1_Lo_2x8); + output_ptr[loc] = dst; + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/tile.vx b/src/tim/vx/internal/src/libnnext/ops/vx/tile.vx index 7fd4c58..70c303b 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/tile.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/tile.vx @@ -46,11 +46,15 @@ __kernel void tile_remain##name2##_##name0##to##name1( \ { \ coord_out.x = coord.x + x * width; \ if (isLastItem) \ + { \ VXC_WriteImage2DArray(output, coord_out, src, \ VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \ + } \ else \ + { \ VXC_WriteImage2DArray(output, coord_out, src, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ } \ } \ } \ @@ -103,9 +107,13 @@ __kernel void tile_remain##name2##_##name0##to##name1##_2D( \ do \ { \ if (isLastItem) \ + { \ VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \ + } \ else \ + { \ VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ coord.x += width; \ } while (coord.x < output_width); \ coord.x = get_global_id(0); \ @@ -165,6 +173,3 @@ __kernel void tile_1toN_##name0##to##name1##_2D( \ } TILE_2D_1TON(U8, U8, vxc_uchar8) TILE_2D_1TON(I16, I16, vxc_short8) - - - diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/tile_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/tile_mix.vx index b23c1cd..dba960c 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/tile_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/tile_mix.vx @@ -57,11 +57,15 @@ __kernel void tile_remain##name2##_##name0##to##name1( \ { \ coord_out.x = coord.x + x * width; \ if (isLastItem) \ + { \ VXC_WriteImage2DArray(output, coord_out, dst, \ VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \ + } \ else \ + { \ 
VXC_WriteImage2DArray(output, coord_out, dst, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ } \ } \ } \ @@ -114,9 +118,13 @@ __kernel void tile_remain##name2##_##name0##to##name1##_2D( \ do \ { \ if (isLastItem) \ + { \ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \ + } \ else \ + { \ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ coord.x += width; \ } while (coord.x < output_width); \ coord.x = get_global_id(0); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx index 99ac9fb..f6ccacc 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx @@ -64,13 +64,13 @@ inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride { int8 desc; int2 strides; + _viv_asm(COPY, desc, input, sizeof(desc)); #if (USE_40BITS_VA==0) strides.x = desc.s1; strides.y = desc.s4; #else _viv_asm(GET_IMAGE_STRIDE, strides, input); #endif - _viv_asm(COPY, desc, input, sizeof(desc)); uint address = as_uint(desc.s0); Tensor t = diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index 06624a5..324dade 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -4040,6 +4040,7 @@ __kernel void detect_post_box_U8_U8toF32(\n\ static const char eltwise_unary_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float alpha;\n\ +_viv_uniform float beta;\n\ \n\ float4 eltwise_unary_sin(float4 x)\n\ {\n\ @@ -4077,7 +4078,7 @@ float4 eltwise_unary_neg(float4 x)\n\ \n\ float4 eltwise_unary_hard_sigmoid(float4 x)\n\ {\n\ - x = 0.2 * x + 0.5;\n\ + x = alpha * x + beta;\n\ x = clamp(x, 0, 1);\n\ return x;\n\ }\n\ @@ -4175,7 +4176,8 @@ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int type, \\\n\ - float _alpha \\\n\ + float _alpha, \\\n\ + float _beta \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ @@ -4324,7 +4326,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int type, \\\n\ - float _alpha \\\n\ + float _alpha, \\\n\ + float _beta \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ @@ -4372,6 +4375,7 @@ ELTSISE_UNARY_BF16_2D(hard_gelu)\n\ static const char eltwise_unary_3d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float alpha;\n\ +_viv_uniform float beta;\n\ \n\ float4 eltwise_unary_sin(float4 x)\n\ {\n\ @@ -4409,7 +4413,7 @@ float4 eltwise_unary_neg(float4 x)\n\ \n\ float4 eltwise_unary_hard_sigmoid(float4 x)\n\ {\n\ - x = 0.2 * x + 0.5;\n\ + x = alpha * x + beta;\n\ x = clamp(x, 0, 1);\n\ return x;\n\ }\n\ @@ -4507,7 +4511,8 @@ __kernel void func_name##_##src_type_name##to##dst_type_name( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int type, \\\n\ - float _alpha \\\n\ + float _alpha, \\\n\ + float _beta \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ @@ -4655,7 +4660,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, 
\\\n\ int type, \\\n\ - float _alpha \\\n\ + float _alpha, \\\n\ + float _beta \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ @@ -6733,6 +6739,182 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toU8 VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of group_normalization_f16_vx*/ +static const char group_normalization_f16_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4;\n\ +\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform int output_ZP;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ + float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ +\n\ + vxc_float4 norm;\n\ + norm = scale_vari * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_vari * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toF16_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ + float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, 
tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ + vxc_float4 norm;\n\ + norm = scale_vari * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_vari * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toU8(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ + float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_uchar16 outval;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + float alpha = outputScale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ +\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toU8_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ + float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = 
scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_uchar16 outval;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + float alpha = outputScale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of group_normalization_f16_scale_vx*/ + static const char group_normalization_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ @@ -7073,6 +7255,198 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toI1 VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of group_normalization_i16_vx*/ +static const char group_normalization_i16_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +\n\ +_viv_uniform float input_fl_scale;\n\ +_viv_uniform float inOut_fl_scale;\n\ +_viv_uniform float output_fl_scale;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8;\n\ +\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toF16(\n\ + image2d_array_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D,\n\ + float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ +\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, 
coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toF16_2D(\n\ + image2d_array_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D,\n\ + float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toI16(\n\ + image2d_array_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D,\n\ + float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_short8 src0, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ + vxc_float4 norm;\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toInt16_2x8);\n\ + 
VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toI16_2D(\n\ + image2d_array_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D,\n\ + float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_short8 src0, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ + vxc_float4 norm;\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toInt16_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of group_normalization_i16_scale_vx*/ + static const char group_normalization_i8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ @@ -7392,6 +7766,194 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toI8_ }\n\ "; /* end of group_normalization_i8_vx*/ +static const char group_normalization_i8_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform float input_fl_scale;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertTrdInt8Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4;\n\ +\n\ +_viv_uniform float inOut_fl_scale;\n\ +_viv_uniform float output_fl_scale;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_char16 src0;\n\ + vxc_short8 outval;\n\ + vxc_half8 dst;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 
0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ +\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toF16_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_char16 src0;\n\ + vxc_short8 outval;\n\ + vxc_half8 dst;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + norm = alpha * tmpData2 
+ bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toI8(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_char16 src0, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toI8_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_char16 src0, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, 
norm;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of group_normalization_i8_scale_vx*/ + static const char group_normalization_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ @@ -7652,6 +8214,87 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8_ tmpVal1 = convert_int4_rte(norm);\n\ VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toU8(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_uchar16 src0, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, 
tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toU8_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_uchar16 src0, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of group_normalization_u8_vx*/ static const char group_normalization_u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -7768,6 +8411,99 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16 _viv_asm(COPY, outval, dst, 16);\n\ VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_uchar16 
src0;\n\ + vxc_short8 outval;\n\ + vxc_half8 dst;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toF16_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_uchar16 src0;\n\ + vxc_short8 outval;\n\ + vxc_half8 dst;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + 
norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ "; /* end of group_normalization_u8_f16_vx*/ static const char grucell_activation_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -7943,6 +8679,134 @@ __kernel void grucell_activation_sma_F16_F16_F16toF16_2D\n\ \n\ "; /* end of grucell_activation_sma_vx*/ +static const char grucell_activation_z_h_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define logE (1.44269502f)\n\ +#define twoLogE (logE * 2.0f)\n\ +\n\ +float4 sigmoid_func(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1.0f + exp2(x);\n\ + return 1.0f / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tanh_func(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1.0f / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniF16PlusF16_0_4x4;\n\ +_viv_uniform VXC_512Bits uniF16PlusF16_1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertF16_0_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertF16_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +#define GRUCELL_F16_F16TOF16(act_name, act_func) \\\n\ +__kernel void grucell_activation_z_h_F16_F16toF16_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \\\n\ + vxc_half8 src0, src1, src2, src3, src4, src5, src6; \\\n\ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, v2, 16); \\\n\ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, v4, 16); \\\n\ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src5, v5, 16); \\\n\ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src6, v6, 16); \\\n\ + VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, v3, 16); \\\n\ + \\\n\ + float4 h; \\\n\ + VXC_DP4x4(h, src2, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + float4 z; \\\n\ + VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + z = act_func(z); \\\n\ + h = tanh_func(h); \\\n\ + float4 h_tm; \\\n\ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + 
float4 result = (1 - z) * h + z * h_tm; \\\n\ + half4 dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, result); \\\n\ + vxc_half4 dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + vxc_short4 dst; \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func)\n\ +\n\ +_viv_uniform float hstate_in_scale;\n\ +_viv_uniform float hstate_in_tail;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +#define GRUCELL_QNT_F16TO_QNT(name0, name1, act_name, act_func, src0_type, dst_type) \\\n\ +__kernel void grucell_activation_z_h_##name0##_F16to##name1##_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src0_type src3; \\\n\ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \\\n\ + vxc_half8 src0, src1, src2, src4, src5, src6; \\\n\ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, v2, 16); \\\n\ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, v4, 16); \\\n\ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src5, v5, 16); \\\n\ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src6, v6, 16); \\\n\ + VXC_ReadImage(src3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 h; \\\n\ + VXC_DP4x4(h, src2, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + float4 z; \\\n\ + VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + z = act_func(z); \\\n\ + h = tanh_func(h); \\\n\ + float4 h_tm; \\\n\ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + h_tm = h_tm * hstate_in_scale + hstate_in_tail; \\\n\ + float4 result = (1 - z) * h + z * h_tm; \\\n\ + result = result * output_scale + output_zp; \\\n\ + int4 dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, result); \\\n\ + dst_type dst; \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8)\n\ +GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8)\n\ +"; /* end of grucell_activation_z_h_vx*/ + static const char grucell_cdnn_activation_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ #define logE (1.44269502f)\n\ @@ -8733,6 +9597,254 @@ __kernel void grucell_activation_cdnn_U8_U8_U8_to_U8\n\ \n\ "; /* end of grucell_cdnn_activation_u8_vx*/ +static const char 
grucell_h_times_activation_r_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define logE (1.44269502f)\n\ +#define twoLogE (logE * 2.0f)\n\ +\n\ +float4 sigmoid_func(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1.0f + exp2(x);\n\ + return 1.0f / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniF16PlusF16_0_4x4;\n\ +_viv_uniform VXC_512Bits uniF16PlusF16_1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertF16_0_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertF16_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +#define GRUCELL_F16_F16TOF16(act_name, act_func) \\\n\ +__kernel void grucell_h_times_activation_r_F16_F16toF16_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __write_only image2d_t output \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \\\n\ + vxc_half8 src0, src1, src2, src3, src4, src5, src6; \\\n\ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, v0, 16); \\\n\ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, v1, 16); \\\n\ + VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, v3, 16); \\\n\ + \\\n\ + float4 r; \\\n\ + VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + r = act_func(r); \\\n\ + float4 h_tm; \\\n\ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + float4 result = r * h_tm; \\\n\ + half4 dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, result); \\\n\ + vxc_half4 dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + vxc_short4 dst; \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func)\n\ +\n\ +_viv_uniform float hstate_in_scale;\n\ +_viv_uniform float hstate_in_tail;\n\ +#define GRUCELL_QNT_F16TO_F16(name0, act_name, act_func, src0_type) \\\n\ +__kernel void grucell_h_times_activation_r_##name0##_F16toF16_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __write_only image2d_t output \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src0_type src3; \\\n\ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \\\n\ + vxc_half8 src0, src1, src2, src4, src5, src6; \\\n\ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, v0, 16); \\\n\ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, v1, 16); \\\n\ + VXC_ReadImage(src3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 r; \\\n\ + VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + r = act_func(r); \\\n\ + float4 h_tm; \\\n\ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + h_tm = h_tm * hstate_in_scale + hstate_in_tail; 
\\\n\ + float4 result = r * h_tm; \\\n\ + half4 dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, result); \\\n\ + vxc_half8 dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + vxc_short4 dst; \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GRUCELL_QNT_F16TO_F16(U8, SIGMOID, sigmoid_func, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_F16(I8, SIGMOID, sigmoid_func, vxc_char8)\n\ +GRUCELL_QNT_F16TO_F16(I16, SIGMOID, sigmoid_func, vxc_short8)\n\ +"; /* end of grucell_h_times_activation_r_vx*/ + +static const char grucell_reset_after_activation_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define logE (1.44269502f)\n\ +#define twoLogE (logE * 2.0f)\n\ +\n\ +float4 sigmoid_func(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1.0f + exp2(x);\n\ + return 1.0f / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tanh_func(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1.0f / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniF16PlusF16_0_4x4;\n\ +_viv_uniform VXC_512Bits uniF16PlusF16_1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertF16_0_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertF16_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +#define GRUCELL_F16_F16TOF16(act_name, act_func) \\\n\ +__kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \\\n\ + vxc_half8 src0, src1, src2, src3, src4, src5, src6; \\\n\ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, v0, 16); \\\n\ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, v1, 16); \\\n\ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, v2, 16); \\\n\ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, v4, 16); \\\n\ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src5, v5, 16); \\\n\ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src6, v6, 16); \\\n\ + VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, v3, 16); \\\n\ + \\\n\ + float4 r; \\\n\ + VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + r = act_func(r); \\\n\ + float4 h0, h1; \\\n\ + VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + float4 h = h0 + r * h1; \\\n\ + float4 z; \\\n\ + VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 
0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + z = act_func(z); \\\n\ + h = tanh_func(h); \\\n\ + float4 h_tm; \\\n\ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + float4 result = (1 - z) * h + z * h_tm; \\\n\ + half4 dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, result); \\\n\ + vxc_half4 dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + vxc_short4 dst; \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func)\n\ +\n\ +_viv_uniform float hstate_in_scale;\n\ +_viv_uniform float hstate_in_tail;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +#define GRUCELL_QNT_F16TO_QNT(name0, name1, act_name, act_func, src0_type, dst_type) \\\n\ +__kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src0_type src3; \\\n\ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \\\n\ + vxc_half8 src0, src1, src2, src4, src5, src6; \\\n\ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, v0, 16); \\\n\ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, v1, 16); \\\n\ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, v2, 16); \\\n\ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, v4, 16); \\\n\ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src5, v5, 16); \\\n\ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src6, v6, 16); \\\n\ + VXC_ReadImage(src3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 r; \\\n\ + VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + r = act_func(r); \\\n\ + float4 h0, h1; \\\n\ + VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + float4 h = h0 + r * h1; \\\n\ + float4 z; \\\n\ + VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + z = act_func(z); \\\n\ + h = tanh_func(h); \\\n\ + float4 h_tm; \\\n\ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + h_tm = h_tm * hstate_in_scale + hstate_in_tail; \\\n\ + float4 result = (1 - z) * h + z * h_tm; \\\n\ + result = result * output_scale + output_zp; \\\n\ + int4 dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, result); \\\n\ + 
dst_type dst; \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8)\n\ +GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8)\n\ +"; /* end of grucell_reset_after_activation_vx*/ + static const char hswish_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float inputScale;\n\ @@ -11305,9 +12417,9 @@ L2NORMSCALE_AXIS0_2D(I8, F16, I8, char, vxc_char8, vxc_char8, r_inputScale, __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\ void l2normalizescale_axis0_U8_##in1_name##to##out_name##_2D \\\n\ (\\\n\ - __read_only image2d_array_t input,\\\n\ - __read_only image2d_array_t scale,\\\n\ - __write_only image2d_array_t output,\\\n\ + __read_only image2d_t input,\\\n\ + __read_only image2d_t scale,\\\n\ + __write_only image2d_t output,\\\n\ int axis\\\n\ )\\\n\ { \\\n\ @@ -28714,6 +29826,77 @@ __kernel void pre_process_gray_scale_U8to##dst_type_name \\\n\ PRE_PROCESS_GRAY_SCALE_8BITS(U8, vxc_uchar16)\n\ PRE_PROCESS_GRAY_SCALE_8BITS(I8, vxc_char16)"; /* end of pre_process_gray_vx*/ +static const char pre_process_gray_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4;\n\ +\n\ +__kernel void pre_process_gray_4over3_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float mean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_uchar16 src0, src1, src2, src3;\n\ +\n\ + VXC_ReadImage(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.xy = (coord_in.xy >> 2) * 3;\n\ + coord_in.zw = coord_in.yy + (int2)(1, 2);\n\ +\n\ + vxc_uchar16 dst0, dst1, dst2;\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, 
VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ +\n\ + VXC_WriteImage(output, coord_in.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_in.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_in.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pre_process_gray_half_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float mean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_uchar16 src0;\n\ +\n\ + VXC_ReadImage(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.xy = coord_in.xy >> 1;\n\ +\n\ + VXC_WriteImage(output, coord_in.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of pre_process_gray_2_vx*/ + static const char pre_process_gray_copy_vx[] = "/*\n\ ============================================================================\n\ Name : GrayScale.vx\n\ @@ -37915,6 +39098,211 @@ __kernel void resize_bilinear_U8toU8_UP_opt\n\ \n\ #endif"; /* end of resize_bilinear_U8_opt_vx*/ +static const char resize_bilinear_nhwc_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniResize_x2_nhwc2_0_4x8;\n\ +_viv_uniform VXC_512Bits uniResize_x2_nhwc2_1_4x8;\n\ +_viv_uniform int out_height;\n\ +\n\ +__kernel void resize_bilinear_nhwc_U8toU8_2x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), 0);\n\ + coord_in.x = ((coord_out.x * 2 - 1) >> 2) - 1;\n\ + coord_in.y = ((coord_out.y * 2 - 1) >> 2);\n\ + coord_in.x = coord_out.x == 0 ? -2 : coord_in.x;\n\ + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y;\n\ +\n\ + vxc_uchar16 in0, in1, in2, in3, result;\n\ +\n\ + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in3, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8);\n\ + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_1_4x8);\n\ + VXC_WriteImage(output, coord_out, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(result, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8);\n\ + VXC_DP4x8(result, in2, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_1_4x8);\n\ + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(result, in1, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8);\n\ + VXC_DP4x8(result, in1, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_1_4x8);\n\ + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(result, in3, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8);\n\ + VXC_DP4x8(result, in3, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_1_4x8);\n\ + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l00_2x8;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l01_2x8;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l02_2x8;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l03_2x8;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l10_4x4;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l11_4x4;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l12_4x4;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l13_4x4;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l14_4x4;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l15_4x4;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l16_4x4;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l17_4x4;\n\ +__kernel void resize_bilinear_nhwc_U8toU8_3x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + coord_in.x = (short)(coord_out.x - 1) / (short)6 * 2;\n\ + coord_in.x = coord_out.x == 0 ? -2 : coord_in.x;\n\ + coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6;\n\ + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y;\n\ +\n\ + vxc_uchar16 in0, in1, in2, in3, dst0, dst1, dst2;\n\ +\n\ + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in3, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.zw = coord_out.xy + (int2)(16, 1);\n\ +\n\ + VXC_DP4x4(dst0, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4);\n\ + VXC_DP4x4(dst0, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l11_4x4);\n\ + VXC_DP4x4(dst0, in1, in0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l12_4x4);\n\ + VXC_DP4x4(dst0, in1, in0, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l13_4x4);\n\ + VXC_DP4x4(dst1, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l14_4x4);\n\ + VXC_DP4x4(dst1, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l15_4x4);\n\ + VXC_DP4x4(dst1, in1, in0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l16_4x4);\n\ + VXC_DP4x4(dst1, in1, in0, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l17_4x4);\n\ +\n\ + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.zy, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst0, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l00_2x8);\n\ + VXC_DP2x8(dst0, in1, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l01_2x8);\n\ + VXC_DP2x8(dst1, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l02_2x8);\n\ + VXC_DP2x8(dst1, in1, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l03_2x8);\n\ + VXC_WriteImage(output, coord_out.xw, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.zw, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.yw += 2;\n\ +\n\ + VXC_DP4x4(dst0, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4);\n\ + VXC_DP4x4(dst0, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l11_4x4);\n\ + VXC_DP4x4(dst0, in1, in2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l12_4x4);\n\ + VXC_DP4x4(dst0, in1, in2, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l13_4x4);\n\ + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l14_4x4);\n\ + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l15_4x4);\n\ + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l16_4x4);\n\ + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l17_4x4);\n\ + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.zy, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(dst0, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4);\n\ + VXC_DP4x4(dst0, in2, in1, VXC_MODIFIER(4, 7, 0, 
VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l11_4x4);\n\ + VXC_DP4x4(dst0, in2, in1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l12_4x4);\n\ + VXC_DP4x4(dst0, in2, in1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l13_4x4);\n\ + VXC_DP4x4(dst1, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l14_4x4);\n\ + VXC_DP4x4(dst1, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l15_4x4);\n\ + VXC_DP4x4(dst1, in2, in1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l16_4x4);\n\ + VXC_DP4x4(dst1, in2, in1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l17_4x4);\n\ + VXC_WriteImage(output, coord_out.xw, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.zw, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.yw += 2;\n\ +\n\ + VXC_DP2x8(dst0, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l00_2x8);\n\ + VXC_DP2x8(dst0, in2, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l01_2x8);\n\ + VXC_DP2x8(dst1, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l02_2x8);\n\ + VXC_DP2x8(dst1, in2, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l03_2x8);\n\ + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.zy, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(dst0, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4);\n\ + VXC_DP4x4(dst0, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l11_4x4);\n\ + VXC_DP4x4(dst0, in2, in3, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l12_4x4);\n\ + VXC_DP4x4(dst0, in2, in3, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l13_4x4);\n\ + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l14_4x4);\n\ + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l15_4x4);\n\ + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l16_4x4);\n\ + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l17_4x4);\n\ + VXC_WriteImage(output, coord_out.xw, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.zw, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l00_4x8;\n\ +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l01_4x8;\n\ +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l10_4x8;\n\ +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l11_4x8;\n\ +__kernel void resize_bilinear_nhwc_U8toU8_4x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), -1, 0, 0);\n\ + coord_in.x = ((coord_out.x - 3) >> 3) * 2;\n\ + coord_in.y = (coord_out.y * 2 - 3) >> 3;\n\ + coord_in.x = coord_out.x == 0 ? -2 : coord_in.x;\n\ + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y;\n\ +\n\ + vxc_uchar16 in0, in1, in2, in3, dst0, dst1;\n\ +\n\ + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in3, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.zw = coord_out.yy + (int2)(1, 2);\n\ +\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8);\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l01_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l11_4x8);\n\ + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.yz = coord_out.yz + (int2)(3, 3);\n\ +\n\ + VXC_DP4x8(dst0, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8);\n\ + VXC_DP4x8(dst0, in2, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l11_4x8);\n\ + VXC_DP4x8(dst1, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8);\n\ + VXC_DP4x8(dst1, in2, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l01_4x8);\n\ + VXC_WriteImage(output, coord_out.xw, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xy, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.yw = coord_out.yw + (int2)(3, 3);\n\ +\n\ + VXC_DP4x8(dst0, in1, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8);\n\ + VXC_DP4x8(dst0, in1, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l01_4x8);\n\ + VXC_DP4x8(dst1, in1, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8);\n\ + VXC_DP4x8(dst1, in1, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l11_4x8);\n\ + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.zw = coord_out.zw + (int2)(3, 3);\n\ +\n\ + VXC_DP4x8(dst0, in3, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8);\n\ + VXC_DP4x8(dst0, in3, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l11_4x8);\n\ + VXC_DP4x8(dst1, in3, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8);\n\ + VXC_DP4x8(dst1, in3, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l01_4x8);\n\ + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}"; /* end of resize_bilinear_nhwc_vx*/ + static const char resize_nearest_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniMultiplyAndPostShift_2x8;\n\ @@ -38502,6 +39890,63 @@ __kernel void scatter_nd_update_F16F16toF16(\n\ VXC_WriteImage(output, coord, tmpVal, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ +__kernel void scatter_nd_update_F16F16toU8(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __read_only image2d_t input2,\n\ + image2d_array_t output,\n\ + int width,\n\ + int area,\n\ + int vol,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int cnt = 0;\n\ +\n\ + vxc_short8 tmpVal = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_half8 sum;\n\ + _viv_asm(COPY, sum, tmpVal, 16);\n\ + Image img1 = create_image_from_image2d(input1, 4);\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + //int4 indice = read_imagei(input1, (int2)(0, i));\n\ + int4 indice = vload4(0, index_ptr + offset_idx);\n\ + index_ptr += coord_dim;\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW;\n\ + if(gidy == idx)\n\ + {\n\ + vxc_half8 src;\n\ + VXC_ReadImage(tmpVal, input2, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + cnt++;\n\ + _viv_asm(COPY, src, tmpVal, 16);\n\ + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8);\n\ + }\n\ + }\n\ + int2 coord = (int2)(gidx, gidy);\n\ + vxc_ushort8 ms0;\n\ + vxc_uchar8 dst;\n\ + if(cnt == 0)\n\ + {\n\ + vxc_half8 src;\n\ + VXC_ReadImage(tmpVal, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, src, tmpVal, 16);\n\ + VXC_DP2x8(dst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniU8MulAndPostShift_0_Lo_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + else\n\ + {\n\ + _viv_asm(COPY, ms0, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniU8MulAndPostShift_1_Lo_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ #define SCATTER_ND_UPDATE_QINT(src0_type_name, src2_type_name, out_type_name, data_type) \\\n\ __kernel void scatter_nd_update_##src0_type_name##src2_type_name##to##out_type_name##( \\\n\ __read_only image2d_t input0, \\\n\ @@ -38878,6 +40323,11 @@ _viv_uniform int offsetZ;\n\ _viv_uniform int offsetW;\n\ _viv_uniform int offset_idx;\n\ \n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_1_Lo_2x8;\n\ +_viv_uniform int2 multAndoutZP0;\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ +\n\ __kernel void scatter_nd_update_F16F16toF16_big(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ @@ -38929,6 +40379,70 @@ __kernel void scatter_nd_update_F16F16toF16_big(\n\ }\n\ output_ptr[loc] = dst;\n\ }\n\ +\n\ +__kernel void scatter_nd_update_F16F16toU8_big(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __read_only image2d_t input2,\n\ + image2d_t output,\n\ + int width,\n\ + int area,\n\ + int vol,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int cnt = 0;\n\ +\n\ + vxc_short8 tmpVal = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_half8 sum;\n\ + _viv_asm(COPY, sum, tmpVal, 16);\n\ + Image img1 = create_image_from_image2d(input1, 4);\n\ + Image img2 = create_image_from_image2d(input2, 2);\n\ + Image img3 = create_image_from_image2d(output, 1);\n\ +\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + __global short* 
update_ptr = (__global short*)img2.ptr;\n\ + __global uchar* output_ptr = (__global uchar*)img3.ptr;\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + int4 indice = vload4(0, index_ptr + offset_idx);\n\ + index_ptr += coord_dim;\n\ +\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW;\n\ + if(gidy == idx)\n\ + {\n\ + vxc_half8 src;\n\ + short tmpData = update_ptr[i * update_width + gidx];\n\ + cnt++;\n\ + _viv_asm(COPY, src, tmpData, 4);\n\ + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8);\n\ + }\n\ + }\n\ + short dst;\n\ + vxc_ushort8 ms0;\n\ + int loc = gidy * output_width+ gidx;\n\ + if(cnt == 0)\n\ + {\n\ + vxc_half8 src;\n\ + Image img0 = create_image_from_image2d(input0, 2);\n\ + __global short* ref_ptr = (__global short*)img0.ptr;\n\ + short tmpData = ref_ptr[loc];\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, src, tmpData, 4);\n\ + VXC_DP2x8(dst, src, ms0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1),\n\ + uniU8MulAndPostShift_0_Lo_2x8);\n\ + output_ptr[loc] = dst;\n\ + }\n\ + else\n\ + {\n\ + _viv_asm(COPY, ms0, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1),\n\ + uniU8MulAndPostShift_1_Lo_2x8);\n\ + output_ptr[loc] = dst;\n\ + }\n\ +}\n\ "; /* end of scatter_nd_update_big_vx*/ static const char select_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -39898,11 +41412,15 @@ __kernel void tile_remain##name2##_##name0##to##name1( \\\n\ { \\\n\ coord_out.x = coord.x + x * width; \\\n\ if (isLastItem) \\\n\ + { \\\n\ VXC_WriteImage2DArray(output, coord_out, src, \\\n\ VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ else \\\n\ + { \\\n\ VXC_WriteImage2DArray(output, coord_out, src, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ } \\\n\ } \\\n\ } \\\n\ @@ -39955,9 +41473,13 @@ __kernel void tile_remain##name2##_##name0##to##name1##_2D( \\\n\ do \\\n\ { \\\n\ if (isLastItem) \\\n\ + { \\\n\ VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ else \\\n\ + { \\\n\ VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ coord.x += width; \\\n\ } while (coord.x < output_width); \\\n\ coord.x = get_global_id(0); \\\n\ @@ -40017,9 +41539,6 @@ __kernel void tile_1toN_##name0##to##name1##_2D( \\\n\ }\n\ TILE_2D_1TON(U8, U8, vxc_uchar8)\n\ TILE_2D_1TON(I16, I16, vxc_short8)\n\ -\n\ -\n\ -\n\ "; /* end of tile_vx*/ static const char tile_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -40081,11 +41600,15 @@ __kernel void tile_remain##name2##_##name0##to##name1( \\\n\ { \\\n\ coord_out.x = coord.x + x * width; \\\n\ if (isLastItem) \\\n\ + { \\\n\ VXC_WriteImage2DArray(output, coord_out, dst, \\\n\ VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ else \\\n\ + { \\\n\ VXC_WriteImage2DArray(output, coord_out, dst, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ } \\\n\ } \\\n\ } \\\n\ @@ -40138,9 +41661,13 @@ __kernel void tile_remain##name2##_##name0##to##name1##_2D( \\\n\ do \\\n\ { \\\n\ if (isLastItem) \\\n\ + { \\\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ else \\\n\ + { \\\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ coord.x += width; \\\n\ } while (coord.x < output_width); \\\n\ coord.x = get_global_id(0); \\\n\ @@ -41444,13 +42971,13 @@ inline Tensor 
create_tensor_from_image2d_array(image2d_array_t input, int stride {\n\ int8 desc;\n\ int2 strides;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ #if (USE_40BITS_VA==0)\n\ strides.x = desc.s1;\n\ strides.y = desc.s4;\n\ #else\n\ _viv_asm(GET_IMAGE_STRIDE, strides, input);\n\ #endif\n\ - _viv_asm(COPY, desc, input, sizeof(desc));\n\ uint address = as_uint(desc.s0);\n\ \n\ Tensor t =\n\ @@ -43341,13 +44868,14 @@ inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride {\n\ int8 desc;\n\ int2 strides;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ +\n\ #if (USE_40BITS_VA==0)\n\ strides.x = desc.s1;\n\ strides.y = desc.s4;\n\ #else\n\ _viv_asm(GET_IMAGE_STRIDE, strides, input);\n\ #endif\n\ - _viv_asm(COPY, desc, input, sizeof(desc));\n\ uint address = as_uint(desc.s0);\n\ \n\ Tensor t =\n\ @@ -43386,15 +44914,14 @@ inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride } while(0)\n\ "; /* end of eltwise_ops_helper_cl*/ -static const char eltwise_unary_cl[] = "\n\ -float eltwise_unary_sin(float x, float alpha)\n\ +static const char eltwise_unary_cl[] = "float eltwise_unary_sin(float x, float alpha, float beta)\n\ {\n\ return native_sin(x);\n\ }\n\ \n\ #define logE (1.44269502f)\n\ #define twoLogE (logE * 2.0f)\n\ -float eltwise_unary_exp(float x, float alpha)\n\ +float eltwise_unary_exp(float x, float alpha, float beta)\n\ {\n\ x *= logE;\n\ x = exp2(x);\n\ @@ -43402,13 +44929,13 @@ float eltwise_unary_exp(float x, float alpha)\n\ }\n\ \n\ #define rlogE (0.693147182f)\n\ -float eltwise_unary_log(float x, float alpha)\n\ +float eltwise_unary_log(float x, float alpha, float beta)\n\ {\n\ x = log2(x);\n\ return x * rlogE;\n\ }\n\ \n\ -float eltwise_unary_elu(float val, float alpha)\n\ +float eltwise_unary_elu(float val, float alpha, float beta)\n\ {\n\ float x = val * logE;\n\ x = exp2(x) * alpha - alpha;\n\ @@ -43416,14 +44943,14 @@ float eltwise_unary_elu(float val, float alpha)\n\ return val < 0 ? 
x : val;\n\ }\n\ \n\ -float eltwise_unary_neg(float x, float alpha)\n\ +float eltwise_unary_neg(float x, float alpha, float beta)\n\ {\n\ return x * -1;\n\ }\n\ \n\ -float eltwise_unary_hard_sigmoid(float x, float alpha)\n\ +float eltwise_unary_hard_sigmoid(float x, float alpha, float beta)\n\ {\n\ - x = 0.2 * x + 0.5;\n\ + x = alpha * x + beta;\n\ x = clamp(x, 0, 1);\n\ return x;\n\ }\n\ @@ -43445,14 +44972,14 @@ float _tanh(float x, float alpha)\n\ return (2 * x - 1);\n\ }\n\ \n\ -float eltwise_unary_mish(float x, float alpha)\n\ +float eltwise_unary_mish(float x, float alpha, float beta)\n\ {\n\ float y = _softrelu(x, alpha);\n\ x = x * _tanh(y, alpha);\n\ return x;\n\ }\n\ \n\ -float eltwise_unary_round(float x, float alpha)\n\ +float eltwise_unary_round(float x, float alpha, float beta)\n\ {\n\ return convert_float(convert_int_rte(x));\n\ }\n\ @@ -43486,7 +45013,7 @@ float erf_eval(float x)\n\ return res * MUL2_RSQRTPI;\n\ }\n\ #define RSQRT2 (0.70710678118654752440084436210485f)\n\ -float eltwise_unary_gelu(float x, float alpha)\n\ +float eltwise_unary_gelu(float x, float alpha, float beta)\n\ {\n\ x = 0.5f * x * (1 + erf_eval(x * RSQRT2));\n\ \n\ @@ -43494,7 +45021,7 @@ float eltwise_unary_gelu(float x, float alpha)\n\ }\n\ \n\ #define SQRT_2_RCP_PI 0.7978845834732056f\n\ -float eltwise_unary_hard_gelu(float x, float alpha)\n\ +float eltwise_unary_hard_gelu(float x, float alpha, float beta)\n\ {\n\ float cdf = 0.5f + 0.5f * _tanh(SQRT_2_RCP_PI *\n\ (x + 0.044715f * x * x * x), 0);\n\ @@ -43510,7 +45037,8 @@ __kernel void func_name##_F32toF32 \\\n\ float inputTail, \\\n\ float outputScale, \\\n\ float outputZP, \\\n\ - float alpha \\\n\ + float alpha, \\\n\ + float beta \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ @@ -43518,7 +45046,7 @@ __kernel void func_name##_F32toF32 \\\n\ float4 src = read_imagef(input, coord); \\\n\ \\\n\ float4 dst = 0; \\\n\ - dst.x = eltwise_unary_##func_name(src.x, alpha); \\\n\ + dst.x = eltwise_unary_##func_name(src.x, alpha, beta); \\\n\ \\\n\ write_imagef(output, coord, dst.xxxx); \\\n\ }\n\ @@ -43542,7 +45070,8 @@ __kernel void func_name##_F32toF32_2D \\\n\ float inputTail, \\\n\ float outputScale, \\\n\ float outputZP, \\\n\ - float alpha \\\n\ + float alpha, \\\n\ + float beta \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ @@ -43550,7 +45079,7 @@ __kernel void func_name##_F32toF32_2D \\\n\ float4 src = read_imagef(input, coord); \\\n\ \\\n\ float4 dst = 0; \\\n\ - dst.x = eltwise_unary_##func_name(src.x, alpha); \\\n\ + dst.x = eltwise_unary_##func_name(src.x, alpha, beta); \\\n\ \\\n\ write_imagef(output, coord, dst.xxxx); \\\n\ }\n\ @@ -43574,7 +45103,8 @@ __kernel void func_name##_U8toU8 \\\n\ float inputTail, \\\n\ float outputScale, \\\n\ float outputZP, \\\n\ - float alpha \\\n\ + float alpha, \\\n\ + float beta \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ @@ -43582,7 +45112,7 @@ __kernel void func_name##_U8toU8 \\\n\ uint4 src = read_imageui(input, coord); \\\n\ float4 data = convert_float4(src) * inputScale - inputTail; \\\n\ \\\n\ - data.x = eltwise_unary_##func_name(data.x, alpha); \\\n\ + data.x = eltwise_unary_##func_name(data.x, alpha, beta); \\\n\ uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ \\\n\ write_imageui(output, coord, dst); \\\n\ @@ -43607,7 +45137,8 @@ __kernel void func_name##_U8toU8_2D \\\n\ float inputTail, \\\n\ float outputScale, \\\n\ float 
outputZP, \\\n\ - float alpha \\\n\ + float alpha, \\\n\ + float beta \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ @@ -43615,7 +45146,7 @@ __kernel void func_name##_U8toU8_2D \\\n\ uint4 src = read_imageui(input, coord); \\\n\ float4 data = convert_float4(src) * inputScale - inputTail; \\\n\ \\\n\ - data.x = eltwise_unary_##func_name(data.x, alpha); \\\n\ + data.x = eltwise_unary_##func_name(data.x, alpha, beta); \\\n\ uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ \\\n\ write_imageui(output, coord, dst); \\\n\ @@ -43639,7 +45170,8 @@ __kernel void neg_I32toI32\n\ float inputTail,\n\ float outputScale,\n\ float outputZP,\n\ - float alpha\n\ + float alpha,\n\ + float beta\n\ )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ @@ -43658,7 +45190,8 @@ __kernel void neg_I32toI32_2D\n\ float inputTail,\n\ float outputScale,\n\ float outputZP,\n\ - float alpha\n\ + float alpha,\n\ + float beta\n\ )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ @@ -43854,7 +45387,10 @@ __kernel void floordiv_I32I32toU8(\n\ int4 src1;\n\ READ_IMAGEI_2DARRAY(src0, input, coord);\n\ READ_IMAGEI_2DARRAY(src1, input1, coord);\n\ - uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail);\n\ + float4 in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + float4 in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + float4 out = floor(in0 / in1) * outputScale + outputTail;\n\ + uint4 dst = convert_uint4(out);\n\ write_imageui(output, coord, dst);\n\ }\n\ \n\ @@ -43872,7 +45408,10 @@ __kernel void floordiv_I32I32toU8_2D(\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ int4 src0 = read_imagei(input, coord);\n\ int4 src1 = read_imagei(input1, coord);\n\ - uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail);\n\ + float4 in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + float4 in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + float4 out = floor(in0 / in1) * outputScale + outputTail;\n\ + uint4 dst = convert_uint4(out);\n\ write_imageui(output, coord, dst);\n\ }\n\ \n\ @@ -45154,6 +46693,363 @@ static const char grucell_activation_sma_cl[] = "__kernel void grucell_activatio }\n\ "; /* end of grucell_activation_sma_cl*/ +static const char grucell_activation_z_h_cl[] = "#define logE (1.44269502f)\n\ +#define twoLogE (logE * 2.0f)\n\ +\n\ +float sigmoid(float x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float hard_sigmoid(float x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float tanh_func(float x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define GRUCELL_ACTIVATION_U8_F32_U8(act_name, act_func) \\\n\ +__kernel void grucell_activation_z_h_U8_F32toU8_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out, \\\n\ + float input_scale, float input_tail, float output_scale, float output_zp) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 h_tm = 
convert_float4(read_imageui(hstate_in, coord_in.xy)); \\\n\ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \\\n\ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \\\n\ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \\\n\ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \\\n\ + \\\n\ + h_tm = h_tm * input_scale + input_tail; \\\n\ + float4 h = h0 + h1; \\\n\ + float4 z = z0 + z1; \\\n\ + z.x = act_func(z.x); \\\n\ + h = tanh_func(h.x); \\\n\ + float4 dst = (1 - z ) * h + z * h_tm; \\\n\ + dst = dst * output_scale + output_zp; \\\n\ + uint4 result = convert_uint4_sat_rte(dst); \\\n\ + write_imageui(output, coord_in.xy, result); \\\n\ + write_imageui(hstate_out, coord_in.xy, result); \\\n\ +}\n\ +GRUCELL_ACTIVATION_U8_F32_U8(SIGMOID, sigmoid)\n\ +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +#define GRUCELL_ACTIVATION_F32_F32_F32(act_name, act_func) \\\n\ +__kernel void grucell_activation_z_h_F32_F32toF32_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out, \\\n\ + float input_scale, float input_tail, float output_scale, float output_zp) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \\\n\ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \\\n\ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \\\n\ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \\\n\ + float4 h_tm = read_imagef(hstate_in, coord_in.xy); \\\n\ + \\\n\ + float4 h = h0 + h1; \\\n\ + float4 z = z0 + z1; \\\n\ + z.x = act_func(z.x); \\\n\ + h = tanh_func(h.x); \\\n\ + float4 dst = (1 - z ) * h + z * h_tm; \\\n\ + write_imagef(output, coord_in.xy, dst); \\\n\ + write_imagef(hstate_out, coord_in.xy, dst); \\\n\ +}\n\ +\n\ +GRUCELL_ACTIVATION_F32_F32_F32(SIGMOID, sigmoid)\n\ +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +#define GRUCELL_ACTIVATION_I32_F32_I32(act_name, act_func) \\\n\ +__kernel void grucell_activation_z_h_I32_F32toI32_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out, \\\n\ + float input_scale, float input_tail, float output_scale, float output_zp) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 h_tm = convert_float4(read_imagei(hstate_in, coord_in.xy)); \\\n\ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \\\n\ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \\\n\ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \\\n\ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \\\n\ + \\\n\ + h_tm = h_tm * input_scale + input_tail; \\\n\ + float4 h = h0 + h1; \\\n\ + float4 z = z0 + z1; \\\n\ + z.x = act_func(z.x); \\\n\ + h = tanh_func(h.x); \\\n\ + float4 dst = (1 - z ) * h + z * h_tm; \\\n\ + dst = dst * output_scale + output_zp; \\\n\ + int4 result = 
convert_int4_sat_rte(dst); \\\n\ + write_imagei(output, coord_in.xy, result); \\\n\ + write_imagei(hstate_out, coord_in.xy, result); \\\n\ +}\n\ +GRUCELL_ACTIVATION_I32_F32_I32(SIGMOID, sigmoid)\n\ +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)"; /* end of grucell_activation_z_h_cl*/ + +static const char grucell_h_times_activation_r_cl[] = "#define logE (1.44269502f)\n\ +#define twoLogE (logE * 2.0f)\n\ +\n\ +float sigmoid(float x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float hard_sigmoid(float x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +\n\ +#define GRUCELL_H_TIMES_R_U8_F32_F32(act_name, act_func) \\\n\ +__kernel void grucell_h_times_activation_r_U8_F32toF32_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __write_only image2d_t output, \\\n\ + float input_scale, float input_tail) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \\\n\ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \\\n\ + float4 h_tm = convert_float4(read_imageui(hstate_in, coord_in.xy)); \\\n\ + \\\n\ + float4 r = r0 + r1; \\\n\ + r.x = act_func(r.x); \\\n\ + h_tm = h_tm * input_scale + input_tail; \\\n\ + float4 r_times_h = r * h_tm; \\\n\ + write_imagef(output, coord_in.xy, r_times_h); \\\n\ +}\n\ +GRUCELL_H_TIMES_R_U8_F32_F32(SIGMOID, sigmoid)\n\ +//GRUCELL_H_TIMES_R_U8_F32_F32(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +#define GRUCELL_H_TIMES_R_F32_F32_F32(act_name, act_func) \\\n\ +__kernel void grucell_h_times_activation_r_F32_F32toF32_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __write_only image2d_t output, \\\n\ + float input_scale, float input_tail) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \\\n\ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \\\n\ + float4 h_tm = read_imagef(hstate_in, coord_in.xy); \\\n\ + \\\n\ + float4 r = r0 + r1; \\\n\ + r.x = act_func(r.x); \\\n\ + float4 r_times_h = r * h_tm; \\\n\ + write_imagef(output, coord_in.xy, r_times_h); \\\n\ +}\n\ +\n\ +GRUCELL_H_TIMES_R_F32_F32_F32(SIGMOID, sigmoid)\n\ +//GRUCELL_H_TIMES_R_F32_F32_F32(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +#define GRUCELL_H_TIMES_R_I32_F32_F32(act_name, act_func) \\\n\ +__kernel void grucell_h_times_activation_r_I32_F32toI32_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __write_only image2d_t output, \\\n\ + float input_scale, float input_tail) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \\\n\ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \\\n\ + float4 h_tm = convert_float4(read_imagei(hstate_in, coord_in.xy)); \\\n\ + \\\n\ + float4 r = r0 + r1; \\\n\ + r.x = act_func(r.x); \\\n\ + h_tm = h_tm * input_scale + input_tail; \\\n\ + float4 
r_times_h = r * h_tm; \\\n\ + write_imagef(output, coord_in.xy, r_times_h); \\\n\ +}\n\ +GRUCELL_H_TIMES_R_I32_F32_F32(SIGMOID, sigmoid)\n\ +//GRUCELL_H_TIMES_R_I32_F32_F32(HARD_SIGMOID, hard_sigmoid)"; /* end of grucell_h_times_activation_r_cl*/ + +static const char grucell_reset_after_activation_cl[] = "#define logE (1.44269502f)\n\ +#define twoLogE (logE * 2.0f)\n\ +\n\ +float sigmoid(float x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float hard_sigmoid(float x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float tanh_func(float x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define GRUCELL_ACTIVATION_U8_F32_U8(act_name, act_func) \\\n\ +__kernel void grucell_reset_after_activation_U8_F32toU8_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out, \\\n\ + float input_scale, float input_tail, float output_scale, float output_zp) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \\\n\ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \\\n\ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \\\n\ + float4 h_tm = convert_float4(read_imageui(hstate_in, coord_in.xy)); \\\n\ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \\\n\ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \\\n\ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \\\n\ + \\\n\ + float4 r = r0 + r1; \\\n\ + r.x = act_func(r.x); \\\n\ + h_tm = h_tm * input_scale + input_tail; \\\n\ + float4 r_times_h = r * h1; \\\n\ + float4 h = h0 + r_times_h; \\\n\ + float4 z = z0 + z1; \\\n\ + z.x = act_func(z.x); \\\n\ + h = tanh_func(h.x); \\\n\ + float4 dst = (1 - z ) * h + z * h_tm; \\\n\ + dst = dst * output_scale + output_zp; \\\n\ + uint4 result = convert_uint4_sat_rte(dst); \\\n\ + write_imageui(output, coord_in.xy, result); \\\n\ + write_imageui(hstate_out, coord_in.xy, result); \\\n\ +}\n\ +GRUCELL_ACTIVATION_U8_F32_U8(SIGMOID, sigmoid)\n\ +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +#define GRUCELL_ACTIVATION_F32_F32_F32(act_name, act_func) \\\n\ +__kernel void grucell_reset_after_activation_F32_F32toF32_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out, \\\n\ + float input_scale, float input_tail, float output_scale, float output_zp) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \\\n\ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \\\n\ + float4 h1 = read_imagef(hstate_h_conv, 
coord_in.xy); \\\n\ + float4 h_tm = read_imagef(hstate_in, coord_in.xy); \\\n\ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \\\n\ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \\\n\ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \\\n\ + \\\n\ + float4 r = r0 + r1; \\\n\ + r.x = act_func(r.x); \\\n\ + float4 r_times_h = r * h1; \\\n\ + float4 h = h0 + r_times_h; \\\n\ + float4 z = z0 + z1; \\\n\ + z.x = act_func(z.x); \\\n\ + h = tanh_func(h.x); \\\n\ + float4 dst = (1 - z ) * h + z * h_tm; \\\n\ + write_imagef(output, coord_in.xy, dst); \\\n\ + write_imagef(hstate_out, coord_in.xy, dst); \\\n\ +}\n\ +\n\ +GRUCELL_ACTIVATION_F32_F32_F32(SIGMOID, sigmoid)\n\ +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +#define GRUCELL_ACTIVATION_I32_F32_I32(act_name, act_func) \\\n\ +__kernel void grucell_reset_after_activation_I32_F32toI32_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out, \\\n\ + float input_scale, float input_tail, float output_scale, float output_zp) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \\\n\ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \\\n\ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \\\n\ + float4 h_tm = convert_float4(read_imagei(hstate_in, coord_in.xy)); \\\n\ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \\\n\ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \\\n\ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \\\n\ + \\\n\ + float4 r = r0 + r1; \\\n\ + r.x = act_func(r.x); \\\n\ + h_tm = h_tm * input_scale + input_tail; \\\n\ + float4 r_times_h = r * h1; \\\n\ + float4 h = h0 + r_times_h; \\\n\ + float4 z = z0 + z1; \\\n\ + z.x = act_func(z.x); \\\n\ + h = tanh_func(h.x); \\\n\ + float4 dst = (1 - z ) * h + z * h_tm; \\\n\ + dst = dst * output_scale + output_zp; \\\n\ + int4 result = convert_int4_sat_rte(dst); \\\n\ + write_imagei(output, coord_in.xy, result); \\\n\ + write_imagei(hstate_out, coord_in.xy, result); \\\n\ +}\n\ +GRUCELL_ACTIVATION_I32_F32_I32(SIGMOID, sigmoid)\n\ +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)"; /* end of grucell_reset_after_activation_cl*/ + static const char hswish_cl[] = "#define HSWISH_F32_F32_PROCESS() \\\n\ float4 src, tmp, dst; \\\n\ src = read_imagef(input, coord); \\\n\ @@ -55827,14 +57723,20 @@ static const source_map_t evis_resource[] = {"gather_nd_mix_vx", gather_nd_mix_vx}, {"get_matrix_vx", get_matrix_vx}, {"group_normalization_f16_vx", group_normalization_f16_vx}, + {"group_normalization_f16_scale_vx", group_normalization_f16_scale_vx}, {"group_normalization_i16_vx", group_normalization_i16_vx}, + {"group_normalization_i16_scale_vx", group_normalization_i16_scale_vx}, {"group_normalization_i8_vx", group_normalization_i8_vx}, + {"group_normalization_i8_scale_vx", group_normalization_i8_scale_vx}, {"group_normalization_u8_vx", group_normalization_u8_vx}, {"group_normalization_u8_f16_vx", group_normalization_u8_f16_vx}, {"grucell_activation_vx", grucell_activation_vx}, 
{"grucell_activation_sma_vx", grucell_activation_sma_vx}, + {"grucell_activation_z_h_vx", grucell_activation_z_h_vx}, {"grucell_cdnn_activation_vx", grucell_cdnn_activation_vx}, {"grucell_cdnn_activation_u8_vx", grucell_cdnn_activation_u8_vx}, + {"grucell_h_times_activation_r_vx", grucell_h_times_activation_r_vx}, + {"grucell_reset_after_activation_vx", grucell_reset_after_activation_vx}, {"hswish_vx", hswish_vx}, {"instance_normalization_f16_vx", instance_normalization_f16_vx}, {"instance_normalization_i16_vx", instance_normalization_i16_vx}, @@ -55923,6 +57825,7 @@ static const source_map_t evis_resource[] = {"pow_u8_vx", pow_u8_vx}, {"pre_process_bgra_vx", pre_process_bgra_vx}, {"pre_process_gray_vx", pre_process_gray_vx}, + {"pre_process_gray_2_vx", pre_process_gray_2_vx}, {"pre_process_gray_copy_vx", pre_process_gray_copy_vx}, {"pre_process_nv12_scale_vx", pre_process_nv12_scale_vx}, {"pre_process_nv12_scale_8bits_vx", pre_process_nv12_scale_8bits_vx}, @@ -55976,6 +57879,7 @@ static const source_map_t evis_resource[] = {"resize_bilinear_U8_vx", resize_bilinear_U8_vx}, {"resize_bilinear_U8_half_pixel_centers_vx", resize_bilinear_U8_half_pixel_centers_vx}, {"resize_bilinear_U8_opt_vx", resize_bilinear_U8_opt_vx}, + {"resize_bilinear_nhwc_vx", resize_bilinear_nhwc_vx}, {"resize_nearest_vx", resize_nearest_vx}, {"scatter_nd_vx", scatter_nd_vx}, {"scatter_nd_big_vx", scatter_nd_big_vx}, @@ -56027,6 +57931,9 @@ static const source_map_t cl_resource[] = {"group_normalization_u8_cl", group_normalization_u8_cl}, {"grucell_activation_cl", grucell_activation_cl}, {"grucell_activation_sma_cl", grucell_activation_sma_cl}, + {"grucell_activation_z_h_cl", grucell_activation_z_h_cl}, + {"grucell_h_times_activation_r_cl", grucell_h_times_activation_r_cl}, + {"grucell_reset_after_activation_cl", grucell_reset_after_activation_cl}, {"hswish_cl", hswish_cl}, {"instance_normalization_f16_cl", instance_normalization_f16_cl}, {"instance_normalization_f32_cl", instance_normalization_f32_cl}, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c b/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c index d4cf2ae..1ce386a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c @@ -33,6 +33,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" static vsi_status op_compute @@ -42,48 +43,15 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - vsi_size_t input_size[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t dims = 0; - vx_tensor input = NULL, input0 = NULL; - vx_tensor output = NULL, output0 = NULL; - status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_t n; - if (inputs[0]->attr.dim_num > 4) + n = vsi_nn_kernel_selector( self->graph, "abs", inputs, 1, outputs, 1, NULL ); + if( n == NULL ) { - input_size[0] = (int32_t)vsi_nn_GetElementNum(inputs[0]) / - inputs[0]->attr.size[inputs[0]->attr.dim_num - 1]; - input_size[1] = inputs[0]->attr.size[inputs[0]->attr.dim_num - 1]; - dims= 2; -#ifdef VSI_40BIT_VA_SUPPORT - input = vxReshapeTensor(inputs[0]->t, input_size, dims); - output = vxReshapeTensor(outputs[0]->t, input_size, dims); -#else - input = vxReshapeTensor(inputs[0]->t, (vx_int32*)input_size, (vx_uint32)dims); - output = vxReshapeTensor(outputs[0]->t, (vx_int32*)input_size, (vx_uint32)dims); -#endif - input0 = input; - output0 = output; - } - else - { - input0 = inputs[0]->t; - output0 = 
outputs[0]->t; + status = VSI_FAILURE; } - self->n = vxLeakyReluLayer( - self->graph->g, - input0, - -1, - output0 - ); - - if( NULL != self->n ) - { - status = VSI_SUCCESS; - } - if (input) vxReleaseTensor(&input); - if (output) vxReleaseTensor(&output); return status; } /* op_compute() */ @@ -152,4 +120,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c index 2e67b83..70ff65e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c @@ -301,8 +301,8 @@ static vsi_status op_optimize reshape 3d input (xcn) --> 4d input (whcn) reshape 3d output(xcn) --> 4d output(whcn) */ - shape[0] = 1; - shape[1] = inputs[0]->attr.size[0]; + shape[0] = inputs[0]->attr.size[0]; + shape[1] = 1; shape[2] = inputs[0]->attr.size[1]; shape[3] = inputs[0]->attr.size[2]; dim = 4; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c index 6a43126..5d16c2b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c @@ -183,7 +183,7 @@ static vsi_status op_init vsi_status status = VSI_SUCCESS; self->nn_param.clip.local2 = (vsi_nn_clip_lcl2_data *)malloc(sizeof(vsi_nn_clip_lcl2_data)); - if (NULL == self->nn_param.reduce.local2) + if (NULL == self->nn_param.clip.local2) { return VX_ERROR_NO_MEMORY; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c index 5c0b7ad..8c216ea 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c @@ -32,6 +32,7 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" static vsi_status op_compute @@ -41,58 +42,31 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vx_tensor bias; - vsi_status status; - vx_nn_convolution_params_ext_t *p_ext = NULL; - vx_nn_convolution_params_ext2_t *p_ext2 = NULL; - vx_nn_convolution_params_ext2_t param_ext2; - memset( ¶m_ext2, 0, sizeof( vx_nn_convolution_params_ext2_t ) ); - p_ext2 = ¶m_ext2; - p_ext = &p_ext2->ext; - - status = VSI_FAILURE; - - //set ext relative parameters - p_ext->khr.padding_x = self->nn_param.conv2d.pad[0]; - p_ext->khr.padding_y = self->nn_param.conv2d.pad[2]; - if (self->nn_param.conv2d.dilation[0] > 0) - { - p_ext->khr.dilation_x = self->nn_param.conv2d.dilation[0] - 1; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "stride_w", self->nn_param.conv2d.stride[0] ); + vsi_nn_kernel_param_add_int32( param, "stride_h", self->nn_param.conv2d.stride[1] ); + vsi_nn_kernel_param_add_int32( param, "pad_h_front", self->nn_param.conv2d.pad[2] ); + vsi_nn_kernel_param_add_int32( param, "pad_h_end", self->nn_param.conv2d.pad[3] ); + vsi_nn_kernel_param_add_int32( param, "pad_w_front", self->nn_param.conv2d.pad[0] ); + vsi_nn_kernel_param_add_int32( param, "pad_w_end", self->nn_param.conv2d.pad[1] ); + vsi_nn_kernel_param_add_int32( param, "dilation_w", self->nn_param.conv2d.dilation[0] ); + vsi_nn_kernel_param_add_int32( param, "dilation_h", self->nn_param.conv2d.dilation[1] ); + vsi_nn_kernel_param_add_int32( param, "overflow_policy", self->vx_param.overflow_policy ); + vsi_nn_kernel_param_add_int32( param, 
"rounding_policy", self->vx_param.rounding_policy ); + vsi_nn_kernel_param_add_int32( param, + "down_scale_size_rounding", self->vx_param.down_scale_size_rounding ); + if (self->nn_param.conv2d.multiplier != 0) { + vsi_nn_kernel_param_add_int32( param, "multiplier", + self->nn_param.conv2d.multiplier ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "depthwise_conv2d", + inputs, 3, outputs, 1, param ); + } else { + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "conv2d", + inputs, 3, outputs, 1, param ); } - if (self->nn_param.conv2d.dilation[1] > 0) - { - p_ext->khr.dilation_y = self->nn_param.conv2d.dilation[1] - 1; - } - p_ext->khr.overflow_policy = self->vx_param.overflow_policy; - p_ext->khr.rounding_policy = self->vx_param.rounding_policy; - p_ext->khr.down_scale_size_rounding = self->vx_param.down_scale_size_rounding; - - p_ext->padding_x_right = self->nn_param.conv2d.pad[1]; - p_ext->padding_y_bottom = self->nn_param.conv2d.pad[3]; - - //set ext2 relative parameters - p_ext2->depth_multiplier = self->nn_param.conv2d.multiplier; - p_ext2->stride_x = self->nn_param.conv2d.stride[0]; - p_ext2->stride_y = self->nn_param.conv2d.stride[1]; - - if( inputs[2] == NULL ) - { - bias = NULL; - } - else - { - bias = inputs[2]->t; - } - - self->n = vxConvolutionLayer( - self->graph->g, - inputs[0]->t, - inputs[1]->t, - bias, - (vx_nn_convolution_params_t *)p_ext2, - sizeof( vx_nn_convolution_params_ext2_t ), - outputs[0]->t - ); + vsi_nn_kernel_param_release( ¶m ); if( NULL != self->n ) { @@ -306,6 +280,20 @@ static vsi_bool op_check IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) IO_TYPE(D_F32, D_BF16, D_F32, D_F32) + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I4|Q_DFP) + END_IO_TYPE_DECL(CONV2D) ret = VALIDATE_OP_IO_TYPES(CONV2D, self, inputs, self->input.num, outputs, self->output.num); if(!ret) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c index af34411..327b949 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c @@ -56,7 +56,7 @@ static vsi_nn_internal_tensor_t * reshape_cell_out output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); /* reshape cell_out [w,h,c,n] to [w,h,c,1,n] */ - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_cell_size = vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); reshape_cell_size[0] = cell_out->attr.size[0]; @@ -64,8 +64,8 @@ static vsi_nn_internal_tensor_t * reshape_cell_out reshape_cell_size[2] = cell_out->attr.size[2]; reshape_cell_size[3] = 1; reshape_cell_size[4] = cell_out->attr.size[3]; - 
curr->node->nn_param.reshape.size = reshape_cell_size; - curr->node->nn_param.reshape.dim_num = 5; + curr->node->nn_param.reshape2.size = reshape_cell_size; + curr->node->nn_param.reshape2.dim_num = 5; curr->inputs[0] = cell_out; curr->outputs[0] = output_tensor->t; @@ -90,15 +90,15 @@ static vsi_nn_internal_tensor_t * reshape_split_out output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); /* reshape [w,h,c,t,n] to [w,h,c,n] */ - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_split_size = vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); reshape_split_size[0] = split_out->attr.size[0]; reshape_split_size[1] = split_out->attr.size[1]; reshape_split_size[2] = split_out->attr.size[2]; reshape_split_size[3] = split_out->attr.size[4]; - curr->node->nn_param.reshape.size = reshape_split_size; - curr->node->nn_param.reshape.dim_num = 4; + curr->node->nn_param.reshape2.size = reshape_split_size; + curr->node->nn_param.reshape2.dim_num = 4; curr->inputs[0] = split_out; curr->outputs[0] = output_tensor->t; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c new file mode 100644 index 0000000..35bf275 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c @@ -0,0 +1,396 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _conv3d_local_data_t { + int32_t placeholder; +} conv3d_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + param = vsi_nn_kernel_param_create(); + +#define MAP_PARAM(type_name, value) {\ + vsi_nn_kernel_param_add_int32( param, type_name, value); \ + } + + MAP_PARAM("stride_w",self->nn_param.conv3d.stride[0]); + MAP_PARAM("stride_h",self->nn_param.conv3d.stride[1]); + MAP_PARAM("stride_d",self->nn_param.conv3d.stride[2]); + + MAP_PARAM("pad_left",self->nn_param.conv3d.pad[0]); + MAP_PARAM("pad_right",self->nn_param.conv3d.pad[1]); + MAP_PARAM("pad_top",self->nn_param.conv3d.pad[2]); + MAP_PARAM("pad_bottom",self->nn_param.conv3d.pad[3]); + MAP_PARAM("pad_front",self->nn_param.conv3d.pad[4]); + MAP_PARAM("pad_end",self->nn_param.conv3d.pad[5]); + + MAP_PARAM("depth_multiplier", self->nn_param.conv3d.multiplier); + MAP_PARAM("overflow_policy",self->vx_param.overflow_policy); + MAP_PARAM("rounding_policy",self->vx_param.rounding_policy); + MAP_PARAM("down_scale_size_rounding",self->vx_param.down_scale_size_rounding); + + if ( self->nn_param.conv3d.dilation[0] * + self->nn_param.conv3d.dilation[1] * + self->nn_param.conv3d.dilation[2] > 1) + { + VSILOGE("conv3d could not support dilation > 1\n"); + return status; + }else + { + MAP_PARAM("dilation_w",self->nn_param.conv3d.dilation[0]); + MAP_PARAM("dilation_h",self->nn_param.conv3d.dilation[1]); + MAP_PARAM("dilation_d",self->nn_param.conv3d.dilation[2]); + } +#undef MAP_PARAM + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "conv3d", + inputs, 3, outputs, 1, param ); + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + /* Check fl and scale*/ + ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); + + if(ret) { + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(CONV3D, 3, 1) + + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_F16) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_ASYM, D_U8|Q_ASYM) + + IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) + IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) + IO_TYPE(D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F32, D_F32) + + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I8|Q_SYM) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + + /* IO_TYPE(INPUT, WEIGHT, NULL, OUTPUT) */ + IO_TYPE(D_F32, D_F32, D_NONE, D_F32) + + IO_TYPE(D_F16, D_F16, D_NONE, D_F16) + + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_F16) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F16) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) + + 
IO_TYPE(D_BF16, D_BF16, D_NONE, D_F32) + IO_TYPE(D_BF16, D_BF16, D_NONE, D_BF16) + + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_NONE, D_I8|Q_SYM) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + + /* HW 9.0 */ + IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) + IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) + /* HW 9.0.1 */ + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F32) + 
IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_F32) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) + + END_IO_TYPE_DECL(CONV3D) + ret = VALIDATE_OP_IO_TYPES(CONV3D, self, inputs, self->input.num, outputs, self->output.num); + if(!ret) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + /* check parameters */ + if(inputs[1]->attr.size[0] * inputs[1]->attr.size[1] > 6400) { + VSILOGE("Kernel size should <= 6400."); + return FALSE; + } + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_conv3d_param *nn_param; + vsi_size_t i, pad[_cnt_of_array(self->nn_param.conv3d.pad)] = {0}; + for(i = 0; i < _cnt_of_array(self->nn_param.conv3d.pad); i++) + { + pad[i] = self->nn_param.conv3d.pad[i]; + } +#ifdef VX_CONVERT_POLICY_WRAP_ENABLE + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) + { + self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + } +#endif + + nn_param = &self->nn_param.conv3d; + + vsi_nn_compute_padding_3d( + inputs[0]->attr.size, + inputs[1]->attr.size, + (uint32_t *)self->nn_param.conv3d.stride, + (uint32_t *)self->nn_param.conv3d.dilation, + self->nn_param.conv3d.pad_type, + pad + ); + for(i = 0; i < _cnt_of_array(self->nn_param.conv3d.pad); i++) + { + self->nn_param.conv3d.pad[i] = (uint32_t)pad[i]; + } + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + inputs[1]->attr.size[0], + (vx_uint32 *)&nn_param->pad[0], + nn_param->stride[0], + nn_param->dilation[0], + VSI_NN_ROUND_FLOOR + ); + outputs[0]->attr.size[1] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[1], + inputs[1]->attr.size[1], + (vx_uint32 *)&nn_param->pad[2], + nn_param->stride[1], + nn_param->dilation[1], + VSI_NN_ROUND_FLOOR + ); + outputs[0]->attr.size[2] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[2], + inputs[1]->attr.size[2], + (vx_uint32 *)&nn_param->pad[4], + nn_param->stride[2], + nn_param->dilation[2], + VSI_NN_ROUND_FLOOR + ); + if(self->nn_param.conv3d.weights > 0) + { + outputs[0]->attr.size[3] = self->nn_param.conv3d.weights; + } + else if(self->nn_param.conv3d.multiplier > 0) + { + outputs[0]->attr.size[3] = inputs[0]->attr.size[3] * self->nn_param.conv3d.multiplier; + } + else + { + outputs[0]->attr.size[3] = inputs[1]->attr.size[4]; + } + outputs[0]->attr.size[4] = inputs[0]->attr.size[4]; + 
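/* Illustrative sketch (not part of this patch; names are hypothetical):
 * vsi_nn_ComputeFilterSize() is not shown in this diff, but with
 * VSI_NN_ROUND_FLOOR a convolution output dimension is conventionally derived
 * as below.  pad[0]/pad[1] stand for the two pads on one axis, which is why
 * op_setup() passes &nn_param->pad[0], &nn_param->pad[2] and &nn_param->pad[4]
 * for W, H and D respectively.
 */
static uint32_t conv_out_dim_floor_sketch
    (
    uint32_t in_size, uint32_t kernel_size, const uint32_t pad[2],
    uint32_t stride, uint32_t dilation
    )
{
    uint32_t effective_kernel = dilation * (kernel_size - 1) + 1; /* dilated footprint */
    /* integer division rounds toward zero, i.e. floor for non-negative operands */
    return (in_size + pad[0] + pad[1] - effective_kernel) / stride + 1;
}
/* Example: in=16, kernel=3, pad={1,1}, stride=2, dilation=1 -> (16+2-3)/2+1 = 8. */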
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + /* TODO + //self->nn_param.conv3d.local = \ + // (conv3d_local_data_t*)malloc(sizeof(conv3d_local_data_t)); + */ + + return VSI_SUCCESS; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_op_common_deinit(self); + + /* TODO + //vsi_nn_safe_free(self->nn_param.conv3d.local); + */ + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CONV3D, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS \ No newline at end of file diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c index 6a6647f..85d35df 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c @@ -187,24 +187,53 @@ static vsi_status op_optimize p_opt = &opt; } - inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( - VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, - 4, #ifdef VSI_40BIT_VA_SUPPORT - inputs[0]->attr.size, - outputs[0]->attr.size, - outputs[0]->attr.size, + { + vx_size size_input0[VSI_NN_MAX_DIM_NUM]; + vx_size size_output0[VSI_NN_MAX_DIM_NUM]; + size_t i = 0; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_input0[i] = (vx_size)inputs[0]->attr.size[i]; + size_output0[i] = (vx_size)outputs[0]->attr.size[i]; + } + inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( + VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, + 4, + size_input0, + size_output0, + size_output0, + outputs[0]->attr.dtype.vx_type, + (vx_nn_convolution_relu_pooling_params_t *)&p, + sizeof(p), + p_opt, + inputs[1]->t, inputs[2]->t + ); + } #else - (vx_uint32*)inputs[0]->attr.size, - (vx_uint32*)outputs[0]->attr.size, - (vx_uint32*)outputs[0]->attr.size, + { + uint32_t size_u32_input0[VSI_NN_MAX_DIM_NUM]; + uint32_t size_u32_output0[VSI_NN_MAX_DIM_NUM]; + size_t i = 0; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_u32_input0[i] = (uint32_t)inputs[0]->attr.size[i]; + size_u32_output0[i] = (uint32_t)outputs[0]->attr.size[i]; + } + inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( + VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, + 4, + size_u32_input0, + size_u32_output0, + size_u32_output0, + outputs[0]->attr.dtype.vx_type, + (vx_nn_convolution_relu_pooling_params_t *)&p, + sizeof(p), + p_opt, + inputs[1]->t, inputs[2]->t + ); + } #endif - outputs[0]->attr.dtype.vx_type, - (vx_nn_convolution_relu_pooling_params_t *)&p, - sizeof(p), - p_opt, - inputs[1]->t, inputs[2]->t - ); vsi_nn_DeinitConvReluPoolParameter( &p ); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c index f49ef3b..48b43b5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c @@ -215,18 +215,30 @@ static vsi_status op_optimize } #ifdef VSI_40BIT_VA_SUPPORT - inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( - VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, - 4, - inputs[0]->attr.size, - pconv_out->attr.size, - outputs[0]->attr.size, - outputs[0]->attr.dtype.vx_type, - (vx_nn_convolution_relu_pooling_params_t *)&p, - sizeof(p), - 
p_opt, - inputs[1]->t, inputs[2]->t - ); + { + vx_size size_input0[VSI_NN_MAX_DIM_NUM]; + vx_size size_pconv_out[VSI_NN_MAX_DIM_NUM]; + vx_size size_output0[VSI_NN_MAX_DIM_NUM]; + size_t i = 0; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_input0[i] = (vx_size)inputs[0]->attr.size[i]; + size_pconv_out[i] = (vx_size)pconv_out->attr.size[i]; + size_output0[i] = (vx_size)outputs[0]->attr.size[i]; + } + inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( + VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, + 4, + size_input0, + size_pconv_out, + size_output0, + outputs[0]->attr.dtype.vx_type, + (vx_nn_convolution_relu_pooling_params_t *)&p, + sizeof(p), + p_opt, + inputs[1]->t, inputs[2]->t + ); + } #else { uint32_t size_u32_input0[VSI_NN_MAX_DIM_NUM]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index d91a7e6..fec61bb 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -104,16 +104,11 @@ static vsi_status op_optimize { if(NULL == inputs[0]->t && NULL != outputs[0]->t) { -#ifdef VSI_40BIT_VA_SUPPORT - inputs[0]->t = vxReshapeTensor(outputs[0]->t, - inputs[0]->attr.size, inputs[0]->attr.dim_num); -#else - inputs[0]->t = vxReshapeTensor(outputs[0]->t, - (vx_int32*)inputs[0]->attr.size, inputs[0]->attr.dim_num); -#endif + inputs[0]->t = vsi_nn_safe_reshape_tensor(outputs[0]->t, + (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num, sizeof(inputs[0]->attr.size[0])); if( inputs[0]->t == NULL ) { - VSILOGE("Call vxReshapeTensor fail"); + VSILOGE("Call vsi_nn_safe_reshape_tensor fail"); return VSI_FAILURE; } self->nn_param.dataconvert.lcl_data->use_reshape = TRUE; @@ -123,16 +118,11 @@ static vsi_status op_optimize { if(NULL == outputs[0]->t && NULL != inputs[0]->t) { -#ifdef VSI_40BIT_VA_SUPPORT - outputs[0]->t = vxReshapeTensor(inputs[0]->t, - outputs[0]->attr.size, outputs[0]->attr.dim_num); -#else - outputs[0]->t = vxReshapeTensor(inputs[0]->t, - (vx_int32*)outputs[0]->attr.size, outputs[0]->attr.dim_num); -#endif + outputs[0]->t = vsi_nn_safe_reshape_tensor(inputs[0]->t, + (void*)outputs[0]->attr.size, (vsi_size_t)outputs[0]->attr.dim_num, sizeof(outputs[0]->attr.size[0])); if( outputs[0]->t == NULL ) { - VSILOGE("Call vxReshapeTensor fail"); + VSILOGE("Call vsi_nn_safe_reshape_tensor fail"); return VSI_FAILURE; } self->nn_param.dataconvert.lcl_data->use_reshape = TRUE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c index a82f521..2373688 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c @@ -53,6 +53,7 @@ static vsi_status _eltwise_op_compute vsi_bool ret = TRUE; vx_bool doShapeOptimized = TRUE; vsi_nn_kernel_param_t * param = NULL; + vsi_nn_context_t ctx = NULL; if( NULL == self ) { @@ -60,9 +61,13 @@ static vsi_status _eltwise_op_compute } status = VSI_FAILURE; + ctx = self->graph->ctx; + if ( strcmp(kernel_name, "sub") == 0 || strcmp(kernel_name, "add") == 0 - || strcmp(kernel_name, "mul") == 0 ) + || strcmp(kernel_name, "mul") == 0 + || (strcmp(kernel_name, "maximum") == 0 && ctx->config.support_stream_processor) + || (strcmp(kernel_name, "minimum") == 0 && ctx->config.support_stream_processor)) { doShapeOptimized = FALSE; @@ -184,7 +189,6 @@ vsi_bool vsi_nn_op_eltwise_setup return ret; } /* vsi_nn_op_eltwise_setup() */ - static vsi_bool op_check_minimum ( vsi_nn_node_t * self, @@ -322,7 +326,6 @@ 
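/* Illustrative sketch (assumption, not part of this patch): the body of
 * vsi_nn_safe_reshape_tensor() is not included in this diff.  The call sites
 * above suggest a wrapper that hides the old "#ifdef VSI_40BIT_VA_SUPPORT"
 * branches by converting the caller's size array, whose element width is
 * passed explicitly, to the width the linked OpenVX header expects before
 * calling vxReshapeTensor once.  The helper name below is hypothetical.
 */
static vx_tensor reshape_tensor_sketch
    (
    vx_tensor tensor, const void * sizes, vsi_size_t dim_num, vsi_size_t elem_size
    )
{
#ifdef VSI_40BIT_VA_SUPPORT
    vx_size dims[VSI_NN_MAX_DIM_NUM] = { 0 };
#else
    vx_int32 dims[VSI_NN_MAX_DIM_NUM] = { 0 };
#endif
    vsi_size_t i = 0;
    for( i = 0; i < dim_num; i++ )
    {
        if( elem_size == sizeof(uint32_t) )
        {
            dims[i] = ((const uint32_t *)sizes)[i];   /* 32-bit caller array */
        }
        else
        {
            dims[i] = ((const vsi_size_t *)sizes)[i]; /* vsi_size_t caller array */
        }
    }
    return vxReshapeTensor( tensor, dims, (vx_uint32)dim_num );
}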
static vsi_bool op_check_pow return TRUE; } /* op_check() */ - static vsi_bool op_check_add ( vsi_nn_node_t * self, @@ -457,9 +460,6 @@ static vsi_bool op_check_sub return ret; } /* op_check() */ - - - static vsi_bool op_check_div ( vsi_nn_node_t * self, @@ -518,7 +518,6 @@ static vsi_bool op_check_div return TRUE; } /* op_check() */ - static vsi_bool op_check_mul ( vsi_nn_node_t * self, @@ -657,7 +656,6 @@ DEF_ELEMENT_WISE_OP( DIVIDE, div ); DEF_ELEMENT_WISE_OP( MULTIPLY, mul ); DEF_ELEMENT_WISE_OP( POW, pow ); - #undef DEF_ELEMENT_WISE_OP #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c index b2a162f..d8ae9d9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c @@ -46,6 +46,7 @@ static vsi_status _eltwise_unary_op_compute { vsi_status status = VSI_FAILURE; float alpha = 0; + float beta = 0; vsi_nn_kernel_param_t * param = NULL; if( NULL == self ) @@ -54,8 +55,17 @@ static vsi_status _eltwise_unary_op_compute } param = vsi_nn_kernel_param_create(); - alpha = self->nn_param.elu.alpha; + if (strcmp(kernel_name, "elu") == 0) + { + alpha = self->nn_param.elu.alpha; + } + else + { + alpha = self->nn_param.hard_sigmoid.alpha; + beta = self->nn_param.hard_sigmoid.beta; + } vsi_nn_kernel_param_add_float32( param, "alpha", alpha ); + vsi_nn_kernel_param_add_float32( param, "beta", beta ); // TODO: This optimzie is a hack for gpu path, // it should be moved to gpu kernel setup. @@ -158,7 +168,8 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F16) END_IO_TYPE_DECL(ELTWISE_UNARY) - if(!VALIDATE_OP_IO_TYPES(ELTWISE_UNARY, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(ELTWISE_UNARY, self, inputs, self->input.num, outputs, self->output.num)) + { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -169,15 +180,22 @@ static vsi_bool op_check return TRUE; } /* op_check() */ -static vsi_status op_init +static vsi_status _eltwise_unary_op_init ( + const char * kernel_name, vsi_nn_node_t * self ) { - if (vsi_nn_compareVersion(self->graph, 1, 1, 29) == -1) + if (vsi_nn_compareVersion(self->graph, 1, 1, 29) == -1 && + strcmp(kernel_name, "elu") == 0) { self->nn_param.elu.alpha = 1; } + else if (strcmp(kernel_name, "hard_sigmoid") == 0) + { + self->nn_param.hard_sigmoid.alpha = 0.2f; + self->nn_param.hard_sigmoid.beta = 0.5f; + } return VSI_SUCCESS; } /* op_init() */ @@ -196,7 +214,15 @@ extern "C" { { \ return _eltwise_unary_op_compute( ""#kernel_name, self, inputs, outputs ); \ } \ -DEF_OP_REG(name, op_init, op_compute_##kernel_name, vsi_nn_op_common_deinit, op_check, op_setup, NULL, 1, 1) + static vsi_status op_init_##kernel_name \ + ( \ + vsi_nn_node_t * self \ + ) \ + { \ + return _eltwise_unary_op_init( ""#kernel_name, self ); \ + } \ +DEF_OP_REG(name, op_init_##kernel_name, op_compute_##kernel_name, \ + vsi_nn_op_common_deinit, op_check, op_setup, NULL, 1, 1) DEF_ELEMENT_WISE_UNARY_OP( SIN, sin ); DEF_ELEMENT_WISE_UNARY_OP( EXP, exp ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c b/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c index e55e456..b3ae7cf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c @@ -32,6 +32,7 @@ #include "vsi_nn_prv.h" 
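/* Illustrative note on the eltwise_unary changes above (sketch, not part of
 * this patch): op_init now seeds hard_sigmoid with alpha = 0.2 and beta = 0.5,
 * and op_compute forwards both as the "alpha"/"beta" kernel parameters.  Those
 * parameters usually describe the clamped affine activation below; the helper
 * name is hypothetical.
 */
static float hard_sigmoid_ref(float x, float alpha, float beta)
{
    float y = alpha * x + beta;   /* affine part */
    if (y < 0.0f) y = 0.0f;       /* clamp into [0, 1] */
    if (y > 1.0f) y = 1.0f;
    return y;
}
/* With the new defaults: hard_sigmoid_ref(0.0f, 0.2f, 0.5f) == 0.5f and
 * hard_sigmoid_ref(3.0f, 0.2f, 0.5f) == 1.0f. */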
#include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" @@ -52,11 +53,7 @@ static void _reshape_tensor attr.size[2] = input->attr.size[1]; attr.dim_num = 3; } -#ifdef VSI_40BIT_VA_SUPPORT - *output = vxReshapeTensor( input->t, attr.size, attr.dim_num ); -#else - *output = vxReshapeTensor( input->t, (vx_int32*)attr.size, attr.dim_num ); -#endif + *output = vsi_nn_safe_reshape_tensor( input->t, (void*)attr.size, (vsi_size_t)attr.dim_num , sizeof(attr.size[0])); } static vsi_status op_compute diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c index e17292b..325e9c1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c @@ -98,23 +98,28 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(FLOORDIV, 2, 1) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) - IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16, D_I32, D_I32) END_IO_TYPE_DECL(FLOORDIV) - if(!VALIDATE_OP_IO_TYPES(FLOORDIV, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(FLOORDIV, self, inputs, self->input.num, outputs, self->output.num)) + { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c index ab84c5a..812c7df 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c @@ -73,11 +73,7 @@ static vsi_status op_compute input_size[0] = num_fc; input_size[1] = num_no_fc; dims= 2; -#ifdef VSI_40BIT_VA_SUPPORT - input = vxReshapeTensor(inputs[0]->t, input_size, dims); -#else - input = vxReshapeTensor(inputs[0]->t, (vx_int32*)input_size, dims); -#endif + input = vsi_nn_safe_reshape_tensor(inputs[0]->t, (void*)input_size, (vsi_size_t)dims, sizeof(input_size[0])); weight = inputs[1]->t; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c index 0cdd29d..1bbb5ee 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c @@ -95,62 +95,25 @@ static vsi_status op_compute input_size[0] = num_fc; input_size[1] = num_no_fc; dims= 2; -#ifdef VSI_40BIT_VA_SUPPORT - input = vxReshapeTensor(inputs[0]->t, input_size, dims); -#else - { - int32_t input_size_32bit[VSI_NN_MAX_DIM_NUM] = {0}; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - input_size_32bit[i] = (int32_t)input_size[i]; - } - input = vxReshapeTensor(inputs[0]->t, input_size_32bit, (uint32_t)dims); - } -#endif + input = vsi_nn_safe_reshape_tensor(inputs[0]->t, (void*)input_size, (vsi_size_t)dims, sizeof(input_size[0])); weights_size[0] = num_fc; weights_size[1] = ofm; dims= 2; -#ifdef VSI_40BIT_VA_SUPPORT - weight = vxReshapeTensor(inputs[1]->t, weights_size, dims); -#else - { - int32_t weight_size_32bit[VSI_NN_MAX_DIM_NUM] = {0}; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - weight_size_32bit[i] = (int32_t)weight_size_32bit[i]; - } - weight = vxReshapeTensor(inputs[1]->t, weight_size_32bit, (uint32_t)dims); - } -#endif + weight = vsi_nn_safe_reshape_tensor(inputs[1]->t, (void*)weights_size, (vsi_size_t)dims, sizeof(weights_size[0])); if( inputs[2] != NULL ) { bias_size[0] = ofm; bias_size[1] = 1; dims= 2; -#ifdef VSI_40BIT_VA_SUPPORT - bias = vxReshapeTensor(inputs[2]->t, bias_size, dims); -#else - { - int32_t bias_size_32bit[VSI_NN_MAX_DIM_NUM] = {0}; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - bias_size_32bit[i] = (int32_t)bias_size[i]; - } - bias = vxReshapeTensor(inputs[2]->t, bias_size_32bit, (uint32_t)dims); - } -#endif + bias = vsi_nn_safe_reshape_tensor(inputs[2]->t, (void*)bias_size, (vsi_size_t)dims, sizeof(bias_size[0])); } output_size[0] = ofm; output_size[1] = num_no_fc; dims= 2; -#ifdef VSI_40BIT_VA_SUPPORT - output = vxReshapeTensor(outputs[0]->t, output_size, dims); -#else - output = vxReshapeTensor(outputs[0]->t, (vx_int32*)output_size, (uint32_t)dims); -#endif + output = vsi_nn_safe_reshape_tensor(outputs[0]->t, (void*)output_size, (vsi_size_t)dims, sizeof(output_size[0])); self->n = vxFullyConnectedLayer( self->graph->g, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c index 8766867..cf19eeb 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c @@ -267,23 +267,51 @@ static vsi_bool op_setup opt.num_of_output_dims = outputs[0]->attr.dim_num; p_opt = &opt; - inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors3( - VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER, #ifdef VSI_40BIT_VA_SUPPORT - inputs[0]->attr.size, - outputs[0]->attr.size, - outputs[0]->attr.size, + { + vx_size size_input0[VSI_NN_MAX_DIM_NUM]; + vx_size size_output0[VSI_NN_MAX_DIM_NUM]; + size_t i = 0; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_input0[i] = (vx_size)inputs[0]->attr.size[i]; + size_output0[i] = (vx_size)outputs[0]->attr.size[i]; + } + inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors3( + VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER, + size_input0, + size_output0, + size_output0, + &p, + sizeof(p), + (vx_weights_biases_parameter_optimizations_t *)p_opt, + sizeof(opt), + inputs[1]->t, inputs[2]->t + ); + } #else - (vx_uint32*)inputs[0]->attr.size, - (vx_uint32*)outputs[0]->attr.size, - (vx_uint32*)outputs[0]->attr.size, + { + uint32_t size_u32_input0[VSI_NN_MAX_DIM_NUM]; + uint32_t size_u32_output0[VSI_NN_MAX_DIM_NUM]; + size_t i = 0; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_u32_input0[i] = 
(uint32_t)inputs[0]->attr.size[i]; + size_u32_output0[i] = (uint32_t)outputs[0]->attr.size[i]; + } + inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors3( + VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER, + size_u32_input0, + size_u32_output0, + size_u32_output0, + &p, + sizeof(p), + (vx_weights_biases_parameter_optimizations_t *)p_opt, + sizeof(opt), + inputs[1]->t, inputs[2]->t + ); + } #endif - &p, - sizeof(p), - (vx_weights_biases_parameter_optimizations_t *)p_opt, - sizeof(opt), - inputs[1]->t, inputs[2]->t - ); if( p.pad_const ) { vxReleaseScalar( &p.pad_const ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c index 2776150..6cf086c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c @@ -54,17 +54,17 @@ static vsi_status op_compute vsi_size_t *input_size = inputs[0]->attr.size; vsi_size_t dims_num = inputs[0]->attr.dim_num; - if(inputs[1]->attr.dim_num > 1) + if (inputs[1]->attr.dim_num > 1) { coord_dim = inputs[1]->attr.size[0]; } - if( coord_dim > 3 ) + if (coord_dim > 4 || (coord_dim > 3 && input_size[dims_num - 1] != 1)) { CHECK_STATUS(status); return status; } - param =vsi_nn_kernel_param_create(); + param = vsi_nn_kernel_param_create(); for(i = 0; i < dims_num - coord_dim; ++i) { @@ -74,13 +74,13 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "block_size", (int32_t)block_size ); vsi_nn_kernel_param_add_int32( param, "coord_dim", (int32_t)coord_dim ); n = vsi_nn_kernel_selector( self->graph, "gather_nd", inputs, 2, outputs, 1, param ); - if( n != NULL ) + if ( n != NULL ) { self->n = (vx_node)n; status = VSI_SUCCESS; } - if(param != NULL) + if (param != NULL) { vsi_nn_kernel_param_release( ¶m ); } @@ -110,7 +110,7 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I32, D_F16) END_IO_TYPE_DECL(GATHER_ND) - if(!VALIDATE_OP_IO_TYPES(GATHER_ND, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(GATHER_ND, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -131,10 +131,10 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. 
*/ vsi_size_t i = 0; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { vsi_size_t j = 0, coord_dim = 1; - if(inputs[1]->attr.dim_num > 1) + if (inputs[1]->attr.dim_num > 1) { coord_dim = inputs[1]->attr.size[0]; } @@ -147,7 +147,7 @@ static vsi_bool op_setup { outputs[0]->attr.size[j++] = inputs[1]->attr.size[i]; } - if(inputs[1]->attr.dim_num == 1) + if (inputs[1]->attr.dim_num == 1) { outputs[0]->attr.size[j++] = inputs[1]->attr.size[0]; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c index 61efd47..21e0a17 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c @@ -164,8 +164,8 @@ static vsi_status _op_optimize /* insert a reshape node before and after 3D group_norm */ - shape[0] = 1; - shape[1] = inputs[0]->attr.size[0]; + shape[0] = inputs[0]->attr.size[0]; + shape[1] = 1; shape[2] = inputs[0]->attr.size[1]; shape[3] = inputs[0]->attr.size[2]; dim = 4; @@ -203,17 +203,25 @@ static vsi_bool _op_check { BEGIN_IO_TYPE_DECL(GROUP_NORM, 3, 1) IO_TYPE(D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F16) IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_F32, D_F32, D_F16, D_F32) IO_TYPE(D_F32, D_F32, D_F32, D_F32) IO_TYPE(D_I32, D_F32, D_F16, D_I32) IO_TYPE(D_I32, D_F32, D_F16, D_F32) IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F16) IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F16) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) END_IO_TYPE_DECL(GROUP_NORM) if (!VALIDATE_OP_IO_TYPES(GROUP_NORM, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c index b3aec6e..ad4c2a7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c @@ -337,6 +337,8 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { + vsi_nn_internal_deinit_node_wksp( self ); + return VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c index a007884..18ae554 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c @@ -24,7 +24,6 @@ #include #include - #include "vsi_nn_types.h" #include "vsi_nn_platform.h" #include "vsi_nn_log.h" @@ -73,7 +72,15 @@ static vsi_nn_internal_tensor_t * _create_fc } attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if (input->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 || + input->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16) + { + attr.dtype.vx_type = input->attr.dtype.vx_type; + } + else + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } attr.dim_num = VSI_NN_DIM_AUTO; attr.vtl = TRUE; attr.is_const = FALSE; @@ -91,110 +98,6 @@ static vsi_nn_internal_tensor_t * _create_fc return fc_out; } /* () */ -/* - copmute the recurrent hstate gates - equations: - reset_after == True: - ht = FC(hstate, kernel_rh, bias_rh) - ht = rt 
* ht - reset_after == False: - ht = rt * hstate - ht = FC(ht, kernel_rh, bias_rh) -*/ -static vsi_nn_internal_tensor_t * _compute_ht - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t * input_rt, - vsi_nn_tensor_t * hstate, - vsi_nn_tensor_t * weight, - vsi_nn_tensor_t * bias - ) -{ - vsi_bool use_virtual_tensor = TRUE; - vsi_nn_grucell_param * p = &self->nn_param.grucell; - vsi_nn_internal_tensor_t * tensor1 = NULL, * tensor2 = NULL; - - if(p->reset_after == TRUE) - { - tensor1 = _create_fc( - self, - hstate, - weight, - bias - ); - tensor2 = vsi_nn_rnn_create_binary_operator( - self, - VSI_NN_OP_MULTIPLY, - input_rt, - tensor1->t, - &input_rt->attr.dtype, - use_virtual_tensor - ); - } - else - { - tensor1 = vsi_nn_rnn_create_binary_operator( - self, - VSI_NN_OP_MULTIPLY, - input_rt, - hstate, - &input_rt->attr.dtype, - use_virtual_tensor - ); - tensor2 = _create_fc( - self, - tensor1->t, - weight, - bias - ); - } - - return tensor2; -} /* _compute_ht() */ - -/* - compute the recurrent update gates or reset gates - equations: - xt = FC(hstate, kernel_xt, bias_xt) - xt = input_xt + xt - xt = recurrent_activation(xt) -*/ -static vsi_nn_internal_tensor_t * _compute_recurrent_gate - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t * input_xt, - vsi_nn_tensor_t * hstate, - vsi_nn_tensor_t * weight, - vsi_nn_tensor_t * bias - ) -{ - vsi_bool use_virtual_tensor = TRUE; - vsi_nn_grucell_param * p = &self->nn_param.grucell; - vsi_nn_internal_tensor_t * tensor_add = NULL, * tensor_act; - vsi_nn_internal_tensor_t * recurrent_fc_out = NULL; - - recurrent_fc_out = _create_fc(self, hstate, weight, bias); - - tensor_add = vsi_nn_rnn_create_binary_operator( - self, - VSI_NN_OP_ADD, - recurrent_fc_out->t, - input_xt, - &recurrent_fc_out->t->attr.dtype, - use_virtual_tensor - ); - - tensor_act = vsi_nn_rnn_create_activation( - self, - tensor_add->t, - p->recurrent_activation, - &tensor_add->t->attr.dtype, - use_virtual_tensor - ); - - return tensor_act; -} /* _compute_recurrent_gate */ - static vsi_bool setup_op_shapes ( vsi_nn_node_t * self, @@ -251,6 +154,8 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { + vsi_nn_internal_deinit_node_wksp( self ); + return VSI_SUCCESS; } @@ -265,7 +170,8 @@ static vsi_status op_optimize return vsi_nn_internal_optimize_node( self, direction ); } -static vsi_bool op_setup +#if 1 +static vsi_bool op_setup_default ( vsi_nn_node_t * self, vsi_nn_tensor_t ** inputs, @@ -276,7 +182,9 @@ static vsi_bool op_setup vsi_nn_internal_node_t * curr = NULL; vsi_nn_grucell_param * p = &self->nn_param.grucell; vsi_nn_internal_tensor_t * input_fc_outputs[GRUCELL_GATE_CNT] = { NULL }; - vsi_nn_internal_tensor_t * zt = NULL, * rt = NULL, * ht = NULL; + vsi_nn_internal_tensor_t * hstate_fc_outputs[GRUCELL_GATE_CNT] = { NULL }; + vsi_nn_internal_tensor_t * h_times_r = NULL; + vsi_nn_tensor_attr_t attr; vsi_nn_internal_init_node_wksp( self ); @@ -294,42 +202,136 @@ static vsi_bool op_setup ); } - /* compute update gate and reset gate */ - zt = _compute_recurrent_gate( - self, - input_fc_outputs[GRUCELL_GATES_Z]->t, - inputs[GRUCELL_IN_H_STATE], - inputs[GRUCELL_IN_KERNEL_R2Z], - inputs[GRUCELL_IN_BIAS_R2Z] - ); - rt = _compute_recurrent_gate( - self, - input_fc_outputs[GRUCELL_GATES_R]->t, - inputs[GRUCELL_IN_H_STATE], - inputs[GRUCELL_IN_KERNEL_R2R], - inputs[GRUCELL_IN_BIAS_R2R] - ); + /* create hstate fc */ + for(i = 0; i < GRUCELL_GATE_CNT - 1; i++) + { + hstate_fc_outputs[i] = _create_fc( + self, + inputs[GRUCELL_IN_H_STATE], + inputs[GRUCELL_IN_KERNEL_R2Z + i], + inputs[GRUCELL_IN_BIAS_R2Z 
+ i] + ); + } - /* compute recurrent h with parameter 'reset_after' */ - ht = _compute_ht( + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + if (inputs[GRUCELL_IN_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 || + self->graph->ctx->config.support_stream_processor) + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + } + else + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = TRUE; + attr.is_const = FALSE; + h_times_r = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_H_TIMES_ACTIVATION_R, 3, 1 ); + curr->node->nn_param.grucell_h_times_activation_r.recurrent_activation = p->recurrent_activation; + curr->inputs[0] = inputs[GRUCELL_IN_H_STATE]; + curr->inputs[1] = input_fc_outputs[GRUCELL_GATES_R]->t; + curr->inputs[2] = hstate_fc_outputs[GRUCELL_GATES_R]->t; + curr->outputs[0] = h_times_r->t; + vsi_nn_internal_setup_node(self, curr); + + hstate_fc_outputs[GRUCELL_GATES_H] = _create_fc( self, - rt->t, - inputs[GRUCELL_IN_H_STATE], + h_times_r->t, inputs[GRUCELL_IN_KERNEL_R2H], inputs[GRUCELL_IN_BIAS_R2H] ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_Z_H, 0, 0 ); + curr->node->nn_param.grucell_activation_z_h.activation = p->activation; + curr->node->nn_param.grucell_activation_z_h.recurrent_activation = p->recurrent_activation; + curr->inputs[GRUCELL_ACT_Z_H_HSTATE] = inputs[GRUCELL_IN_H_STATE]; + curr->inputs[GRUCELL_ACT_Z_H_I_FC_Z] = input_fc_outputs[GRUCELL_GATES_Z]->t; + curr->inputs[GRUCELL_ACT_Z_H_I_FC_H] = input_fc_outputs[GRUCELL_GATES_H]->t; + curr->inputs[GRUCELL_ACT_Z_H_H_FC_Z] = hstate_fc_outputs[GRUCELL_GATES_Z]->t; + curr->inputs[GRUCELL_ACT_Z_H_H_FC_H] = hstate_fc_outputs[GRUCELL_GATES_H]->t; + curr->outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT] = outputs[GRUCELL_OUT_OUTPUT]; + curr->outputs[GRUCELL_ACT_Z_H_OUT_HSTATE] = outputs[GRUCELL_OUT_H_STATE]; + vsi_nn_internal_setup_node(self, curr); + + return TRUE; +} +#endif + +static vsi_bool op_setup_reset_after + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + vsi_nn_internal_node_t * curr = NULL; + vsi_nn_grucell_param * p = &self->nn_param.grucell; + vsi_nn_internal_tensor_t * input_fc_outputs[GRUCELL_GATE_CNT] = { NULL }; + vsi_nn_internal_tensor_t * hstate_fc_outputs[GRUCELL_GATE_CNT] = { NULL }; + + vsi_nn_internal_init_node_wksp( self ); + + /* compute output tensor's shapes */ + setup_op_shapes(self, inputs, outputs); + + /* create input fc */ + for(i = 0; i < GRUCELL_GATE_CNT; i++) + { + input_fc_outputs[i] = _create_fc( + self, + inputs[GRUCELL_IN_INPUT], + inputs[GRUCELL_IN_KERNEL_I2Z + i], + inputs[GRUCELL_IN_BIAS_I2Z + i] + ); + } + + /* create hstate fc */ + for(i = 0; i < GRUCELL_GATE_CNT; i++) + { + hstate_fc_outputs[i] = _create_fc( + self, + inputs[GRUCELL_IN_H_STATE], + inputs[GRUCELL_IN_KERNEL_R2Z + i], + inputs[GRUCELL_IN_BIAS_R2Z + i] + ); + } + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION, 0, 0 ); curr->node->nn_param.grucell_activation.activation = p->activation; - curr->inputs[GRUCELL_ACT_IN_H_STATE] = inputs[GRUCELL_IN_H_STATE]; - curr->inputs[GRUCELL_ACT_IN_INPUT_FC_H] = input_fc_outputs[GRUCELL_GATES_H]->t; - curr->inputs[GRUCELL_ACT_IN_H_T] = ht->t; - curr->inputs[GRUCELL_ACT_IN_Z_T] = zt->t; + curr->node->nn_param.grucell_activation.recurrent_activation = p->recurrent_activation; + curr->inputs[GRUCELL_ACT_H_STATE] = inputs[GRUCELL_IN_H_STATE]; + 
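/* Illustrative sketch (not part of this patch; names are hypothetical): the
 * fused GRUCELL_ACTIVATION / GRUCELL_ACTIVATION_Z_H kernels wired here replace
 * the per-gate subgraph that the removed helpers (_compute_recurrent_gate,
 * _compute_ht and the old grucell_activation setup) used to build.  Per
 * element, with i_* the input FC results and h_* the hstate FC results, that
 * removed code computed:
 *
 *   z   = recurrent_activation(i_z + h_z)
 *   r   = recurrent_activation(i_r + h_r)
 *   hh  = activation(i_h + r * h_h)            (reset_after == TRUE)
 *   hh  = activation(i_h + FC(r * hstate))     (reset_after == FALSE)
 *   out = z * (hstate - hh) + hh               (also copied to the new hstate)
 *
 * A scalar reference of the reset_after form:
 */
static float grucell_reset_after_ref
    (
    float hstate, float i_z, float i_r, float i_h,
    float h_z, float h_r, float h_h,
    float (*recurrent_activation)(float), float (*activation)(float)
    )
{
    float z  = recurrent_activation(i_z + h_z);   /* update gate */
    float r  = recurrent_activation(i_r + h_r);   /* reset gate  */
    float hh = activation(i_h + r * h_h);         /* candidate state */
    return z * (hstate - hh) + hh;                /* new output / hstate */
}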
curr->inputs[GRUCELL_ACT_I_FC_Z] = input_fc_outputs[GRUCELL_GATES_Z]->t; + curr->inputs[GRUCELL_ACT_I_FC_R] = input_fc_outputs[GRUCELL_GATES_R]->t; + curr->inputs[GRUCELL_ACT_I_FC_H] = input_fc_outputs[GRUCELL_GATES_H]->t; + curr->inputs[GRUCELL_ACT_H_FC_Z] = hstate_fc_outputs[GRUCELL_GATES_Z]->t; + curr->inputs[GRUCELL_ACT_H_FC_R] = hstate_fc_outputs[GRUCELL_GATES_R]->t; + curr->inputs[GRUCELL_ACT_H_FC_H] = hstate_fc_outputs[GRUCELL_GATES_H]->t; curr->outputs[GRUCELL_ACT_OUT_OUTPUT] = outputs[GRUCELL_OUT_OUTPUT]; curr->outputs[GRUCELL_ACT_OUT_H_STATE] = outputs[GRUCELL_OUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); return TRUE; +} + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if (self->nn_param.grucell.reset_after == TRUE) + { + return op_setup_reset_after(self, inputs, outputs); + } + else + { + return op_setup_default(self, inputs, outputs); + } } /* op_setup() */ #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c index 2af4c6e..4fcd612 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c @@ -21,24 +21,18 @@ * DEALINGS IN THE SOFTWARE. * *****************************************************************************/ + + #include #include - #include "vsi_nn_types.h" -#include "vsi_nn_platform.h" #include "vsi_nn_log.h" -#include "vsi_nn_graph.h" #include "vsi_nn_node.h" #include "vsi_nn_prv.h" #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" -#include "vsi_nn_tensor_util.h" -#include "vsi_nn_internal_node.h" -#include "vsi_nn_rnn_helper.h" -#include "utils/vsi_nn_math.h" -#include "utils/vsi_nn_tensor_op.h" #include "utils/vsi_nn_util.h" -#include "ops/vsi_nn_op_grucell_activation.h" +#include "kernel/vsi_nn_kernel.h" typedef struct _vsi_nn_grucell_activation_local { void * placeholder; @@ -51,8 +45,28 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - return vsi_nn_internal_compute_node( self ); -} + vsi_nn_grucell_activation_param* p = &self->nn_param.grucell_activation; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t* param; + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32(param, "activation", p->activation); + vsi_nn_kernel_param_add_int32(param, "recurrent_activation", p->recurrent_activation); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "grucell_reset_after_activation", + inputs, GRUCELL_ACT_IN_CNT, + outputs, GRUCELL_ACT_OUT_CNT, + param ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ static vsi_bool op_check ( @@ -61,8 +75,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + /*TODO: Check tensor shapes. 
*/ return TRUE; -} +} /* op_check() */ static vsi_bool op_setup ( @@ -71,110 +86,43 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool use_virtual_tensor= TRUE; - vsi_nn_grucell_activation_param * p = &self->nn_param.grucell_activation; - vsi_nn_internal_tensor_t * tmp_sub = NULL, * tmp_add = NULL, * tmp_mul = NULL; - vsi_nn_internal_tensor_t * tmp_act = NULL; - vsi_nn_internal_node_t * curr = NULL; - - vsi_nn_internal_init_node_wksp( self ); - - if(VSI_NN_DIM_AUTO == outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dim_num) + if (VSI_NN_DIM_AUTO == outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dim_num) { - outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dim_num = 2; - outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.size[0] = inputs[GRUCELL_ACT_IN_H_STATE]->attr.size[0]; - outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.size[1] = inputs[GRUCELL_ACT_IN_H_STATE]->attr.size[1]; + outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dim_num = \ + inputs[GRUCELL_ACT_H_STATE]->attr.dim_num; + + memcpy( outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.size, + inputs[GRUCELL_ACT_H_STATE]->attr.size, + inputs[GRUCELL_ACT_H_STATE]->attr.dim_num * sizeof(vsi_size_t) ); } - /* - hht = activation(fc_h + ht) - */ - tmp_add = vsi_nn_rnn_create_binary_operator( - self, - VSI_NN_OP_ADD, - inputs[GRUCELL_ACT_IN_INPUT_FC_H], - inputs[GRUCELL_ACT_IN_H_T], - &inputs[GRUCELL_ACT_IN_INPUT_FC_H]->attr.dtype, - use_virtual_tensor - ); - tmp_act = vsi_nn_rnn_create_activation( - self, - tmp_add->t, - p->activation, - &tmp_add->t->attr.dtype, - use_virtual_tensor - ); + if (VSI_NN_DIM_AUTO == outputs[GRUCELL_ACT_OUT_H_STATE]->attr.dim_num) + { + outputs[GRUCELL_ACT_OUT_H_STATE]->attr.dim_num = \ + inputs[GRUCELL_ACT_H_STATE]->attr.dim_num; - /* - new_h = zt * (hstate - hht) + hht - */ - tmp_sub = vsi_nn_rnn_create_binary_operator( - self, - VSI_NN_OP_SUBTRACT, - inputs[GRUCELL_ACT_IN_H_STATE], - tmp_act->t, - &tmp_act->t->attr.dtype, - use_virtual_tensor - ); - tmp_mul = vsi_nn_rnn_create_binary_operator( - self, - VSI_NN_OP_MULTIPLY, - inputs[GRUCELL_ACT_IN_Z_T], - tmp_sub->t, - &tmp_sub->t->attr.dtype, - use_virtual_tensor - ); - - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); - curr->inputs[0] = tmp_mul->t; - curr->inputs[1] = tmp_act->t; - curr->outputs[0] = outputs[GRUCELL_ACT_OUT_OUTPUT]; - vsi_nn_internal_setup_node(self, curr); - - /* copy outputs to h_state */ - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); - curr->inputs[0] = outputs[GRUCELL_ACT_OUT_OUTPUT]; - curr->outputs[0] = outputs[GRUCELL_ACT_OUT_H_STATE]; - vsi_nn_internal_setup_node(self, curr); + memcpy( outputs[GRUCELL_ACT_OUT_H_STATE]->attr.size, + inputs[GRUCELL_ACT_H_STATE]->attr.size, + inputs[GRUCELL_ACT_H_STATE]->attr.dim_num * sizeof(vsi_size_t) ); + } return TRUE; -} +} /* op_setup() */ -static vsi_status op_deinit - ( - vsi_nn_node_t * self - ) -{ - return VSI_SUCCESS; -} +__BEGIN_DECLS -static vsi_status op_optimize - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_opt_direction_e direction - ) -{ - return vsi_nn_internal_optimize_node( self, direction ); -} - -#ifdef __cplusplus -extern "C" { -#endif /* Registrar */ DEF_OP_REG ( /* op_name */ GRUCELL_ACTIVATION, /* init */ NULL, /* compute */ op_compute, - /* deinit */ op_deinit, + /* deinit */ vsi_nn_op_common_deinit, /* check */ op_check, /* setup */ op_setup, - /* optimize */ op_optimize, + /* optimize */ NULL, /* input_num */ GRUCELL_ACT_IN_CNT, /* output_num */ GRUCELL_ACT_OUT_CNT ); -#ifdef __cplusplus -} -#endif + +__END_DECLS diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_z_h.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_z_h.c new file mode 100644 index 0000000..46eff0d --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_z_h.c @@ -0,0 +1,129 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _grucell_activation_z_h_local_data_t { + int32_t placeholder; +} grucell_activation_z_h_local_data_t; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_grucell_activation_param* p = &self->nn_param.grucell_activation; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t* param; + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32(param, "activation", p->activation); + vsi_nn_kernel_param_add_int32(param, "recurrent_activation", p->recurrent_activation); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "grucell_activation_z_h", + inputs, GRUCELL_ACT_Z_H_IN_CNT, + outputs, GRUCELL_ACT_Z_H_OUT_CNT, + param ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if (VSI_NN_DIM_AUTO == outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]->attr.dim_num) + { + outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]->attr.dim_num = \ + inputs[GRUCELL_ACT_Z_H_HSTATE]->attr.dim_num; + + memcpy( outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]->attr.size, + inputs[GRUCELL_ACT_Z_H_HSTATE]->attr.size, + inputs[GRUCELL_ACT_Z_H_HSTATE]->attr.dim_num * sizeof(vsi_size_t) ); + } + + if (VSI_NN_DIM_AUTO == outputs[GRUCELL_ACT_Z_H_OUT_HSTATE]->attr.dim_num) + { + outputs[GRUCELL_ACT_Z_H_OUT_HSTATE]->attr.dim_num = \ + inputs[GRUCELL_ACT_Z_H_HSTATE]->attr.dim_num; + + memcpy( outputs[GRUCELL_ACT_Z_H_OUT_HSTATE]->attr.size, + inputs[GRUCELL_ACT_Z_H_HSTATE]->attr.size, + inputs[GRUCELL_ACT_Z_H_HSTATE]->attr.dim_num * sizeof(vsi_size_t) ); + } + + return TRUE; +} /* op_setup() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GRUCELL_ACTIVATION_Z_H, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ GRUCELL_ACT_Z_H_IN_CNT, + /* output_num */ GRUCELL_ACT_Z_H_OUT_CNT + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_h_times_activation_r.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_h_times_activation_r.c new file mode 100644 index 0000000..e1e4480 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_h_times_activation_r.c @@ -0,0 +1,124 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _grucell_h_times_activation_r_local_data_t { + int32_t placeholder; +} grucell_h_times_activation_r_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_grucell_h_times_activation_r_param* p = &self->nn_param.grucell_h_times_activation_r; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t* param; + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32(param, "recurrent_activation", p->recurrent_activation); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "grucell_h_times_activation_r", + inputs, _INPUT_NUM, + outputs, _OUTPUT_NUM, + param ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) + { + outputs[0]->attr.dim_num = \ + inputs[0]->attr.dim_num; + + memcpy( outputs[0]->attr.size, + inputs[0]->attr.size, + inputs[0]->attr.dim_num * sizeof(vsi_size_t) ); + } + + return TRUE; +} /* op_setup() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GRUCELL_H_TIMES_ACTIVATION_R, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c index 1814c51..31df29c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c @@ -1022,6 +1022,7 @@ static vsi_bool op_setup_default && (p->local->multi_batch)) { vsi_nn_tensor_t* wei_r2c_tensor = NULL; + vsi_nn_tensor_t* bias_r2c_tensor = NULL; memcpy(&attr, &(inputs[GRUCELL_INPUT_WEIGHT_H2C]->attr), sizeof(attr)); attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; @@ -1036,10 +1037,12 @@ static vsi_bool op_setup_default } wei_r2c_tensor = vsi_nn_ConvertTensorDtype(self->graph, inputs[GRUCELL_INPUT_WEIGHT_H2C], &(attr.dtype)); + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + bias_r2c_tensor = vsi_nn_ConvertTensorDtype(self->graph, inputs[GRUCELL_INPUT_BIAS_H2C], &(attr.dtype)); rh_cand_fc_output = vsi_nn_rnn_create_tp_fc(self, rh_mul_outputs->t, wei_r2c_tensor, - inputs[GRUCELL_INPUT_BIAS_H2C], + bias_r2c_tensor, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2C], use_virtual_tensor); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c index ce7290d..ed652c3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c @@ -130,7 +130,7 @@ static vsi_status op_compute } } - param =vsi_nn_kernel_param_create(); + param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_float32( param, "eps", eps ); vsi_nn_kernel_param_add_int32( param, "reshape_flg", rs_flg ); n = vsi_nn_kernel_selector( self->graph, "instance_norm", @@ -172,8 +172,8 @@ static vsi_status op_optimize /* insert a reshape node before and after 3D instance_norm */ - shape[0] = 1; - shape[1] = inputs[0]->attr.size[0]; + shape[0] = inputs[0]->attr.size[0]; + 
shape[1] = 1; shape[2] = inputs[0]->attr.size[1]; shape[3] = inputs[0]->attr.size[2]; dim = 4; @@ -320,4 +320,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c index 055dbd9..2df9bc2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c @@ -32,6 +32,7 @@ #include "utils/vsi_nn_math.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" +#include "kernel/vsi_nn_kernel.h" static vsi_status op_compute ( @@ -41,81 +42,24 @@ static vsi_status op_compute ) { vsi_status status = VX_FAILURE; -#ifdef VX_L2NORM_AXIS_PARAMETER_SUPPORT - vx_nn_l2norm_params_t param; + int32_t axis = self->nn_param.l2_normalize.axis; + vsi_nn_kernel_param_t * param = NULL; - param.axis = self->nn_param.l2_normalize.axis; + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "axis", axis ); - self->n = vxL2NormalizeLayer2( - self->graph->g, - inputs[0]->t, - ¶m, - sizeof(vx_nn_l2norm_params_t), - outputs[0]->t - ); - - if( NULL != self->n ) - { - status = VSI_SUCCESS; - } -#else - vsi_nn_l2_normalize_param * p; - int32_t axis = -1; - uint32_t i = 0; - uint32_t sizes[VSI_NN_MAX_DIM_NUM] = {1}; - uint32_t innerSize = 1; - uint32_t outerSize = 1; - uint32_t axisSize = 1; - vx_tensor vx_input = NULL; - vx_tensor vx_output = NULL; - vx_tensor input = inputs[0]->t; - vx_tensor output = outputs[0]->t; - - status = VSI_FAILURE; - - p = &(self->nn_param.l2_normalize); - axis = p->axis; - - if (axis != 2) - { - axisSize = inputs[0]->attr.size[axis]; - - for (i = 0; i < (uint32_t)axis; i++) - { - innerSize *= inputs[0]->attr.size[i]; - } - - for (i = (uint32_t)(axis + 1); i < inputs[0]->attr.dim_num; i++) - { - outerSize *= inputs[0]->attr.size[i]; - } - - sizes[0] = innerSize; - sizes[1] = 1; - sizes[2] = axisSize; - sizes[3] = outerSize; - - vx_input = vxReshapeTensor(inputs[0]->t, (int32_t *)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); - vx_output = vxReshapeTensor(outputs[0]->t, (int32_t *)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); - - input = vx_input; - output = vx_output; - } - - self->n = vxL2NormalizeLayer( - self->graph->g, - input, - output - ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "l2_norm", + inputs, 1, + outputs, 1, param );; if( NULL != self->n ) { status = VSI_SUCCESS; } - if (vx_input) vxReleaseTensor(&vx_input); - if (vx_output) vxReleaseTensor(&vx_output); -#endif + vsi_nn_kernel_param_release( ¶m ); + return status; } /* op_compute() */ @@ -189,4 +133,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c b/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c index 50bdef0..69e27a1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include @@ -36,7 +35,6 @@ #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" - #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) @@ -47,23 +45,23 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_param_t * param; + vsi_nn_kernel_node_t n; - self->n = vxActivationLayer( - self->graph->g, - inputs[0]->t, - VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LINEAR, - self->nn_param.linear.a, - 
self->nn_param.linear.b, - outputs[0]->t - ); + param = vsi_nn_kernel_param_create(); - if( NULL != self->n ) + vsi_nn_kernel_param_add_float32( param, "a_v", self->nn_param.linear.a ); + vsi_nn_kernel_param_add_float32( param, "b_v", self->nn_param.linear.b ); + + n = vsi_nn_kernel_selector( self->graph, "linear", inputs, 1, outputs, 1, param ); + if( n == NULL ) { - status = VSI_SUCCESS; + status = VSI_FAILURE; } + vsi_nn_kernel_param_release( ¶m ); + return status; } /* op_compute() */ @@ -103,7 +101,6 @@ static vsi_bool op_check return TRUE; } /* op_check() */ - __BEGIN_DECLS /* Registrar */ @@ -121,4 +118,3 @@ DEF_OP_REG ); __END_DECLS - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lrn2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lrn2.c index ee65331..fd3e610 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lrn2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lrn2.c @@ -86,13 +86,10 @@ static vsi_status op_compute if(outerSize < MAX_BATCH_COUNT) { -#ifdef VSI_40BIT_VA_SUPPORT - vx_input = vxReshapeTensor(inputs[0]->t, sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); - vx_output = vxReshapeTensor(outputs[0]->t, sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); -#else - vx_input = vxReshapeTensor(inputs[0]->t, (int32_t*)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); - vx_output = vxReshapeTensor(outputs[0]->t, (int32_t*)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); -#endif + vx_input = vsi_nn_safe_reshape_tensor(inputs[0]->t, + (void*)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4), sizeof(sizes[0])); + vx_output = vsi_nn_safe_reshape_tensor(outputs[0]->t, + (void*)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4), sizeof(sizes[0])); input = vx_input; output = vx_output; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c index 3db70e8..a6c5c63 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c @@ -98,7 +98,6 @@ static vsi_status op_compute } return status; - } /* op_compute() */ static vsi_bool op_check @@ -136,7 +135,14 @@ static vsi_bool op_setup p->is_cifg = inputs[LSTMUNIT_ACT_INPUT_FC_I] == NULL; p->is_projection = outputs[LSTMUNIT_ACT_HSTATE_OUT] == NULL; - p->is_layer_norm = inputs[LSTMUNIT_ACT_LN_WF] != NULL; + if (self->graph->ctx->config.support_stream_processor) + { + p->is_layer_norm = inputs[LSTMUNIT_ACT_HSTATE_FC_F] == NULL; + } + else + { + p->is_layer_norm = inputs[LSTMUNIT_ACT_LN_WF] != NULL; + } p->is_hybrid = p->is_layer_norm ? 0 : inputs[LSTMUNIT_ACT_DATA_BF] != NULL; p->recurrent_activation = p->recurrent_activation == VSI_NN_ACT_NONE ? 
VSI_NN_ACT_SIGMOID : p->recurrent_activation; @@ -221,7 +227,6 @@ static vsi_bool op_setup } return TRUE; - } /* op_setup() */ static vsi_status op_deinit @@ -229,7 +234,6 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - vsi_status status = VSI_SUCCESS; int32_t i = 0; @@ -249,7 +253,6 @@ static vsi_status op_deinit } return status; - } /* op_deinit() */ static vsi_status op_init @@ -257,13 +260,11 @@ static vsi_status op_init vsi_nn_node_t * self ) { - vsi_status status = VSI_SUCCESS; self->nn_param.lstmunit_activation.recurrent_activation = VSI_NN_ACT_SIGMOID; return status; - } /* op_init() */ #ifdef __cpluplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c index d5d5123..5433281 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c @@ -41,7 +41,6 @@ #include "vsi_nn_rnn_helper.h" #include "utils/vsi_nn_dtype_util.h" - static vsi_nn_internal_tensor_t* create_tp_fc ( vsi_nn_node_t * self, @@ -54,18 +53,13 @@ static vsi_nn_internal_tensor_t* create_tp_fc { vsi_nn_lstmunit_ovxlib_param* p = &self->nn_param.lstmunit_ovxlib; vsi_nn_tensor_attr_t attr; - vsi_nn_tensor_t* tensor = NULL; - vsi_nn_internal_tensor_t* tensor1 = NULL; vsi_nn_internal_tensor_t* tensor2 = NULL; vsi_nn_internal_node_t* tmp_inode = NULL; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - tensor = bias; if( !bias || p->local->use_layer_norm || p->local->use_hybrid ) { - /* create zero bias for NN/TP */ - tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr, VSI_NN_OP_FCL, FALSE); - tensor = tensor1->t; + bias = NULL; } vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); @@ -77,7 +71,7 @@ static vsi_nn_internal_tensor_t* create_tp_fc tmp_inode->inputs[0] = input; tmp_inode->inputs[1] = weight; - tmp_inode->inputs[2] = tensor; + tmp_inode->inputs[2] = bias; tmp_inode->outputs[0] = tensor2->t; vsi_nn_internal_setup_node(self, tmp_inode); @@ -98,21 +92,15 @@ static vsi_nn_internal_tensor_t* create_nn_fc { vsi_nn_lstmunit_ovxlib_param* p = &self->nn_param.lstmunit_ovxlib; vsi_nn_tensor_attr_t attr; - vsi_nn_tensor_t* tensor = NULL; - vsi_nn_internal_tensor_t* tensor1 = NULL; vsi_nn_internal_tensor_t* tensor2 = NULL; vsi_nn_internal_tensor_t* reshaped_weight_tensor = NULL; vsi_size_t reshaped_weight_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_nn_internal_node_t* tmp_inode = NULL; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - tensor = bias; if( !bias || p->local->use_layer_norm || p->local->use_hybrid ) { - /* create zero bias for NN/TP */ - tensor1 = vsi_nn_internal_create_zero_bias_tensor( - self, &input->attr, &weight->attr, VSI_NN_OP_FCL, FALSE); - tensor = tensor1->t; + bias = NULL; } vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); @@ -149,7 +137,7 @@ static vsi_nn_internal_tensor_t* create_nn_fc tmp_inode->inputs[0] = input; tmp_inode->inputs[1] = reshaped_weight_tensor->t; - tmp_inode->inputs[2] = tensor; + tmp_inode->inputs[2] = bias; tmp_inode->outputs[0] = tensor2->t; vsi_nn_internal_setup_node(self, tmp_inode); @@ -284,6 +272,7 @@ static vsi_bool op_setup vsi_nn_tensor_attr_t attr; vsi_bool is_input_fc_on_tp = FALSE; vsi_bool is_recurrent_fc_on_tp = FALSE; + vsi_nn_internal_tensor_t* add_tensor = NULL; vsi_nn_internal_tensor_t* input_tensor = NULL; vsi_nn_internal_tensor_t* output_tensor = NULL; vsi_nn_internal_tensor_t* recurrent_input_tensor = NULL; @@ -509,23 +498,54 @@ 
static vsi_bool op_setup { for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++ ) { - memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); - attr.dim_num = VSI_NN_DIM_AUTO; - attr.vtl = use_virtual_tensor; - attr.is_const = FALSE; - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; - input_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + if (self->graph->ctx->config.support_stream_processor) + { + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + add_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + /* create internal nodes */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + curr->inputs[0] = input_fc_outputs[i]->t; + curr->inputs[1] = recurrent_fc_outputs[i]->t; + curr->outputs[0] = add_tensor->t; + vsi_nn_internal_setup_node(self, curr); - /* create internal nodes */ - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_TENSOR_ADD_MEAN_STDDEV_NORM, 0, 0 ); - curr->node->nn_param.tensor_add_mean_stddev_norm.eps = (float)1e-8; - curr->inputs[0] = input_fc_outputs[i]->t; - curr->inputs[1] = recurrent_fc_outputs[i]->t; - curr->outputs[0] = input_tensor->t; - vsi_nn_internal_setup_node(self, curr); + /* create internal nodes */ + input_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LAYER_NORM, 0, 0 ); + curr->node->nn_param.layernorm.eps = (float)1e-8; + curr->inputs[0] = add_tensor->t; + curr->inputs[1] = inputs[LSTMUNIT_INPUT_BIAS_I + i]; + curr->inputs[2] = inputs[LSTMUNIT_INPUT_LAYERNORM_I + i]; + curr->outputs[0] = input_tensor->t; + vsi_nn_internal_setup_node(self, curr); - layernorm_outputs[i] = input_tensor; + layernorm_outputs[i] = input_tensor; + } + else + { + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + input_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + + /* create internal nodes */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_TENSOR_ADD_MEAN_STDDEV_NORM, 0, 0 ); + curr->node->nn_param.tensor_add_mean_stddev_norm.eps = (float)1e-8; + curr->inputs[0] = input_fc_outputs[i]->t; + curr->inputs[1] = recurrent_fc_outputs[i]->t; + curr->outputs[0] = input_tensor->t; + vsi_nn_internal_setup_node(self, curr); + + layernorm_outputs[i] = input_tensor; + } } } @@ -544,7 +564,8 @@ static vsi_bool op_setup curr->inputs[LSTMUNIT_ACT_CSTATE_IN] = inputs[LSTMUNIT_INPUT_C_STATE]; for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++ ) { - if( p->local->use_layer_norm || p->local->use_hybrid ) + if( (p->local->use_layer_norm && !self->graph->ctx->config.support_stream_processor) || + p->local->use_hybrid ) { curr->inputs[LSTMUNIT_ACT_DATA_BI + i] = inputs[LSTMUNIT_INPUT_BIAS_I + i]; } @@ -552,7 +573,14 @@ static vsi_bool op_setup if( p->local->use_layer_norm ) { /* Pass layernorm weights to VSI_NN_OP_LSTMUNIT_ACTIVATION */ - curr->inputs[LSTMUNIT_ACT_LN_WI + i] = inputs[LSTMUNIT_INPUT_LAYERNORM_I + i]; + if (self->graph->ctx->config.support_stream_processor) + { + curr->inputs[LSTMUNIT_ACT_LN_WI + i] = NULL; + } + else + { + curr->inputs[LSTMUNIT_ACT_LN_WI + i] = inputs[LSTMUNIT_INPUT_LAYERNORM_I + i]; + } 
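/* Editorial aside (not part of the patch): on stream-processor targets the fused
 * VSI_NN_OP_TENSOR_ADD_MEAN_STDDEV_NORM used elsewhere is split into a plain ADD
 * followed by VSI_NN_OP_LAYER_NORM, with the gate bias and the layer-norm weight
 * passed as extra inputs and eps = 1e-8, as the hunk above shows. The standalone
 * reference below is only a sketch of the per-gate arithmetic that pair is
 * expected to produce, assuming the usual layer-norm semantics (layer-norm
 * weight as scale, gate bias as shift); the function name is illustrative and
 * is not an API of this library. */
#include <math.h>
static void lstm_gate_layernorm_ref(const float *in_fc, const float *rec_fc,
                                    const float *scale, const float *bias,
                                    float *out, int n)
{
    int i;
    float mean = 0.0f, var = 0.0f;
    const float eps = 1e-8f;
    /* 1. elementwise ADD of the input FC and recurrent FC results */
    for (i = 0; i < n; i++) {
        out[i] = in_fc[i] + rec_fc[i];
        mean += out[i];
    }
    mean /= (float)n;
    /* 2. normalize to zero mean / unit variance over the gate vector */
    for (i = 0; i < n; i++) {
        float d = out[i] - mean;
        var += d * d;
    }
    var /= (float)n;
    /* 3. scale and shift with the layer-norm weight and gate bias */
    for (i = 0; i < n; i++) {
        out[i] = scale[i] * (out[i] - mean) / sqrtf(var + eps) + bias[i];
    }
}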
curr->inputs[LSTMUNIT_ACT_INPUT_FC_I + i] = layernorm_outputs[i]->t; curr->inputs[LSTMUNIT_ACT_HSTATE_FC_I + i] = NULL; } @@ -644,17 +672,7 @@ static vsi_bool op_setup curr->inputs[2] = zero_bias_tensor; /* Save output to h_state first and copy to output */ - if( p->local->use_hybrid && p->local->use_projection_bias ) - { - vsi_nn_internal_init_tensor_attr(&attr, - &outputs[LSTMUNIT_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); - output_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); - curr->outputs[0] = output_tensor->t; - } - else - { - curr->outputs[0] = outputs[LSTMUNIT_OUTPUT_H_STATE]; - } + curr->outputs[0] = outputs[LSTMUNIT_OUTPUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c index eaeaaa5..fcf29f4 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c @@ -30,6 +30,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" @@ -51,6 +52,13 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; + vsi_nn_tensor_t * tmp_inputs[2] = {NULL}; + vsi_nn_tensor_t * tmp_outputs[1] = {NULL}; + vsi_nn_tensor_t * rs_input = NULL; + vsi_nn_tensor_t * rs_output = NULL; + vsi_size_t shape_in[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + vsi_size_t shape_out[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + uint32_t i = 0; int32_t transposeA = self->nn_param.matrixmul.transpose[0]; int32_t transposeB = self->nn_param.matrixmul.transpose[1]; @@ -64,7 +72,47 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "adjointA", adjointA ); vsi_nn_kernel_param_add_int32( param, "adjointB", adjointB ); - n = vsi_nn_kernel_selector( self->graph, "matrixmul", inputs, 2, outputs, 1, param ); + if (inputs[0]->attr.dim_num == 1 && inputs[1]->attr.dim_num > 1) + { + shape_in[0] = inputs[0]->attr.size[0]; + shape_in[1] = 1; + shape_out[0] = outputs[0]->attr.size[0]; + shape_out[1] = 1; + for(i = 2; i <= outputs[0]->attr.dim_num; i++) + { + shape_out[i] = outputs[0]->attr.size[i - 1]; + } + rs_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape_in, 2); + rs_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape_out, outputs[0]->attr.dim_num + 1); + tmp_inputs[0] = rs_input; + tmp_inputs[1] = inputs[1]; + tmp_outputs[0] = rs_output; + } + else if (inputs[1]->attr.dim_num == 1 && inputs[0]->attr.dim_num > 1) + { + shape_in[0] = 1; + shape_in[1] = inputs[1]->attr.size[0]; + + shape_out[0] = 1; + for(i = 1; i <= outputs[0]->attr.dim_num; i++) + { + shape_out[i] = outputs[0]->attr.size[i - 1]; + } + rs_input = vsi_nn_reshape_tensor(self->graph, inputs[1], shape_in, 2); + rs_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape_out, outputs[0]->attr.dim_num + 1); + + tmp_inputs[0] = inputs[0]; + tmp_inputs[1] = rs_input; + tmp_outputs[0] = rs_output; + } + else + { + tmp_inputs[0] = inputs[0]; + tmp_inputs[1] = inputs[1]; + tmp_outputs[0] = outputs[0]; + } + + n = vsi_nn_kernel_selector( self->graph, "matrixmul", tmp_inputs, 2, tmp_outputs, 1, param ); if ( n != NULL ) { self->n = (vx_node)n; @@ -76,6 +124,15 @@ static vsi_status op_compute vsi_nn_kernel_param_release( ¶m ); } + if (rs_input != NULL) + { + vsi_nn_ReleaseTensor( &rs_input ); + } + if (rs_output != NULL) + { + 
vsi_nn_ReleaseTensor( &rs_output ); + } + return status; } /* op_compute() */ @@ -126,23 +183,32 @@ static vsi_bool op_check return FALSE; } - if (self->nn_param.matrixmul.transpose[0] == FALSE + if ((inputs[0]->attr.dim_num == 1 || inputs[1]->attr.dim_num == 1) + && (self->nn_param.matrixmul.transpose[0] == TRUE || self->nn_param.matrixmul.transpose[1] == TRUE)) + { + VSILOGE("Transpose parameters should be all false when input tensor is 1D"); + return FALSE; + } + else if (self->nn_param.matrixmul.transpose[0] == FALSE && self->nn_param.matrixmul.transpose[1] == FALSE - && inputs[0]->attr.size[0] != inputs[1]->attr.size[1]) + && inputs[0]->attr.size[0] != inputs[1]->attr.size[1] + && inputs[0]->attr.dim_num > 1 && inputs[1]->attr.dim_num > 1) { VSILOGE("1st input tensor's size[0] is not equal to 2nd input tensor's size[1]"); return FALSE; } else if (self->nn_param.matrixmul.transpose[0] == TRUE && self->nn_param.matrixmul.transpose[1] == FALSE - && inputs[0]->attr.size[1] != inputs[1]->attr.size[1]) + && inputs[0]->attr.size[1] != inputs[1]->attr.size[1] + && inputs[0]->attr.dim_num > 1 && inputs[1]->attr.dim_num > 1) { VSILOGE("1st input tensor's size[1] is not equal to 2nd input tensor's size[1]"); return FALSE; } else if (self->nn_param.matrixmul.transpose[0] == FALSE && self->nn_param.matrixmul.transpose[1] == TRUE - && inputs[0]->attr.size[0] != inputs[1]->attr.size[0]) + && inputs[0]->attr.size[0] != inputs[1]->attr.size[0] + && inputs[0]->attr.dim_num > 1 && inputs[1]->attr.dim_num > 1) { VSILOGE("1st input tensor's size[0] is not equal to 2nd input tensor's size[0]"); return FALSE; @@ -195,7 +261,25 @@ static vsi_bool op_setup return FALSE; } - if (inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) + if (inputs[0]->attr.dim_num == 1 && inputs[1]->attr.dim_num > 1) + { + outputs[0]->attr.dim_num = inputs[1]->attr.dim_num - 1; + outputs[0]->attr.size[0] = inputs[1]->attr.size[0]; + for (i = 1; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[1]->attr.size[i + 1]; + } + } + else if (inputs[1]->attr.dim_num == 1 && inputs[0]->attr.dim_num > 1) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num - 1; + + for (i = 0; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i + 1]; + } + } + else if (inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) { for (i = 2; i < inputs[0]->attr.dim_num; i++) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c index 2ae7605..a8d7b30 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c @@ -35,6 +35,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_math.h" #include "utils/vsi_nn_constraint_check.h" +#include "utils/vsi_nn_dtype_util.h" vsi_status vsi_nn_InitPadParameter ( @@ -127,6 +128,7 @@ static vsi_status op_compute { vsi_status status; vx_nn_pad_params_t p; + vsi_nn_tensor_t *convert_tensor = NULL; status = VSI_FAILURE; if(VSI_SUCCESS != vsi_nn_InitPadParameter(self, &p)) @@ -135,20 +137,43 @@ static vsi_status op_compute return VSI_FAILURE; } + if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) + { + vsi_nn_tensor_attr_t attr; + memcpy( &attr, &outputs[0]->attr, sizeof( attr ) ); + memcpy( &attr.size, &inputs[0]->attr.size, sizeof( attr.size ) ); + attr.vtl = FALSE; + attr.is_const = FALSE; + + convert_tensor = vsi_nn_CreateTensor(self->graph, &attr); + + self->n = vxTensorCopyNode( + self->graph->g, + inputs[0]->t, + convert_tensor->t 
+ ); + } + else + { + convert_tensor = vsi_nn_reshape_tensor( self->graph, + inputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num ); + } self->n = vxTensorPadNode( self->graph->g, - inputs[0]->t, + convert_tensor->t, outputs[0]->t, &p, sizeof(p) ); vsi_nn_DeinitPadParameter(&p); + vsi_safe_release_tensor(convert_tensor); if( NULL != self->n ) { status = VSI_SUCCESS; } + return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c index a5ce4f9..f1386c7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c @@ -254,13 +254,8 @@ static vsi_status op_optimize { if(NULL == inputs[0]->t && NULL != outputs[0]->t) { -#ifdef VSI_40BIT_VA_SUPPORT - inputs[0]->t = vxReshapeTensor( outputs[0]->t, - inputs[0]->attr.size, inputs[0]->attr.dim_num ); -#else - inputs[0]->t = vxReshapeTensor( outputs[0]->t, - (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num ); -#endif + inputs[0]->t = vsi_nn_safe_reshape_tensor( outputs[0]->t, + (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num, sizeof(inputs[0]->attr.size[0]) ); if( inputs[0]->t == NULL ) { status = VSI_FAILURE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c index a198d32..18942fa 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c @@ -151,9 +151,9 @@ static vsi_bool op_setup if (self->nn_param.post_process.local.enable_data_conv == FALSE && self->nn_param.post_process.local.enable_perm == FALSE) { - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); - curr->node->nn_param.reshape.size = outputs[0]->attr.size; - curr->node->nn_param.reshape.dim_num = outputs[0]->attr.dim_num; + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + curr->node->nn_param.reshape2.size = outputs[0]->attr.size; + curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; curr->inputs[0] = inputs[POST_PROCESS_INPUT]; curr->outputs[0] = outputs[POST_PROCESS_OUTPUT]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c index 9f0a995..6a955a5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -315,7 +315,7 @@ static vsi_bool op_setup memcpy(&attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t)); for(i = 0; i < p->output_attr.dim_num; i++) { - attr.size[i] = (vsi_size_t)p->output_attr.size[i]; + attr.size[i] = -1 == p->output_attr.size[i] ? 
-1 : (vsi_size_t)p->output_attr.size[i]; } attr.size[axis] = 1; attr.vtl = TRUE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c index 31818ee..d264ee7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c @@ -51,7 +51,7 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; - param =vsi_nn_kernel_param_create(); + param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_gray.local.scale_x ); vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_gray.local.scale_y ); @@ -60,6 +60,9 @@ static vsi_status op_compute vsi_nn_kernel_param_add_float32( param, "mean", self->nn_param.pre_process_gray.mean ); vsi_nn_kernel_param_add_float32( param, "scale", self->nn_param.pre_process_gray.scale ); vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_gray.local.enable_copy ); + vsi_nn_kernel_param_add_int32( param, "width", self->nn_param.pre_process_gray.rect.width ); + vsi_nn_kernel_param_add_int32( param, "height", self->nn_param.pre_process_gray.rect.height ); + n = vsi_nn_kernel_selector( self->graph, "pre_process_gray", inputs, 1, outputs, 1, param ); if( n != NULL ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c index ba50f33..b4220a7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c @@ -151,9 +151,9 @@ static vsi_bool op_setup if (self->nn_param.pre_process_tensor.local.enable_data_conv == FALSE && self->nn_param.pre_process_tensor.local.enable_perm == FALSE) { - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); - curr->node->nn_param.reshape.size = outputs[0]->attr.size; - curr->node->nn_param.reshape.dim_num = outputs[0]->attr.dim_num; + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + curr->node->nn_param.reshape2.size = outputs[0]->attr.size; + curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; curr->inputs[0] = inputs[PRE_PROCESS_TENSOR_INPUT]; curr->outputs[0] = outputs[PRE_PROCESS_TENSOR_OUTPUT]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c index 3ef8224..b66a5cf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c @@ -292,19 +292,7 @@ static vsi_status op_optimize { size[2] = outputs[0]->attr.size[0]; size[3] = outputs[0]->attr.size[1]; -#ifdef VSI_40BIT_VA_SUPPORT - rois_tmp = vxReshapeTensor(outputs[0]->t, size, dim); -#else - { - vsi_size_t i; - int32_t size_32bit[VSI_NN_MAX_DIM_NUM]; - for(i = 0; i< VSI_NN_MAX_DIM_NUM; i++) - { - size_32bit[i] = (int32_t)size[i]; - } - rois_tmp = vxReshapeTensor(outputs[0]->t, size_32bit, dim); - } -#endif + rois_tmp = vsi_nn_safe_reshape_tensor(outputs[0]->t, (void*)size, (vsi_size_t)dim, sizeof(size[0])); if(NULL == rois_tmp) { goto error; @@ -317,19 +305,7 @@ static vsi_status op_optimize { size[2] = outputs[1]->attr.size[0]; size[3] = outputs[1]->attr.size[1]; -#ifdef VSI_40BIT_VA_SUPPORT - score_tmp = vxReshapeTensor(outputs[1]->t, size, dim); -#else - { - vsi_size_t i; - int32_t size_32bit[VSI_NN_MAX_DIM_NUM]; - for(i = 0; i< VSI_NN_MAX_DIM_NUM; i++) - 
{ - size_32bit[i] = (int32_t)size[i]; - } - score_tmp = vxReshapeTensor(outputs[1]->t, size_32bit, dim); - } -#endif + score_tmp = vsi_nn_safe_reshape_tensor(outputs[1]->t, (void*)size, (vsi_size_t)dim, sizeof(size[0])); if(NULL == score_tmp) { goto error; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c index daeb768..d4629ec 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c @@ -33,6 +33,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" @@ -47,18 +48,29 @@ static vsi_status op_compute *If reshape is un-initialized, we need add a tensorcopy * when input and output are initialized. */ - if(inputs[0]->t != NULL && outputs[0]->t != NULL && + if (inputs[0]->t != NULL && outputs[0]->t != NULL && self->nn_param.reshape.local.initialized == FALSE) { + vsi_status status = VSI_SUCCESS; + vsi_nn_tensor_t *tmp_tensor = NULL; + + tmp_tensor = vsi_nn_reshape_tensor( self->graph, + outputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num ); + self->n = vxTensorCopyNode(self->graph->g, - inputs[0]->t, outputs[0]->t); - if(NULL == self->n) + inputs[0]->t, tmp_tensor->t); + if (NULL == self->n) { VSILOGE( "Create vxTensorCopyNode fail." ); - return VSI_FAILURE; + status = VSI_FAILURE; } VSILOGD("Create a copy node for reshape"); + + vsi_safe_release_tensor(tmp_tensor); + + return status; } + return VSI_SUCCESS; } /* op_compute() */ @@ -84,8 +96,11 @@ static vsi_bool op_setup if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; - memcpy(shape, self->nn_param.reshape.size, - sizeof(vsi_size_t) * self->nn_param.reshape.dim_num); + uint32_t i = 0; + for(i = 0; i < self->nn_param.reshape.dim_num; i++) + { + shape[i] = -1 == self->nn_param.reshape.size[i] ? -1 : (vsi_size_t)self->nn_param.reshape.size[i]; + } ret = vsi_nn_CalcReshapeTensor(inputs[0], outputs[0], shape, @@ -108,21 +123,23 @@ static vsi_status op_optimize status = VSI_SUCCESS; ret = TRUE; - if(self->nn_param.reshape.local.initialized == FALSE) + + if( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) + { + return status; + } + + if (self->nn_param.reshape.local.initialized == FALSE) { VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); - if( direction == VSI_NN_OPTIMIZE_BACKWARD ) + if ( direction == VSI_NN_OPTIMIZE_BACKWARD ) { - if(NULL == inputs[0]->t && NULL != outputs[0]->t) + if (NULL == inputs[0]->t && NULL != outputs[0]->t) { -#ifdef VSI_40BIT_VA_SUPPORT - inputs[0]->t = vxReshapeTensor( outputs[0]->t, - inputs[0]->attr.size, inputs[0]->attr.dim_num ); -#else - inputs[0]->t = vxReshapeTensor( outputs[0]->t, - (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num ); -#endif - if( inputs[0]->t == NULL ) + inputs[0]->t = vsi_nn_safe_reshape_tensor( outputs[0]->t, + (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num, + sizeof(inputs[0]->attr.size[0]) ); + if ( inputs[0]->t == NULL ) { status = VSI_FAILURE; } @@ -131,11 +148,17 @@ static vsi_status op_optimize } else { - if(NULL == outputs[0]->t) + if (NULL == outputs[0]->t) { + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t i = 0; + for (i = 0; i < self->nn_param.reshape.dim_num; i++) + { + shape[i] = -1 == self->nn_param.reshape.size[i] ? 
-1 : (vsi_size_t)self->nn_param.reshape.size[i]; + } ret = vsi_nn_ReshapeTensor( self->graph, inputs[0], outputs[0], - self->nn_param.reshape.size, self->nn_param.reshape.dim_num ); - if( ret == FALSE ) + shape, self->nn_param.reshape.dim_num ); + if ( ret == FALSE ) { status = VSI_FAILURE; } @@ -166,4 +189,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c new file mode 100644 index 0000000..4132004 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c @@ -0,0 +1,204 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* + *If reshape is un-initialized, we need add a tensorcopy + * when input and output are initialized. + */ + if(inputs[0]->t != NULL && outputs[0]->t != NULL && + self->nn_param.reshape2.local->initialized == FALSE) + { + self->n = vxTensorCopyNode(self->graph->g, + inputs[0]->t, outputs[0]->t); + if(NULL == self->n) + { + VSILOGE( "Create vxTensorCopyNode fail." ); + return VSI_FAILURE; + } + VSILOGD("Create a copy node for reshape"); + } + return VSI_SUCCESS; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + //TODO: Check tensor shapes. 
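/* Editorial aside (not part of the patch): both RESHAPE and the new RESHAPE2
 * keep a -1 entry in the requested shape as an "infer this dimension" sentinel;
 * the `-1 == size[i] ? -1 : (vsi_size_t)size[i]` mapping in vsi_nn_op_reshape.c
 * above preserves that sentinel when widening to vsi_size_t. The helper below is
 * a hypothetical, self-contained illustration of how such a wildcard dimension
 * is conventionally resolved from the input element count; it is not part of
 * this library. */
#include <stddef.h>
static int resolve_wildcard_dim(size_t in_elements, long long *shape, int dims)
{
    size_t known = 1;
    int wildcard = -1, i;
    for (i = 0; i < dims; i++) {
        if (shape[i] == -1) {
            if (wildcard >= 0) return -1;   /* at most one -1 is allowed */
            wildcard = i;
        } else {
            known *= (size_t)shape[i];
        }
    }
    if (wildcard >= 0) {
        if (known == 0 || in_elements % known != 0) return -1;
        shape[wildcard] = (long long)(in_elements / known);
    } else if (known != in_elements) {
        return -1;                          /* shape must cover all elements */
    }
    return 0;
}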
+ return TRUE; +} /* op_check() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + self->nn_param.reshape2.local = + (vsi_nn_reshape2_local_data *)malloc(sizeof(vsi_nn_reshape2_local_data)); + if (NULL == self->nn_param.reshape2.local) + { + return VX_ERROR_NO_MEMORY; + } + memset(self->nn_param.reshape2.local, 0, sizeof(vsi_nn_reshape2_local_data)); + return status; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + if (self->nn_param.reshape2.local != NULL) + { + free(self->nn_param.reshape2.local); + self->nn_param.reshape2.local = NULL; + } + + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + memcpy(shape, self->nn_param.reshape2.size, + sizeof(vsi_size_t) * self->nn_param.reshape2.dim_num); + ret = vsi_nn_CalcReshapeTensor(inputs[0], + outputs[0], + shape, + self->nn_param.reshape2.dim_num); + } + + return ret; +} /* op_setup() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + vsi_status status; + vsi_bool ret; + + status = VSI_SUCCESS; + ret = TRUE; + if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) + { + return status; + } + + if (self->nn_param.reshape2.local->initialized == FALSE) + { + VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); + if ( direction == VSI_NN_OPTIMIZE_BACKWARD ) + { + if (NULL == inputs[0]->t && NULL != outputs[0]->t) + { + inputs[0]->t = vsi_nn_safe_reshape_tensor( outputs[0]->t, + (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num, + sizeof(inputs[0]->attr.size[0]) ); + if ( inputs[0]->t == NULL ) + { + status = VSI_FAILURE; + } + self->nn_param.reshape2.local->initialized = TRUE; + } + } + else + { + if (NULL == outputs[0]->t) + { + ret = vsi_nn_ReshapeTensor( self->graph, inputs[0], outputs[0], + self->nn_param.reshape2.size, self->nn_param.reshape2.dim_num ); + if ( ret == FALSE ) + { + status = VSI_FAILURE; + } + self->nn_param.reshape2.local->initialized = TRUE; + } + } + } + + return status; +} /* op_optimize() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RESHAPE2, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c index 9454c42..ad39a8b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c @@ -81,10 +81,7 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; - if ( ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) - && (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type - || VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize.type)) - || _is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num) ) + if ( self->nn_param.resize.lcl_data->use_internal_node ) { status = vsi_nn_internal_compute_node( self ); } @@ -121,10 +118,7 @@ static 
vsi_status op_optimize vsi_nn_opt_direction_e direction ) { - if ( ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) - && (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type - || VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize.type) ) - || _is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num) ) + if ( self->nn_param.resize.lcl_data->use_internal_node ) { return vsi_nn_internal_optimize_node(self, direction ); } @@ -154,6 +148,7 @@ static vsi_bool op_setup { /* TODO: Add code to comput outputs' shape. */ float factor = self->nn_param.resize.factor; + vsi_enum layout = self->nn_param.resize.layout; vsi_nn_internal_node_t* curr = NULL; if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) @@ -161,26 +156,55 @@ static vsi_bool op_setup outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; if (factor != 0) { - outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor); - outputs[0]->attr.size[1] = (uint32_t)(inputs[0]->attr.size[1] * factor); + if (layout == VSI_NN_RESIZE_LAYOUT_NCHW) + { + outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor); + outputs[0]->attr.size[1] = (uint32_t)(inputs[0]->attr.size[1] * factor); + } + else + { + outputs[0]->attr.size[1] = (uint32_t)(inputs[0]->attr.size[1] * factor); + outputs[0]->attr.size[2] = (uint32_t)(inputs[0]->attr.size[2] * factor); + } } else { - outputs[0]->attr.size[0] = self->nn_param.resize.size[0]; - outputs[0]->attr.size[1] = self->nn_param.resize.size[1]; + if (layout == VSI_NN_RESIZE_LAYOUT_NCHW) + { + outputs[0]->attr.size[0] = self->nn_param.resize.size[0]; + outputs[0]->attr.size[1] = self->nn_param.resize.size[1]; + } + else + { + outputs[0]->attr.size[1] = self->nn_param.resize.size[0]; + outputs[0]->attr.size[2] = self->nn_param.resize.size[1]; + } + } + if (layout == VSI_NN_RESIZE_LAYOUT_NCHW) + { + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + else + { + outputs[0]->attr.size[0] = inputs[0]->attr.size[0]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; } - outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; - outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; } - if ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) - && (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type)) + if ( ( self->nn_param.resize.align_corners || + self->nn_param.resize.half_pixel_centers || + layout == VSI_NN_RESIZE_LAYOUT_NHWC ) + && ( VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type ) ) { + self->nn_param.resize.lcl_data->use_internal_node = TRUE; + vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_INTERNAL, 0, 0 ); curr->node->nn_param.resize_internal.align_corners = self->nn_param.resize.align_corners; curr->node->nn_param.resize_internal.factor = self->nn_param.resize.factor; curr->node->nn_param.resize_internal.half_pixel_centers = self->nn_param.resize.half_pixel_centers; + curr->node->nn_param.resize_internal.layout = self->nn_param.resize.layout; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; vsi_nn_internal_setup_node(self, curr); @@ -188,6 +212,8 @@ static vsi_bool op_setup else if ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) && (VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize.type)) { + self->nn_param.resize.lcl_data->use_internal_node = TRUE; + vsi_nn_internal_init_node_wksp( self ); curr = 
vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_NEAREST_INTERNAL, 0, 0 ); curr->node->nn_param.resize_nearest_internal.align_corners = self->nn_param.resize.align_corners; @@ -199,6 +225,8 @@ static vsi_bool op_setup } else if (_is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num)) { + self->nn_param.resize.lcl_data->use_internal_node = TRUE; + vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); curr->inputs[0] = inputs[0]; @@ -214,14 +242,15 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - if ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) - && (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type - || VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize.type)) + + if (self->nn_param.resize.lcl_data->use_internal_node) { + vsi_nn_safe_free(self->nn_param.resize.lcl_data); vsi_nn_internal_deinit_node_wksp(self); } else { + vsi_nn_safe_free(self->nn_param.resize.lcl_data); vsi_nn_op_common_deinit(self); } @@ -235,12 +264,25 @@ static vsi_status op_init { vsi_status status = VSI_SUCCESS; + self->nn_param.resize.lcl_data = + (vsi_nn_resize_local_data *)malloc( sizeof(vsi_nn_resize_local_data) ); + if( NULL == self->nn_param.resize.lcl_data ) + { + VSILOGE( "Create resize local data fail." ); + status = VSI_FAILURE; + goto final; + } + memset( self->nn_param.resize.lcl_data, 0, sizeof(vsi_nn_resize_local_data) ); + if (vsi_nn_compareVersion(self->graph, 1, 1, 14) == -1) { self->nn_param.resize.align_corners = FALSE; self->nn_param.resize.half_pixel_centers = FALSE; } + self->nn_param.resize.layout = VSI_NN_RESIZE_LAYOUT_NCHW; + +final: return status; } /* op_init() */ @@ -263,4 +305,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c index bd761d0..efa21d6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c @@ -49,10 +49,10 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VSI_FAILURE; int32_t align_corners = self->nn_param.resize_internal.align_corners; int32_t half_pixel_centers = self->nn_param.resize_internal.half_pixel_centers; + vsi_enum layout = self->nn_param.resize_internal.layout; vsi_nn_kernel_param_t * param = NULL; param = vsi_nn_kernel_param_create(); @@ -60,10 +60,20 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "align_corners", align_corners ); vsi_nn_kernel_param_add_int32( param, "half_pixel_centers", half_pixel_centers ); - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, - "resize_bilinear", - &inputs[0], 1, - &outputs[0], 1, param ); + if (layout == VSI_NN_RESIZE_LAYOUT_NCHW) + { + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "resize_bilinear", + &inputs[0], 1, + &outputs[0], 1, param ); + } + else + { + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "resize_bilinear_nhwc", + &inputs[0], 1, + &outputs[0], 1, param ); + } if( self->n ) { @@ -73,7 +83,6 @@ static vsi_status op_compute vsi_nn_kernel_param_release( ¶m ); return status; - } /* op_compute() */ static vsi_bool op_check @@ -113,22 +122,47 @@ static vsi_bool op_setup { /* TODO: Add code to comput outputs' shape. 
*/ float factor = self->nn_param.resize_internal.factor; + vsi_enum layout = self->nn_param.resize_internal.layout; if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; if (factor != 0) { - outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor); - outputs[0]->attr.size[1] = (uint32_t)(inputs[0]->attr.size[1] * factor); + if (layout == VSI_NN_RESIZE_LAYOUT_NCHW) + { + outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor); + outputs[0]->attr.size[1] = (uint32_t)(inputs[0]->attr.size[1] * factor); + } + else + { + outputs[0]->attr.size[1] = (uint32_t)(inputs[0]->attr.size[1] * factor); + outputs[0]->attr.size[2] = (uint32_t)(inputs[0]->attr.size[2] * factor); + } } else { - outputs[0]->attr.size[0] = self->nn_param.resize.size[0]; - outputs[0]->attr.size[1] = self->nn_param.resize.size[1]; + if (layout == VSI_NN_RESIZE_LAYOUT_NCHW) + { + outputs[0]->attr.size[0] = self->nn_param.resize.size[0]; + outputs[0]->attr.size[1] = self->nn_param.resize.size[1]; + } + else + { + outputs[0]->attr.size[1] = self->nn_param.resize.size[0]; + outputs[0]->attr.size[2] = self->nn_param.resize.size[1]; + } + } + if (layout == VSI_NN_RESIZE_LAYOUT_NCHW) + { + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + else + { + outputs[0]->attr.size[0] = inputs[0]->attr.size[0]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; } - outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; - outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; } return TRUE; } /* op_setup() */ @@ -138,12 +172,6 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - if (self->nn_param.resize_internal.lcl_data_ptr) - { - free(self->nn_param.resize_internal.lcl_data_ptr); - self->nn_param.resize_internal.lcl_data_ptr = NULL; - } - vsi_nn_op_common_deinit(self); return VSI_SUCCESS; @@ -157,13 +185,8 @@ static vsi_status op_init { vsi_status status = VSI_SUCCESS; - self->nn_param.resize_internal.lcl_data_ptr = \ - (vsi_nn_resize_in_lcl_data *)malloc(sizeof(vsi_nn_resize_in_lcl_data)); - if (NULL == self->nn_param.resize_internal.lcl_data_ptr) - { - return VX_ERROR_NO_MEMORY; - } - memset(self->nn_param.resize_internal.lcl_data_ptr, 0, sizeof(vsi_nn_resize_in_lcl_data)); + self->nn_param.resize_internal.layout = VSI_NN_RESIZE_LAYOUT_NCHW; + return status; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c index 7c39210..78c3886 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c @@ -31,6 +31,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" @@ -170,19 +171,7 @@ static vsi_status op_optimize { size[2] = inputs[1]->attr.size[0]; size[3] = inputs[1]->attr.size[1]; -#ifdef VSI_40BIT_VA_SUPPORT - rois_tmp = vxReshapeTensor(inputs[1]->t, size, dim); -#else - { - vsi_size_t i; - int32_t size_32bit[VSI_NN_MAX_DIM_NUM]; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - size_32bit[i] = (int32_t)size[i]; - } - rois_tmp = vxReshapeTensor(inputs[1]->t, size_32bit, dim); - } -#endif + rois_tmp = vsi_nn_safe_reshape_tensor(inputs[1]->t, (void*)size, (vsi_size_t)dim, sizeof(size[0])); if(NULL == rois_tmp) { return VSI_FAILURE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c 
b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c index e2897e4..d8c0c8d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c @@ -110,6 +110,7 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP, D_F16) IO_TYPE(D_F16, D_I32, D_F16, D_F16) + IO_TYPE(D_F16, D_I32, D_F16, D_U8|Q_ASYM) IO_TYPE(D_BF16, D_I32, D_BF16, D_BF16) IO_TYPE(D_I32, D_I32, D_I32, D_I32) IO_TYPE(D_U32, D_I32, D_U32, D_U32) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c b/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c index 4d8d7fc..ea5373b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c @@ -55,7 +55,7 @@ static vsi_status op_compute vx_nn_reorg_params_ext2_t param; vsi_nn_tensor_t *block_size_tensor = NULL; vsi_nn_tensor_attr_t attr; - uint8_t data = 1; + int32_t data[2] = {1, 1}; memset(¶m, 0, sizeof(vx_nn_reorg_params_ext2_t)); memset(&attr, 0, sizeof(attr)); @@ -66,9 +66,9 @@ static vsi_status op_compute attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; block_size_tensor = vsi_nn_CreateTensorFromData( self->graph, - &data, + (uint8_t *)data, &attr); - if( NULL == block_size_tensor ) + if ( NULL == block_size_tensor ) { VSILOGE("Create block_size_tensor fail.(shufflechannel)"); return VSI_FAILURE; @@ -87,7 +87,7 @@ static vsi_status op_compute sizeof(vx_nn_reorg_params_ext2_t), outputs[0]->t); - if( NULL != self->n ) + if ( NULL != self->n ) { status = VSI_SUCCESS; } @@ -257,4 +257,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_sigmoid.c b/src/tim/vx/internal/src/ops/vsi_nn_op_sigmoid.c index ff9d84e..09a735a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_sigmoid.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_sigmoid.c @@ -32,7 +32,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" - +#include "kernel/vsi_nn_kernel.h" static vsi_status op_compute ( @@ -41,22 +41,16 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_t n; - self->n = vxActivationLayer( - self->graph->g, - inputs[0]->t, - VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LOGISTIC, - 0, - 0, - outputs[0]->t - ); - - if( NULL != self->n ) + n = vsi_nn_kernel_selector( self->graph, "sigmoid", inputs, 1, outputs, 1, NULL ); + if( n == NULL ) { - status = VSI_SUCCESS; + status = VSI_FAILURE; } + self->n = (vx_node)n; + return status; } /* op_compute() */ @@ -93,4 +87,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c index 9d6d9d5..257f1e2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c @@ -37,27 +37,6 @@ #include "utils/vsi_nn_math.h" #include "utils/vsi_nn_constraint_check.h" -static vsi_bool _is_same_shape - ( - vsi_nn_tensor_t * inputs, - vsi_size_t *sizes, - uint32_t dims - ) -{ - uint32_t i = 0; - - if (inputs->attr.dim_num != dims) - return FALSE; - - for (i = 0; i < dims; i++) - { - if (sizes[i] != inputs->attr.size[i]) - return FALSE; - } - - return TRUE; -} - static vsi_status op_compute ( vsi_nn_node_t * self, @@ -128,88 +107,14 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { - vsi_nn_internal_node_t* curr 
= NULL; - vsi_nn_softmax_param * p; - uint32_t dim_num; - vsi_size_t sizes[VSI_NN_MAX_DIM_NUM] = {1}; - uint32_t i = 0; - int32_t axis = -1; - vsi_nn_tensor_t* new_input = NULL; - vsi_nn_tensor_t* new_output = NULL; - if (VSI_NN_OPTIMIZE_BACKWARD == direction) { return VSI_SUCCESS; } - p = &(self->nn_param.softmax); - axis = p->axis; - if (axis != VSI_NN_SOFTMAX_DEFAULT_AXIS) - { - vsi_size_t innerSize = 1; - vsi_size_t outerSize = 1; - for (i = 0; i < (uint32_t)axis; i++) - { - sizes[i] = inputs[0]->attr.size[i]; - innerSize *= inputs[0]->attr.size[i]; - } - - for (i = (uint32_t)(axis + 1); i < inputs[0]->attr.dim_num; i++) - { - outerSize *= inputs[0]->attr.size[i]; - } - - if (axis == 1) - { - if (sizes[0] == 1) - { - sizes[0] = inputs[0]->attr.size[axis]; - sizes[1] = outerSize; - - dim_num = 2; - } - else - { - sizes[axis] = 1; - sizes[axis + 1] = inputs[0]->attr.size[axis]; - sizes[axis + 2] = outerSize; - - dim_num = 4; - } - } - else if (axis >= 3) - { - sizes[0] = innerSize; - sizes[1] = 1; - sizes[2] = inputs[0]->attr.size[axis]; - sizes[3] = outerSize; - - dim_num = vsi_nn_min(4, inputs[0]->attr.dim_num); - } - else - { - sizes[axis] = inputs[0]->attr.size[axis]; - sizes[axis + 1] = outerSize; - - dim_num = vsi_nn_min((uint32_t)(axis + 2), inputs[0]->attr.dim_num); - } - } - - if (axis != VSI_NN_SOFTMAX_DEFAULT_AXIS && _is_same_shape(inputs[0], sizes, dim_num) == FALSE) - { - new_input = vsi_nn_reshape_tensor(self->graph, inputs[0], sizes, dim_num); - new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], sizes, dim_num); - curr = ((vsi_nn_internal_node_wksp_t *)((self)->internal_node_wksp))->nodes; - curr->inputs[0] = new_input; - curr->outputs[0] = new_output; - p->local.reshaped_input = new_input; - p->local.reshaped_output = new_output; - } - return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ - static vsi_status op_deinit ( vsi_nn_node_t * self @@ -237,10 +142,6 @@ static vsi_status op_init { vsi_status status = VSI_SUCCESS; - if (vsi_nn_compareVersion(self->graph, 1, 1, 7) == -1) - { - self->nn_param.softmax.axis = VSI_NN_SOFTMAX_DEFAULT_AXIS; - } if (self->nn_param.softmax.beta == 0.f) { self->nn_param.softmax.beta = 1.f; @@ -262,6 +163,18 @@ static vsi_bool op_setup return FALSE; } + if (vsi_nn_compareVersion(self->graph, 1, 1, 7) == -1) + { + if (inputs[0]->attr.dim_num < 3) + { + self->nn_param.softmax.axis = 0; + } + else + { + self->nn_param.softmax.axis = 2; + } + } + if (self->nn_param.softmax.axis < 0) self->nn_param.softmax.axis += (int32_t)inputs[0]->attr.dim_num; @@ -276,6 +189,7 @@ static vsi_bool op_setup curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; curr->node->nn_param.softmax_internal.beta = self->nn_param.softmax.beta; + curr->node->nn_param.softmax_internal.axis = self->nn_param.softmax.axis; vsi_nn_internal_setup_node(self, curr); return TRUE; @@ -300,4 +214,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c index ce0b2e4..7d0824d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c @@ -180,21 +180,23 @@ static vsi_status op_optimize in_view_tensor = NULL; out_view_tensor = NULL; status = VSI_SUCCESS; - if(direction == VSI_NN_OPTIMIZE_BACKWARD) + if (direction == VSI_NN_OPTIMIZE_BACKWARD) { return status; } - if(_need_split_softmax(self, inputs) == FALSE) + if ( _need_split_softmax(self, inputs) == FALSE || 
+ self->nn_param.softmax_internal.axis != 0 || + self->graph->ctx->config.support_stream_processor ) { return status; } VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); - if( NULL == inputs[0]->t ) + if ( NULL == inputs[0]->t ) { vsi_nn_TensorReinit( self->graph, inputs[0] ); } - if( NULL == outputs[0]->t ) + if ( NULL == outputs[0]->t ) { vsi_nn_TensorReinit( self->graph, outputs[0] ); } @@ -208,11 +210,11 @@ static vsi_status op_optimize end[2] = inputs[0]->attr.size[2]; end[3] = inputs[0]->attr.size[3]; end[axis] = 0; - while(end[axis] < batch_size) + while (end[axis] < batch_size) { start[axis] = end[axis]; end[axis] += MAX_SOFTMAX_BATCH; - if(end[axis] > inputs[0]->attr.size[axis]) + if (end[axis] > inputs[0]->attr.size[axis]) { end[axis] = inputs[0]->attr.size[axis]; } @@ -224,14 +226,14 @@ static vsi_status op_optimize break; } out_view_tensor = vsi_nn_CreateViewTensor(self->graph, start, end, outputs[0]); - if(NULL == out_view_tensor) + if (NULL == out_view_tensor) { VSILOGE( "Create outputs view tensor fail."); break; } status = _create_split_softmax(self, in_view_tensor, out_view_tensor); - if(VSI_SUCCESS != status) + if (VSI_SUCCESS != status) { VSILOGE( "Create split softmax data struct fail."); break; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_square.c b/src/tim/vx/internal/src/ops/vsi_nn_op_square.c index 1e0144c..86d46dd 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_square.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_square.c @@ -31,9 +31,8 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" - +#include "utils/vsi_nn_util.h" static vsi_status op_compute ( @@ -42,14 +41,13 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_t n; - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "square", - inputs, 1, outputs, 1, NULL ); - - if( self->n ) + n = vsi_nn_kernel_selector( self->graph, "square", inputs, 1, outputs, 1, NULL ); + if ( n == NULL ) { - status = VSI_SUCCESS; + status = VSI_FAILURE; } return status; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c index fb4bcf7..250f4f3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c @@ -143,9 +143,9 @@ static vsi_bool op_setup } vsi_nn_internal_init_node_wksp( self ); - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); - curr->node->nn_param.reshape.size = outputs[0]->attr.size; - curr->node->nn_param.reshape.dim_num = outputs[0]->attr.dim_num; + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + curr->node->nn_param.reshape2.size = outputs[0]->attr.size; + curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; vsi_nn_internal_setup_node( self, curr ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c index f743a52..c0a0562 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -255,11 +255,8 @@ static vsi_status copy_tensor_to_view data->src_tensor = src_tensor; if (dst_in->t) { -#ifdef VSI_40BIT_VA_SUPPORT - data->dst_tensor = vxReshapeTensor(dst_in->t, dst_in->attr.size, dst_in->attr.dim_num); -#else - 
data->dst_tensor = vxReshapeTensor(dst_in->t, (int32_t*)dst_in->attr.size, dst_in->attr.dim_num); -#endif + data->dst_tensor = vsi_nn_safe_reshape_tensor(dst_in->t, (void*)dst_in->attr.size, + (vsi_size_t)dst_in->attr.dim_num, sizeof(dst_in->attr.size[0])); } data->is_dataconvert_op = TRUE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tanh.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tanh.c index 850a2d1..a953651 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_tanh.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tanh.c @@ -32,7 +32,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" - +#include "kernel/vsi_nn_kernel.h" static vsi_status op_compute ( @@ -41,22 +41,23 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_param_t * param; + vsi_nn_kernel_node_t n; + param = vsi_nn_kernel_param_create(); - self->n = vxActivationLayer( - self->graph->g, - inputs[0]->t, - VX_CONVOLUTIONAL_NETWORK_ACTIVATION_HYPERBOLIC_TAN, - self->nn_param.tanh.scale_a, - self->nn_param.tanh.scale_b, - outputs[0]->t - ); + vsi_nn_kernel_param_add_float32( param, "scale_a", self->nn_param.tanh.scale_a ); + vsi_nn_kernel_param_add_float32( param, "scale_b", self->nn_param.tanh.scale_b ); - if( NULL != self->n ) + n = vsi_nn_kernel_selector( self->graph, "tanh", inputs, 1, outputs, 1, param ); + if( n == NULL ) { - status = VSI_SUCCESS; + vsi_nn_kernel_param_release( ¶m ); + status = VSI_FAILURE; } + self->n = (vx_node)n; + vsi_nn_kernel_param_release( ¶m ); + return status; } /* op_compute() */ @@ -93,4 +94,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c index f62ac51..676326b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c @@ -168,15 +168,15 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor); input_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_input_size = (vsi_size_t*)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); reshape_input_size[0] = block_size; reshape_input_size[1] = tensor_num; reshape_input_size[2] = block_num; - curr->node->nn_param.reshape.size = reshape_input_size; - curr->node->nn_param.reshape.dim_num = 3; + curr->node->nn_param.reshape2.size = reshape_input_size; + curr->node->nn_param.reshape2.dim_num = 3; curr->inputs[0] = inputs[0]; curr->outputs[0] = input_tensor->t; vsi_nn_internal_setup_node( self, curr ); @@ -208,9 +208,9 @@ static vsi_bool op_setup memcpy(output_size, outputs[i]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); - curr->node->nn_param.reshape.size = output_size; - curr->node->nn_param.reshape.dim_num = outputs[i]->attr.dim_num; + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + curr->node->nn_param.reshape2.size = output_size; + curr->node->nn_param.reshape2.dim_num = outputs[i]->attr.dim_num; curr->inputs[0] = output_tensors[i]->t; curr->outputs[0] = outputs[i]; vsi_nn_internal_setup_node( self, curr ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c b/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c 
index 8879471..d213bb9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c @@ -101,14 +101,11 @@ static vsi_status op_optimize vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype)) { VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); -#ifdef VSI_40BIT_VA_SUPPORT - outputs[0]->t = vxReshapeTensor(inputs[0]->t, outputs[0]->attr.size, outputs[0]->attr.dim_num); -#else - outputs[0]->t = vxReshapeTensor(inputs[0]->t, (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num); -#endif + outputs[0]->t = vsi_nn_safe_reshape_tensor(inputs[0]->t, (void*)outputs[0]->attr.size, + (vsi_size_t)outputs[0]->attr.dim_num, sizeof(outputs[0]->attr.size[0])); if( NULL == outputs[0]->t ) { - VSILOGE("Call vxReshapeTensor fail"); + VSILOGE("Call vsi_nn_safe_reshape_tensor fail"); free(local); local = NULL; return VSI_FAILURE; diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index 2e043b6..9f8ca77 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -111,7 +111,7 @@ static void _try_pack_tensor_data cnt = fwrite( data, (size_t)bytes, 1, s_dfile_hndl ); if( cnt != 1 ) { - VSILOGW( "Write tensor bytes(%"VSI_SIZE_T_SPECIFIER"/%d)", (vsi_size_t)cnt, 1 ); + VSILOGW( "Write tensor bytes(%"SIZE_T_SPECIFIER"/%d)", cnt, 1 ); } if( cnt > 0 ) { @@ -435,6 +435,8 @@ static _op_param_gen_t s_op_gen[] = /* GRU */ NULL, /* GRUCELL */ NULL, /* GRUCELL_ACTIVATION */ NULL, + /* RESHAPE2 */ NULL, + /* CONV3D */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c index 384981b..acca854 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c @@ -156,6 +156,8 @@ static inline void _convert_float_to_bfloat16 return TRUE; \ } +DEF_DTYPE_CONVERT_QUANTIZE( asymmi4, int8_t, vsi_rtne, -8, 7 ) +DEF_DTYPE_CONVERT_QUANTIZE( asymm4, uint8_t, vsi_rtne, 0, 0xF ) DEF_DTYPE_CONVERT_QUANTIZE( symm8, int8_t, vsi_rtne, SCHAR_MIN, SCHAR_MAX ) DEF_DTYPE_CONVERT_QUANTIZE( symm16, int16_t, vsi_rtne, SHRT_MIN, SHRT_MAX ) DEF_DTYPE_CONVERT_QUANTIZE( symm32, int32_t, vsi_rtne, INT_MIN, INT_MAX ) @@ -256,6 +258,12 @@ vsi_bool vsi_nn_dtype_convert_float_to_quantize_asymm { switch( dtype ) { + case I4: + return vsi_nn_dtype_convert_float_to_quantize_asymmi4( + buffer, size, scale, zero_point, (int8_t*)out_buffer ); + case U4: + return vsi_nn_dtype_convert_float_to_quantize_asymm4( + buffer, size, scale, zero_point, (uint8_t*)out_buffer ); case U8: return vsi_nn_dtype_convert_float_to_quantize_asymm8( buffer, size, scale, zero_point, (uint8_t*)out_buffer ); @@ -396,6 +404,12 @@ vsi_bool vsi_nn_dtype_convert_quantize_asymm_to_float { switch( dtype ) { + case I4: + return vsi_nn_dtype_convert_quantize_asymmi4_to_float( + (const int8_t *)buffer, size, scale, zero_point, out_buffer ); + case U4: + return vsi_nn_dtype_convert_quantize_asymm4_to_float( + (const uint8_t *)buffer, size, scale, zero_point, out_buffer ); case U8: return vsi_nn_dtype_convert_quantize_asymm8_to_float( (const uint8_t *)buffer, size, scale, zero_point, out_buffer ); @@ -481,4 +495,3 @@ vsi_bool vsi_nn_dtype_convert_quantize_symm_perchannel_to_float } return TRUE; } /* vsi_nn_dtype_convert_quantize_symm_perchannel_to_float() */ - diff --git 
a/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c index 6144845..3c45846 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c @@ -329,6 +329,23 @@ uint32_t vsi_nn_TypeGetBytes return type_get_bytes( type ); } /* vsi_nn_TypeGetBytes() */ +uint32_t vsi_nn_TypeGetBytesExt + ( + const vsi_nn_type_e type + ) +{ + uint32_t bits_num = 0; + bits_num = vsi_nn_TypeGetBits(type); + if(bits_num < BITS_PER_BYTE) + { + return 1; + } + else + { + return bits_num / BITS_PER_BYTE; + } +} + /* * Deprecated: use vsi_nn_TypeGetBytes() insteatd. */ @@ -340,6 +357,14 @@ uint32_t vsi_nn_GetTypeBytes return type_get_bytes( type ); } /* vsi_nn_GetTypeBytes() */ +uint32_t vsi_nn_TypeGetBits + ( + const vsi_nn_type_e type + ) +{ + return type_get_bits(type); +} /* vsi_nn_GetTypeBits() */ + vsi_bool vsi_nn_QuantCheck ( vsi_nn_tensor_t *input, @@ -386,6 +411,7 @@ vsi_bool vsi_nn_QuantCheck bias->attr.dtype.fl); } break; + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: if (weight->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) { @@ -437,7 +463,8 @@ vsi_bool vsi_nn_DtypeCompare return FALSE; } } - else if(dtype0->qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + else if( dtype0->qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC || + dtype0->qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ) { const float diff = (float)1e-5; if(dtype0->zero_point != dtype1->zero_point) diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c index 4b6fded..b05fdab 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -188,7 +188,6 @@ vsi_size_t vsi_nn_GetStrideSize vsi_size_t * stride ) { - if( NULL == attr || NULL == stride ) { return 0; @@ -207,20 +206,45 @@ vsi_size_t vsi_nn_GetStrideSizeBySize { vsi_size_t total_bytes; vsi_size_t i; + vsi_size_t type_bits; if( NULL == size || NULL == stride ) { return 0; } - - stride[0] = vsi_nn_GetTypeBytes( type ); + type_bits = vsi_nn_TypeGetBits( type); + stride[0] = type_bits / BITS_PER_BYTE; total_bytes = stride[0]; - for( i = 1; i < dim_num; i ++ ) + if( type_bits < BITS_PER_BYTE ) { - stride[i] = size[i - 1] * stride[i - 1]; - total_bytes *= size[i]; + total_bytes = 1; + if( size[0] % (BITS_PER_BYTE / type_bits) == 0 ) + { + stride[1] = size[0] * type_bits / BITS_PER_BYTE; + } + else + { + stride[1] = size[0] * type_bits / BITS_PER_BYTE + 1; + } + + total_bytes *= stride[1]; + for(i = 2; i < dim_num; i++) + { + stride[i] = size[i-1] * stride[i-1]; + total_bytes *= size[i]; + } + total_bytes *= size[1]; } - total_bytes *= size[0]; + else + { + for( i = 1; i < dim_num; i ++ ) + { + stride[i] = size[i - 1] * stride[i - 1]; + total_bytes *= size[i]; + } + total_bytes *= size[0]; + } + for( i = dim_num; i < VSI_NN_MAX_DIM_NUM; i ++ ) { stride[i] = total_bytes; @@ -254,6 +278,8 @@ float vsi_nn_DataAsFloat32 case VSI_NN_TYPE_BOOL8: val = (float)((int8_t*)data)[0]; break; + case VSI_NN_TYPE_INT4: + case VSI_NN_TYPE_INT8: val = (float)((int8_t*)data)[0]; break; @@ -327,7 +353,6 @@ void vsi_nn_UpdateTensorDims } } /* vsi_nn_UpdateTensorDims() */ - vsi_size_t vsi_nn_ComputeFilterSize ( vsi_size_t i_size, @@ -380,6 +405,26 @@ vsi_size_t vsi_nn_compute_filter_shape } } /* vsi_nn_compute_filter_shape() */ +void vsi_nn_compute_padding_per_axis + ( + vsi_size_t in_shape, + vsi_size_t ksize, + uint32_t stride, + uint32_t dilation, + vsi_nn_pad_e 
pad_type, + vsi_size_t out_pad[2] + ) +{ + vsi_size_t out_size; + vsi_size_t total_pads; + if(dilation == 0) dilation = 1; + out_size = vsi_nn_compute_filter_shape(pad_type, in_shape, ksize, stride, dilation); + total_pads = _compute_padding(in_shape, ksize, stride, dilation, out_size); + + out_pad[0] = total_pads / 2; + out_pad[1] = total_pads - out_pad[0]; +} + void vsi_nn_compute_padding ( vsi_size_t * in_shape, @@ -390,8 +435,6 @@ void vsi_nn_compute_padding vsi_size_t * out_pad ) { - vsi_size_t out_w, out_h; - vsi_size_t pad_w, pad_h; uint32_t dilation_w, dilation_h; if (NULL == in_shape || NULL == ksize || NULL == stride || NULL == out_pad) @@ -413,16 +456,48 @@ void vsi_nn_compute_padding dilation_h = dilation[1]; } - out_w = vsi_nn_compute_filter_shape(pad_type, in_shape[0], ksize[0], stride[0], dilation_w); - out_h = vsi_nn_compute_filter_shape(pad_type, in_shape[1], ksize[1], stride[1], dilation_h); - pad_w = _compute_padding(in_shape[0], ksize[0], stride[0], dilation_w, out_w); - pad_h = _compute_padding(in_shape[1], ksize[1], stride[1], dilation_h, out_h); - out_pad[0] = pad_w / 2; - out_pad[1] = pad_w - out_pad[0]; - out_pad[2] = pad_h / 2; - out_pad[3] = pad_h - out_pad[2]; + vsi_nn_compute_padding_per_axis(in_shape[0], ksize[0], stride[0], dilation_w, pad_type, out_pad); + vsi_nn_compute_padding_per_axis(in_shape[1], ksize[1], stride[1], dilation_h, pad_type, out_pad + 2); } /* vsi_nn_compute_padding() */ +void vsi_nn_compute_padding_3d + ( + const vsi_size_t in_shape[3], + const vsi_size_t ksize[3], + const uint32_t stride[3], + const uint32_t dilation[3], + const vsi_nn_pad_e pad_type, + vsi_size_t out_pad[6] + ) +{ + uint32_t dilation_w, dilation_h, dilation_d; + if (NULL == in_shape || NULL == ksize + || NULL == stride || NULL == out_pad) + { + return; + } + if (pad_type == VSI_NN_PAD_AUTO) + { + return; + } + if (NULL == dilation || (dilation[0] == 0 && dilation[1] == 0 && dilation[2] == 0)) + { + dilation_w = 1; + dilation_h = 1; + dilation_d = 1; + } + else + { + dilation_w = dilation[0]; + dilation_h = dilation[1]; + dilation_d = dilation[2]; + } + + vsi_nn_compute_padding_per_axis(in_shape[0], ksize[0], stride[0], dilation_w, pad_type, out_pad); + vsi_nn_compute_padding_per_axis(in_shape[1], ksize[1], stride[1], dilation_h, pad_type, out_pad + 2); + vsi_nn_compute_padding_per_axis(in_shape[2], ksize[2], stride[2], dilation_d, pad_type, out_pad + 4); +} + void vsi_nn_ComputePadWithPadType ( vsi_size_t * in_shape, @@ -792,10 +867,12 @@ void vsi_nn_FormatToString { switch(tensor->attr.dtype.vx_type) { + case VSI_NN_TYPE_INT4:strncpy(buf, "i4 ", buf_sz);break; case VSI_NN_TYPE_INT8:strncpy(buf, "i8 ", buf_sz);break; case VSI_NN_TYPE_INT16:strncpy(buf, "i16", buf_sz);break; case VSI_NN_TYPE_INT32:strncpy(buf, "i32", buf_sz);break; case VSI_NN_TYPE_INT64:strncpy(buf, "i64", buf_sz);break; + case VSI_NN_TYPE_UINT4:strncpy(buf, "u4 ", buf_sz);break; case VSI_NN_TYPE_UINT8:strncpy(buf, "u8 ", buf_sz);break; case VSI_NN_TYPE_UINT16:strncpy(buf, "u16", buf_sz);break; case VSI_NN_TYPE_UINT32:strncpy(buf, "u32", buf_sz);break; @@ -1003,7 +1080,7 @@ vsi_bool vsi_nn_is_same_quant_type( result = TRUE; } break; - + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: if (src->attr.dtype.scale == dst->attr.dtype.scale && src->attr.dtype.zero_point == dst->attr.dtype.zero_point) @@ -1050,3 +1127,220 @@ vsi_bool vsi_nn_is_same_type { return (vsi_nn_is_same_data_type(src, dst) && vsi_nn_is_same_quant_type(src, dst)); } + +vsi_bool vsi_nn_is_broadcast_operaton + 
( + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t * output + ) +{ + vsi_size_t out_rank = output->attr.dim_num; + vsi_size_t i = 0; + + for (i = 0; i < out_rank; i++) + { + size_t j = 0; + vsi_size_t dst_size = output->attr.size[i]; + + for (j = 0; j < input_num; j++) + { + vsi_size_t src_size = i < inputs[j]->attr.dim_num ? inputs[j]->attr.size[i] : 1; + + if (dst_size != src_size) + { + return TRUE; + } + } + } + return FALSE; +} + +float vsi_nn_get_tensor_scale + ( + vsi_nn_tensor_t * tensor + ) +{ + float scale = 1.0f; + + switch (tensor->attr.dtype.qnt_type) + { + case VSI_NN_QNT_TYPE_DFP: + { + int8_t fl = tensor->attr.dtype.fl; + if (fl >= 0) + { + scale = 1.0f / ( (float) ( (int64_t)1 << fl )); + } + else + { + scale = (float) ( (int64_t)1 << -fl ); + } + } + break; + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + scale = tensor->attr.dtype.scale; + break; + default: + break; + } + + return scale; +} + +int32_t vsi_nn_get_tensor_zero_point + ( + vsi_nn_tensor_t * tensor + ) +{ + int32_t zero_point = 0; + + switch (tensor->attr.dtype.qnt_type) + { + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + zero_point = tensor->attr.dtype.zero_point; + break; + default: + break; + } + + return zero_point; +} + +void vsi_nn_get_tensor_clamp_min_max + ( + vsi_nn_tensor_t * input, + float *clampMin, + float *clampMax + ) +{ + float zero_point = (float)vsi_nn_get_tensor_zero_point(input); + vsi_nn_type_e vx_type = input->attr.dtype.vx_type; + + if (vx_type == VSI_NN_TYPE_UINT8) + { + *clampMin = - zero_point; + *clampMax = 255 - zero_point; + } + else if (vx_type == VSI_NN_TYPE_INT8) + { + *clampMin = -128 - zero_point; + *clampMax = 127 - zero_point; + } + else if (vx_type == VSI_NN_TYPE_INT16) + { + *clampMin = -32768 - zero_point; + *clampMax = 32767 - zero_point; + } + else if (vx_type == VSI_NN_TYPE_UINT16) + { + *clampMin = - zero_point; + *clampMax = 65535 - zero_point; + } + else + { + uint32_t f32_min = 0xff800000; + uint32_t f32_max = 0x7f800000; + + *clampMin = *(float*)&f32_min; + *clampMax = *(float*)&f32_max; + } +} + +vsi_status vsi_nn_Pack4bitData + ( + vsi_nn_tensor_t * tensor, + uint8_t * src, + uint8_t * dest + ) +{ + vsi_status status; + uint32_t i = 0, j = 0; + uint8_t high = 0, low = 0; + vsi_size_t src_size; + + status = VSI_SUCCESS; + src_size = vsi_nn_GetElementNum( tensor ); + for( i = 0; i < src_size; i++ ) + { + if( (i+1) % tensor->attr.size[0] == 0) + { + high = 0; + low = src[i]; + } + else + { + high = src[i+1]; + low = src[i]; + i++; + } + dest[j] = (high << 4) | (low & 0xF); + j++; + } + return status; +} /* vsi_nn_Pack4bitData() */ + +vsi_status vsi_nn_Unpack4bitData + ( + vsi_nn_tensor_t * tensor, + uint8_t * src, + uint8_t * dest, + vsi_nn_type_e type + ) +{ + vsi_status status; + uint32_t i = 0, j = 0; + uint8_t high = 0, low = 0; + vsi_size_t stride[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t src_size; + + status = VSI_SUCCESS; + src_size = vsi_nn_GetStrideSize(&tensor->attr, stride); + for( i = 0 ; i < src_size; i++) + { + high = src[i] >> 4; + low = src[i] & 0x0F; + if( type == VSI_NN_TYPE_INT4 ) + { + if( high > 7) + { + high = high | 0xF0; + } + if( low > 7) + { + low = low | 0xF0; + } + } + if( tensor->attr.size[0] % stride[1] == 0 ) + { + if( tensor->attr.size[0] == 1 ) + { + dest[j] = low; + j++; + } + else + { + dest[j] = low; + dest[j+1] = high; + j += 2; + } + } + else + { + if( (i+1) % stride[1] == 0 ) + { + dest[j] = low; + j++; + } + else + { + dest[j] = low; + 
dest[j+1] = high; + j += 2; + } + } + } + return status; +} /* vsi_nn_Unpack4bitData() */ diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c index eb7f494..f453b32 100644 --- a/src/tim/vx/internal/src/vsi_nn_context.c +++ b/src/tim/vx/internal/src/vsi_nn_context.c @@ -34,7 +34,9 @@ static vsi_status query_hardware_caps { vsi_status status = VSI_FAILURE; vx_hardware_caps_params_t param; - +#if VX_STREAM_PROCESSOR_SUPPORT + vx_hardware_caps_params_ext2_t paramExt2; +#endif #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT vx_hardware_caps_params_ext_t paramExt; @@ -51,9 +53,16 @@ static vsi_status query_hardware_caps #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT context->config.subGroupSize = paramExt.subGroupSize; -#if VX_VA40_EXT_SUPPORT +#ifdef VSI_40BIT_VA_SUPPORT context->config.use_40bits_va = paramExt.supportVA40; #endif +#if VX_STREAM_PROCESSOR_SUPPORT + memset(&paramExt2, 0, sizeof(vx_hardware_caps_params_ext2_t)); + status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt2), + sizeof(vx_hardware_caps_params_ext2_t)); + context->config.support_stream_processor = paramExt.supportStreamProcessor; + context->config.sp_exec_count = paramExt2.streamProcessorExecCount; +#endif #endif diff --git a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c index c1551a6..fb17d8b 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c +++ b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c @@ -521,10 +521,27 @@ static vx_tensor _create_const_raw_tensor vx_tensor_create_params_t params; float * scales = NULL; int32_t * zeroPoints = NULL; + vx_size size_vxsize[VSI_NN_MAX_DIM_NUM] = {0}; + vx_uint32 size_u32[VSI_NN_MAX_DIM_NUM] = {0}; + size_t i = 0; memset( &params, 0, sizeof( vx_tensor_create_params_t ) ); params.num_of_dims = attr.dim_num; - params.sizes = attr.size; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_vxsize[i] = -1 == attr.size[i] ? -1 : (vx_size)attr.size[i]; + } + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_u32[i] = -1 == attr.size[i] ? -1 : (vx_uint32)attr.size[i]; + } +#ifdef VSI_40BIT_VA_SUPPORT + params.sizes = size_vxsize; + (void)size_u32; +#else + params.sizes = size_u32; + (void)size_vxsize; +#endif params.data_format = (vsi_enum)attr.dtype.vx_type; params.quant_format = (vsi_enum)attr.dtype.qnt_type; switch( attr.dtype.qnt_type ) @@ -593,20 +610,31 @@ static vx_tensor _create_const_raw_tensor if( data ) { #ifdef VSI_40BIT_VA_SUPPORT - addr = vxCreateTensorAddressing(graph->ctx->c, - attr.size, stride_size, (vsi_size_t)attr.dim_num); + { + vx_size size[_cnt_of_array(attr.size)] = {0}; + vx_size stride_size_vxsize[_cnt_of_array(stride_size)] = {0}; + for(i = 0; i < _cnt_of_array(attr.size); i++) + { + size[i] = -1 == attr.size[i] ? -1 : (vx_size)attr.size[i]; + } + for(i = 0; i < _cnt_of_array(stride_size); i++) + { + stride_size_vxsize[i] = -1 == stride_size[i] ? -1 : (vx_size)stride_size[i]; + } + addr = vxCreateTensorAddressing(graph->ctx->c, + size, stride_size_vxsize, (vx_size)attr.dim_num); + } #else { - vsi_size_t i; uint32_t size_32bit[_cnt_of_array(attr.size)] = {0}; uint32_t stride_size_32bit[_cnt_of_array(stride_size)] = {0}; for(i = 0; i < _cnt_of_array(attr.size); i++) { - size_32bit[i] = (uint32_t)attr.size[i]; + size_32bit[i] = -1 == attr.size[i] ? -1 : (uint32_t)attr.size[i]; } for(i = 0; i < _cnt_of_array(stride_size); i++) { - stride_size_32bit[i] = (uint32_t)stride_size[i]; + stride_size_32bit[i] = -1 == stride_size[i] ?
-1 : (uint32_t)stride_size[i]; } addr = vxCreateTensorAddressing(graph->ctx->c, size_32bit, stride_size_32bit, (vx_uint8)attr.dim_num); diff --git a/src/tim/vx/internal/src/vsi_nn_internal_node.c b/src/tim/vx/internal/src/vsi_nn_internal_node.c index 50b9d62..4962dbc 100644 --- a/src/tim/vx/internal/src/vsi_nn_internal_node.c +++ b/src/tim/vx/internal/src/vsi_nn_internal_node.c @@ -213,6 +213,7 @@ vsi_nn_internal_tensor_t* vsi_nn_internal_create_zero_bias_tensor switch(input_attr->dtype.qnt_type) { + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: scale = input_attr->dtype.scale; break; @@ -233,10 +234,11 @@ vsi_nn_internal_tensor_t* vsi_nn_internal_create_zero_bias_tensor switch(weight_attr->dtype.qnt_type) { + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: attr.dtype.scale = weight_attr->dtype.scale * scale; attr.dtype.zero_point = 0; - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; + attr.dtype.qnt_type = weight_attr->dtype.qnt_type; break; case VSI_NN_QNT_TYPE_DFP: diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c index fe05e1e..06dd052 100644 --- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -118,8 +118,7 @@ static void _set_preproc_node_rect_params ( vsi_nn_node_t* node, vsi_nn_preprocess_crop_t* crop, - vsi_nn_tensor_attr_t* attr, - vsi_nn_preprocess_source_layout_e* source_layout + vsi_nn_preprocess_image_size_t* input_size ) { if(crop != NULL) @@ -133,13 +132,8 @@ static void _set_preproc_node_rect_params { node->nn_param.pre_process.rect.left = 0; node->nn_param.pre_process.rect.top = 0; - node->nn_param.pre_process.rect.width = (uint32_t)attr->size[0]; - node->nn_param.pre_process.rect.height = (uint32_t)attr->size[1]; - if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC) - { - node->nn_param.pre_process.rect.width = (uint32_t)attr->size[1]; - node->nn_param.pre_process.rect.height = (uint32_t)attr->size[2]; - } + node->nn_param.pre_process.rect.width = input_size->w; + node->nn_param.pre_process.rect.height = input_size->h; } } /* _set_preproc_node_rect_params() */ @@ -496,7 +490,7 @@ vsi_status vsi_nn_add_single_preproc_node status = _set_preproc_node_type(node, source_format); TEST_CHECK_STATUS(status, final); - _set_preproc_node_rect_params(node, crop, &org_norm_tensor->attr, source_layout); + _set_preproc_node_rect_params(node, crop, input_size); _set_preproc_node_norm_params(node, mean_and_scale, &org_norm_tensor->attr); if(permute != NULL) diff --git a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c index 9ba72ad..8fa073c 100644 --- a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c +++ b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c @@ -84,7 +84,6 @@ vsi_bool vsi_nn_rnn_find_best_kernel_size } kernel_w = 1; } - } VSILOGD("Use kernel_h: %d, kernel_w: %d to convert FC", kernel_h, kernel_w); @@ -122,7 +121,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_input_for_nn_fc vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); - tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE, 0, 0 ); + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_in_size = (vsi_size_t*)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(vsi_size_t)); reshape_in_size[3] = input->attr.size[1]; @@ -130,8 +129,8 @@ vsi_nn_internal_tensor_t* 
vsi_nn_rnn_process_input_for_nn_fc reshape_in_size[1] = kernel_h; reshape_in_size[0] = kernel_w; - tmp_inode->node->nn_param.reshape.size = reshape_in_size; - tmp_inode->node->nn_param.reshape.dim_num = 4; + tmp_inode->node->nn_param.reshape2.size = reshape_in_size; + tmp_inode->node->nn_param.reshape2.dim_num = 4; tmp_inode->inputs[0] = input; tmp_inode->outputs[0] = tensor1->t; vsi_nn_internal_setup_node(self, tmp_inode); @@ -231,14 +230,14 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_output_for_nn_fc } tensor2 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); - tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE, 0, 0 ); + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_in_size = (vsi_size_t *)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(vsi_size_t)); reshape_in_size[1] = tensor->attr.size[3]; reshape_in_size[0] = tensor->attr.size[2]; - tmp_inode->node->nn_param.reshape.size = reshape_in_size; - tmp_inode->node->nn_param.reshape.dim_num = 2; + tmp_inode->node->nn_param.reshape2.size = reshape_in_size; + tmp_inode->node->nn_param.reshape2.dim_num = 2; tmp_inode->inputs[0] = tensor; tmp_inode->outputs[0] = tensor2->t; vsi_nn_internal_setup_node(self, tmp_inode); @@ -303,14 +302,14 @@ vsi_bool vsi_nn_rnn_process_output_for_nn_fc2 tensor = tensor0->t; } - tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE, 0, 0 ); + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_in_size = (vsi_size_t*)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(vsi_size_t)); reshape_in_size[1] = tensor->attr.size[3]; reshape_in_size[0] = tensor->attr.size[2]; - tmp_inode->node->nn_param.reshape.size = reshape_in_size; - tmp_inode->node->nn_param.reshape.dim_num = 2; + tmp_inode->node->nn_param.reshape2.size = reshape_in_size; + tmp_inode->node->nn_param.reshape2.dim_num = 2; tmp_inode->inputs[0] = tensor; tmp_inode->outputs[0] = output; vsi_nn_internal_setup_node(self, tmp_inode); @@ -694,7 +693,7 @@ void vsi_nn_rnn_data_check_aligned vsi_size_t tensor_size = vsi_nn_GetTensorSize( input[i]->attr.size, input[i]->attr.dim_num, input[i]->attr.dtype.vx_type ); - if( ofst & 0x3f ) + if( ofst & 0x3f && !self->graph->ctx->config.support_stream_processor) { vsi_nn_internal_init_tensor_attr(&attr, &input[i]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); @@ -729,14 +728,14 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_split_output vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_split_size = (vsi_size_t *)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); reshape_split_size[0] = -1; reshape_split_size[1] = batch_size; - curr->node->nn_param.reshape.size = reshape_split_size; - curr->node->nn_param.reshape.dim_num = 2; + curr->node->nn_param.reshape2.size = reshape_split_size; + curr->node->nn_param.reshape2.dim_num = 2; curr->inputs[0] = input; curr->outputs[0] = output_tensor->t; vsi_nn_internal_setup_node( self, curr ); @@ -763,15 +762,15 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_cell_output vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); + curr 
= vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_grucell_output_size = (vsi_size_t*)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); reshape_grucell_output_size[0] = -1; reshape_grucell_output_size[1] = batch_size; reshape_grucell_output_size[2] = 1; - curr->node->nn_param.reshape.size = reshape_grucell_output_size; - curr->node->nn_param.reshape.dim_num = 3; + curr->node->nn_param.reshape2.size = reshape_grucell_output_size; + curr->node->nn_param.reshape2.dim_num = 3; curr->inputs[0] = input; curr->outputs[0] = output_tensor->t; vsi_nn_internal_setup_node( self, curr ); @@ -918,16 +917,15 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_reshape vsi_bool use_virtual_tensor ) { - vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_tensor_t* tensor0 = NULL; vsi_size_t* reshape_in_size = NULL; - curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE, 0, 0 ); + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_in_size = (vsi_size_t*)vsi_nn_internal_new_node_param(curr, dim_num * sizeof(vsi_size_t)); memcpy(reshape_in_size, size, dim_num * sizeof(vsi_size_t)); - curr->node->nn_param.reshape.size = reshape_in_size; - curr->node->nn_param.reshape.dim_num = (uint32_t)dim_num; + curr->node->nn_param.reshape2.size = reshape_in_size; + curr->node->nn_param.reshape2.dim_num = (uint32_t)dim_num; curr->inputs[0] = input_tensor; curr->outputs[0] = output_tensor; diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index 3f662b6..e82e537 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -25,6 +25,7 @@ #include #include +#include "vsi_nn_platform.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" #include "vsi_nn_graph.h" @@ -74,11 +75,11 @@ static vsi_size_t get_tensor_elements_num { vsi_size_t num; vsi_size_t sz; - uint32_t dsize; + vsi_size_t dsize; sz = vsi_nn_GetTensorSize( shape, dim_num, type ); - dsize = vsi_nn_GetTypeBytes( type ); + dsize = vsi_nn_TypeGetBytesExt( type ); num = sz / dsize; return num; } /* get_tensor_elements_num() */ @@ -128,6 +129,14 @@ static void print_tensor tensor->attr.dtype.channel_dim, tensor->attr.dtype.scale_dim ); ext_attr[count] = 0; break; + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC: + count = snprintf(&ext_attr[0], + _EXT_ATTR_BUF_SZ, + "ASYM PERCHANNEL axis=%d, count=%d", + tensor->attr.dtype.channel_dim, + tensor->attr.dtype.scale_dim); + ext_attr[count] = 0; + break; #endif default: strncpy(ext_attr, "NONE", _EXT_ATTR_BUF_SZ); @@ -308,25 +317,50 @@ static vsi_bool _init_tensor vsi_bool ret; vx_tensor_create_params_t params; float * scales = NULL; + int32_t * zeroPoints = NULL; int32_t * null_zp = NULL; + vx_size size_vxsize[VSI_NN_MAX_DIM_NUM] = {0}; + vx_uint32 size_u32[VSI_NN_MAX_DIM_NUM] = {0}; + size_t i = 0; ret = TRUE; memset( &params, 0, sizeof( vx_tensor_create_params_t ) ); params.num_of_dims = tensor->attr.dim_num; - params.sizes = tensor->attr.size; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_vxsize[i] = -1 == tensor->attr.size[i] ? -1 : (vx_size)tensor->attr.size[i]; + } + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_u32[i] = -1 == tensor->attr.size[i] ?
-1 : (vx_uint32)tensor->attr.size[i]; + } +#ifdef VSI_40BIT_VA_SUPPORT + params.sizes = size_vxsize; + (void)size_u32; +#else + params.sizes = size_u32; + (void)size_vxsize; +#endif params.data_format = (vsi_enum)tensor->attr.dtype.vx_type; - params.quant_format = (vsi_enum)tensor->attr.dtype.qnt_type; switch( tensor->attr.dtype.qnt_type ) { case VSI_NN_QNT_TYPE_DFP: + params.quant_format = (vsi_enum)VX_QUANT_DYNAMIC_FIXED_POINT; params.quant_data.dfp.fixed_point_pos = (uint8_t)tensor->attr.dtype.fl; break; + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE; params.quant_data.affine.scale = tensor->attr.dtype.scale; params.quant_data.affine.zeroPoint = (int32_t)tensor->attr.dtype.zero_point; break; case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: #ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT + #ifdef VX_QUANT_AFFINE_SCALE_PER_CHANNEL + params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE_PER_CHANNEL; + #else + params.quant_format = (vsi_enum)VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC; + #endif // This is a hack that driver doesn't support const scales scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.scale_dim); memcpy(scales, tensor->attr.dtype.scales, tensor->attr.dtype.scale_dim * sizeof(float)); @@ -345,6 +379,35 @@ static vsi_bool _init_tensor break; #else VSILOGE( "can't support qnt_type VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC." ); +#endif + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC: +#ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT + #ifdef VX_QUANT_AFFINE_SCALE_PER_CHANNEL + params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE_PER_CHANNEL; + #else + params.quant_format = (vsi_enum)VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC; + #endif + // This is a hack that driver doesn't support const scales + scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.scale_dim); + memcpy(scales, + tensor->attr.dtype.scales, + tensor->attr.dtype.scale_dim * sizeof(float)); + zeroPoints = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.zero_points_dim); + memcpy(zeroPoints, + tensor->attr.dtype.zero_points, + tensor->attr.dtype.zero_points_dim * sizeof(int32_t)); + params.quant_data.affinePerChannel.channelDim = + tensor->attr.dtype.channel_dim; + params.quant_data.affinePerChannel.scaleCount = + tensor->attr.dtype.scale_dim; + params.quant_data.affinePerChannel.scales = scales; + params.quant_data.affinePerChannel.zeroPoint = zeroPoints; + params.quant_data.affinePerChannel.zeroPointCount = tensor->attr.dtype.zero_points_dim; + break; +#else + VSILOGE( + "can't support qnt_type " + "VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC."); #endif default: break; @@ -359,6 +422,13 @@ static vsi_bool _init_tensor vxReleaseWeightsBiasesParameter( &tensor->wb ); } +#if VX_STREAM_PROCESSOR_SUPPORT + if ( TRUE == tensor->attr.is_dummy ) + { + tensor->t = vxCreateDummyTensor( graph->ctx->c, + (vsi_size_t)tensor->attr.dim_num, tensor->attr.size, (vsi_enum)tensor->attr.dtype.vx_type ); + } else +#endif if( TRUE == tensor->attr.is_created_from_handle ) { vx_tensor_addressing addr; @@ -389,6 +459,10 @@ static vsi_bool _init_tensor { free(scales); } + if( zeroPoints ) + { + free(zeroPoints); + } if(null_zp) { free(null_zp); @@ -400,19 +474,31 @@ static vsi_bool _init_tensor if( data ) { #ifdef VSI_40BIT_VA_SUPPORT - addr = vxCreateTensorAddressing(graph->ctx->c, - tensor->attr.size, stride_size, (vsi_size_t)tensor->attr.dim_num); -#else { - uint32_t i, size_32bit[_cnt_of_array(tensor->attr.size)] = {0}; - 
uint32_t stride_size_32bit[_cnt_of_array(stride_size)] = {0}; + vx_size size_vxsize[_cnt_of_array(tensor->attr.size)] = {0}; + vx_size stride_size_vxsize[_cnt_of_array(stride_size)] = {0}; for(i = 0; i < _cnt_of_array(tensor->attr.size); i++) { - size_32bit[i] = (uint32_t)tensor->attr.size[i]; + size_vxsize[i] = -1 == tensor->attr.size[i] ? -1 : (vx_size)tensor->attr.size[i]; } for(i = 0; i < _cnt_of_array(stride_size); i++) { - stride_size_32bit[i] = (uint32_t)stride_size[i]; + stride_size_vxsize[i] = -1 == stride_size[i] ? -1 : (vx_size)stride_size[i]; + } + addr = vxCreateTensorAddressing(graph->ctx->c, + size_vxsize, stride_size_vxsize, (vx_size)tensor->attr.dim_num); + } +#else + { + uint32_t size_32bit[_cnt_of_array(tensor->attr.size)] = {0}; + uint32_t stride_size_32bit[_cnt_of_array(stride_size)] = {0}; + for(i = 0; i < _cnt_of_array(tensor->attr.size); i++) + { + size_32bit[i] = -1 == tensor->attr.size[i] ? -1 : (uint32_t)tensor->attr.size[i]; + } + for(i = 0; i < _cnt_of_array(stride_size); i++) + { + stride_size_32bit[i] = -1 == stride_size[i] ? -1 : (uint32_t)stride_size[i]; } addr = vxCreateTensorAddressing(graph->ctx->c, size_32bit, stride_size_32bit, (uint8_t)tensor->attr.dim_num); @@ -481,6 +567,10 @@ static vsi_bool _init_tensor { free(scales); } + if (zeroPoints) + { + free(zeroPoints); + } if(null_zp) { free(null_zp); @@ -588,15 +678,23 @@ vsi_nn_tensor_t * vsi_nn_CreateTensorWithDefault uint8_t* data = NULL; size = vsi_nn_GetStrideSize( &t->attr, stride ); + if( stride[0] == 0 ) + { + size = vsi_nn_GetElementNum(t); + } data = (uint8_t *)malloc( size ); if( data ) { vsi_size_t i = 0, j = 0; - vsi_size_t elements = size / stride[0]; + vsi_size_t elements = 0; vsi_status status = VSI_FAILURE; + if(stride[0] != 0) + { + elements = size / stride[0]; + } status = vsi_nn_Float32ToDtype( defualt_value, &data[0], &t->attr.dtype ); - if(stride[0] == 1) + if(stride[0] == 1 || stride[0] == 0) { memset(data, data[0], size); } @@ -639,14 +737,22 @@ vsi_status vsi_nn_FillTensorWithValue uint8_t* data = NULL; size = vsi_nn_GetStrideSize( &tensor->attr, stride ); + if( stride[0] == 0) + { + size = vsi_nn_GetElementNum(tensor); + } data = (uint8_t *)malloc( size ); if( data ) { vsi_size_t i = 0, j = 0; - vsi_size_t elements = size / stride[0]; + vsi_size_t elements = 0; + if(stride[0] != 0) + { + elements = size / stride[0]; + } status = vsi_nn_Float32ToDtype( value, &data[0], &tensor->attr.dtype ); - if(stride[0] == 1) + if(stride[0] == 1 || stride[0] == 0) { memset(data, data[0], size); } @@ -826,7 +932,7 @@ float * vsi_nn_ConvertTensorToFloat32Data uint8_t *tensor_data = NULL; vsi_size_t elements; vsi_size_t i; - uint32_t stride; + vsi_size_t stride; float *data; if(NULL == graph || NULL == tensor) @@ -835,7 +941,7 @@ float * vsi_nn_ConvertTensorToFloat32Data } elements = vsi_nn_GetElementNum(tensor); - stride = vsi_nn_TypeGetBytes(tensor->attr.dtype.vx_type); + stride = vsi_nn_TypeGetBytesExt(tensor->attr.dtype.vx_type); data = NULL; data = (float *)malloc(elements * sizeof(float)); @@ -883,6 +989,7 @@ uint8_t * vsi_nn_ConvertTensorToData ) { uint8_t * data; + uint8_t * new_data; vsi_size_t buf_sz; vsi_size_t stride_size[VSI_NN_MAX_DIM_NUM]; vsi_status status; @@ -929,8 +1036,23 @@ uint8_t * vsi_nn_ConvertTensorToData data = NULL; } } - return data; - + if(tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT4 || + tensor->attr.dtype.vx_type == VSI_NN_TYPE_UINT4) + { + vsi_size_t dest_size = vsi_nn_GetElementNum(tensor); + new_data = (uint8_t*)malloc(dest_size); + status = 
vsi_nn_Unpack4bitData(tensor, data, new_data, tensor->attr.dtype.vx_type); + if(data) + { + free(data); + data = NULL; + } + return new_data; + } + else + { + return data; + } } /* vsi_nn_ConvertTensorToData() */ /* @@ -1032,6 +1154,7 @@ uint8_t * vsi_nn_ConvertRawTensorToData2 status = vxQueryTensor(tensor, VX_TENSOR_FIXED_POINT_POS, &(attr->dtype.fl), sizeof(int8_t)); break; + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: status = vxQueryTensor(tensor, VX_TENSOR_ZERO_POINT, &(attr->dtype.zero_point), sizeof(int32_t)); @@ -1077,7 +1200,7 @@ void vsi_nn_SaveTensorToTextByFp32 const float c_flush_th = 0.7f; uint8_t * data; uint8_t * ptr; - uint32_t type_bytes; + vsi_size_t stride; uint8_t buf[_TENSOR_TMPBUF_SZ]; FILE * fp; float write_data; @@ -1108,14 +1231,13 @@ void vsi_nn_SaveTensorToTextByFp32 return; } sz = vsi_nn_GetElementNum( tensor ); - ptr = data; - type_bytes = vsi_nn_TypeGetBytes( tensor->attr.dtype.vx_type ); + stride = vsi_nn_TypeGetBytesExt( tensor->attr.dtype.vx_type ); count = 0; for( i = 0; i < sz; i ++ ) { vsi_nn_DtypeToFloat32( ptr, &write_data, &tensor->attr.dtype ); - ptr += type_bytes; + ptr += stride; count += snprintf( (char *)&buf[count], _TENSOR_TMPBUF_SZ - count, "%f%s", write_data, seperator ); @@ -1173,7 +1295,7 @@ void vsi_nn_SaveDataToText uint8_t buf[_TENSOR_TMPBUF_SZ]; FILE * fp; float write_data; - uint32_t type_bytes; + vsi_size_t stride; vsi_size_t i; uint32_t count; @@ -1197,14 +1319,15 @@ void vsi_nn_SaveDataToText VSILOGW( "Write file %s fail. Please check...", filename ); return; } - type_bytes = vsi_nn_GetTypeBytes( type ); + stride = vsi_nn_TypeGetBytesExt( type ); count = 0; for( i = 0; i < data_size; i ++ ) { - write_data = vsi_nn_DataAsFloat32( &data[type_bytes * i], + write_data = vsi_nn_DataAsFloat32( &data[stride * i], type ); - if( type == VSI_NN_TYPE_UINT8 || type == VSI_NN_TYPE_INT8 ) + if( type == VSI_NN_TYPE_UINT8 || type == VSI_NN_TYPE_INT8 || + type == VSI_NN_TYPE_UINT4 || type == VSI_NN_TYPE_INT4 ) { count += snprintf( (char *)&buf[count], _TENSOR_TMPBUF_SZ - count, "%d%s", (int32_t)write_data, seperator ); @@ -1285,7 +1408,6 @@ vsi_nn_tensor_t * vsi_nn_CreateTensorFromData tensor = vsi_nn_CreateTensor( graph, attr ); status = vsi_nn_CopyDataToTensor( graph, tensor, data ); - if( VSI_SUCCESS != status ) { VSILOGE("Create tensor from data fail."); @@ -1326,11 +1448,31 @@ vsi_status vsi_nn_CopyDataToTensor } else { - status = vsi_nn_copy_tensor_patch(tensor->t, &tensor->attr, data, VX_WRITE_ONLY); + if( tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT4 || + tensor->attr.dtype.vx_type == VSI_NN_TYPE_UINT4 ) + { + uint8_t* new_data = NULL; + vsi_size_t dest_size = vsi_nn_GetTensorSize( tensor->attr.size, tensor->attr.dim_num, + tensor->attr.dtype.vx_type); + new_data = (uint8_t*)malloc( dest_size ); + status = vsi_nn_Pack4bitData(tensor, (uint8_t*)data, new_data); + status = vsi_nn_copy_tensor_patch( tensor->t, &tensor->attr, new_data, VX_WRITE_ONLY ); + if( new_data ) + { + free( new_data ); + new_data = NULL; + } + } + else + { + status = vsi_nn_copy_tensor_patch( tensor->t, &tensor->attr, data, VX_WRITE_ONLY ); + } } + return status; } /* vsi_nn_CopyDataToTensor() */ + vsi_status vsi_nn_FlushHandle ( const vsi_nn_tensor_t * tensor @@ -1515,18 +1657,7 @@ vsi_bool vsi_nn_ReshapeTensor } /* Create reshape tensor */ -#ifdef VSI_40BIT_VA_SUPPORT - output->t = vxReshapeTensor( input->t, new_shape, dim_num ); -#else - { - uint32_t i, new_shape_32bit[VSI_NN_MAX_DIM_NUM] = {0}; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; 
i++) - { - new_shape_32bit[i] = (uint32_t)new_shape[i]; - } - output->t = vxReshapeTensor( input->t, (int32_t *)new_shape_32bit, (uint32_t)dim_num ); - } -#endif + output->t = vsi_nn_safe_reshape_tensor( input->t, (void*)new_shape, (vsi_size_t)dim_num, sizeof(new_shape[0]) ); if( NULL == output->t ) { ret = FALSE; @@ -1596,6 +1727,55 @@ void vsi_nn_TransposeTensor free( dst ); } /* vsi_nn_TransposeTensor() */ +vx_tensor vsi_nn_safe_reshape_tensor + ( + vx_tensor tensor, + void * num_of_dims, + vsi_size_t sizes, + vsi_size_t size_of_shape_element + ) +{ + if(sizeof(vx_size) == size_of_shape_element) + { + vx_size* num_of_dims_vxsize = (vx_size*)num_of_dims; + #ifdef VSI_40BIT_VA_SUPPORT + return vxReshapeTensor( tensor, num_of_dims_vxsize, (vx_size)sizes ); + #else + { + int32_t new_shape_int32[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t i = 0; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + new_shape_int32[i] = -1 == num_of_dims_vxsize[i] ? -1 : (int32_t)num_of_dims_vxsize[i]; + } + return vxReshapeTensor( tensor, new_shape_int32, (uint32_t)sizes ); + } + #endif + } + else if(sizeof(int32_t) == size_of_shape_element) + { + int32_t* num_of_dims_int32 = (int32_t*)num_of_dims; + #ifdef VSI_40BIT_VA_SUPPORT + { + vx_size new_shape_vxsize[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t i = 0; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + new_shape_vxsize[i] = -1 == num_of_dims_int32[i] ? -1 : (vx_size)num_of_dims_int32[i]; + } + return vxReshapeTensor( tensor, new_shape_vxsize, (vx_size)sizes ); + } + #else + return vxReshapeTensor( tensor, num_of_dims_int32, (uint32_t)sizes ); + #endif + } + else + { + VSILOGE("couldn't handle tensor shape element with length of %"VSI_SIZE_T_SPECIFIER"", size_of_shape_element); + return NULL; + } +} /* vsi_nn_safe_reshape_tensor() */ + void vsi_nn_PermuteTensor ( vsi_nn_graph_t * graph, @@ -1649,11 +1829,8 @@ void vsi_nn_PermuteTensor } vsi_nn_Permute( dst, buf, shape_ptr, dim_num, perm, tensor->attr.dtype.vx_type ); memcpy(tensor->attr.size, dst_shape, sizeof(dst_shape)); -#ifdef VSI_40BIT_VA_SUPPORT - tensor->t = vxReshapeTensor(tensor->t, tensor->attr.size, tensor->attr.dim_num); -#else - tensor->t = vxReshapeTensor(tensor->t, (int32_t*)tensor->attr.size, tensor->attr.dim_num); -#endif + tensor->t = vsi_nn_safe_reshape_tensor(tensor->t, (void*)tensor->attr.size, + (vsi_size_t)tensor->attr.dim_num, sizeof(tensor->attr.size[0])); status = vsi_nn_CopyDataToTensor( graph, tensor, dst ); if( VSI_SUCCESS != status ) { @@ -1674,8 +1851,7 @@ vsi_size_t vsi_nn_GetElementNum return 0; } - return get_tensor_elements_num(tensor->attr.size, - tensor->attr.dim_num, tensor->attr.dtype.vx_type); + return vsi_nn_ShapeProduct((vsi_size_t*)tensor->attr.size, tensor->attr.dim_num); } /* vsi_nn_GetElementNum() */ vsi_size_t vsi_nn_GetTensorSize @@ -1687,17 +1863,32 @@ vsi_size_t vsi_nn_GetTensorSize { vsi_size_t sz; vsi_size_t i; + vsi_size_t bits_num; sz = 0; if( NULL == shape || 0 == dim_num ) { return sz; } - sz = 1; - for( i = 0; i < dim_num; i ++ ) + bits_num = vsi_nn_TypeGetBits( type ); + if( bits_num < BITS_PER_BYTE ) + { + if(shape[0] % 2 == 0) + { + sz = shape[0] / 2; + } + else + { + sz = shape[0] / 2 + shape[0] % 2; + } + } + else + { + sz = shape[0] * bits_num / BITS_PER_BYTE; + } + for( i = 1; i < dim_num; i ++ ) { sz *= shape[i]; } - sz *= vsi_nn_GetTypeBytes( type ); return sz; } /* vsi_nn_GetTensorSize() */ @@ -2040,6 +2231,7 @@ vsi_status vsi_nn_vxGetTensorAttr &(attr->dtype.fl), sizeof(int8_t)); TEST_CHECK_STATUS( status, final ); break; + case 
VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: status = vxQueryTensor(tensor, VX_TENSOR_ZERO_POINT, &(attr->dtype.zero_point), sizeof(int32_t)); @@ -2154,7 +2346,36 @@ vsi_status vsi_nn_copy_tensor_veiw_patch } #ifdef USE_OPENVX_1_2 + +#ifdef VX_TENSOR_STRIDE_X_BITS_SUPPORT + { + vx_trensor_addressing addr = NULL; + vx_size dim_sizes[VSI_NN_MAX_DIM_NUM], strides[VSI_NN_MAX_DIM_NUM]; + addr = (vx_trensor_addressing)malloc(sizeof(vx_tensorpatch_addressing_t)); + addr->num_of_dims = (vx_uint32)attr->dim_num; + for(i = 0; i < dim; i++) + { + strides[i] = (vx_size)vstride[i]; + dim_sizes[i] = (vx_size)attr->size[i]; + } + addr->strides = strides; + addr->dim_sizes = dim_sizes; + if(attr->dtype.vx_type == VSI_NN_TYPE_INT4 || attr->dtype.vx_type == VSI_NN_TYPE_UINT4) + { + addr->strides[0] = 0; + addr->stride_x_bits = 4; + } + status = vxCopyTensorPatch2(tensor, dim, vstart, vend, addr,sizeof(vx_tensorpatch_addressing_t), + user_ptr, usage, user_memory_type); + if(addr) + { + free(addr); + addr = NULL; + } + } +#else status = vxCopyTensorPatch(tensor, dim, vstart, vend, vstride, user_ptr, usage, user_memory_type); +#endif #else { vx_context context = NULL; @@ -2455,3 +2676,48 @@ vsi_bool vsi_nn_ConvertTensor return ret; } + +vsi_nn_tensor_t * vsi_nn_dropout_tensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + float rate + ) +{ + vsi_nn_tensor_t *output = NULL; + vsi_size_t size = 0; + vsi_size_t i = 0; + float* data = NULL; + + if (NULL == input || NULL == graph) + { + return NULL; + } + + output = vsi_nn_CreateTensor(graph, &input->attr); + if ( !output ) + { + VSILOGE("create tensor failed."); + goto final; + } + + data = vsi_nn_ConvertTensorToFloat32Data(graph, input); + if (NULL == data) + { + goto final; + } + + size = vsi_nn_vxGetTensorElementNum(&input->attr); + + for (i = 0; i < size; i++) + { + data[i] = data[i] * rate; + } + + vsi_nn_CopyRawDataToTensor( graph, (uint8_t *)data, &input->attr.dtype, output ); + +final: + vsi_nn_safe_free(data); + + return output; +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/vsi_nn_version.c b/src/tim/vx/internal/src/vsi_nn_version.c index d7abca3..ab94abe 100644 --- a/src/tim/vx/internal/src/vsi_nn_version.c +++ b/src/tim/vx/internal/src/vsi_nn_version.c @@ -50,4 +50,26 @@ uint32_t vsi_nn_GetVersionMinor(void) uint32_t vsi_nn_GetVersionPatch(void) { return VSI_NN_VERSION_PATCH; -} \ No newline at end of file +} + +const char **vsi_nn_get_feature_config(void) +{ + static const char *p[10]; + int i = 0; + #ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT + { + static const char *perchannel_quantization = MACRO_TO_STRING(VSI_PERCHANNEL_QUANTIZATION_SUPPORT); + (void)perchannel_quantization; + p[i++] = perchannel_quantization; + } + #endif + + #ifdef VSI_40BIT_VA_SUPPORT + { + static const char *va40bit = MACRO_TO_STRING(VSI_40BIT_VA_SUPPORT); + (void)va40bit; + p[i++] = va40bit; + } + #endif + return p; +} diff --git a/src/tim/vx/internal/tim_internal.cmake b/src/tim/vx/internal/tim_internal.cmake index b52a034..6f93896 100644 --- a/src/tim/vx/internal/tim_internal.cmake +++ b/src/tim/vx/internal/tim_internal.cmake @@ -15,12 +15,12 @@ aux_source_directory(./vx/internal/src/quantization INTERNAL_QUANTIZATION) aux_source_directory(./vx/internal/src/custom/ops INTERNAL_CUSTOM_OPS) aux_source_directory(./vx/internal/src/custom/ops/kernel INTERNAL_CUSTOM_OPS_KERNEL) aux_source_directory(./vx/internal/src/utils INTERNAL_UTILS) +aux_source_directory(./vx/internal/src/POST POST) list(APPEND ${TARGET_NAME}_SRCS 
${INTERNAL_SRC} ${INTERNAL_KERNEL} ${INTERNAL_KERNEL_CL} - ${INTERNAL_KERNEL_CPU} ${INTERNAL_KERNEL_EVIS} ${INTERNAL_KERNEL_VX} ${INTERNAL_OPS} @@ -29,4 +29,5 @@ list(APPEND ${TARGET_NAME}_SRCS ${INTERNAL_CUSTOM_OPS} ${INTERNAL_CUSTOM_OPS_KERNEL} ${INTERNAL_UTILS} + ${POST} )
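
Editor's note (illustrative sketch, not part of the patch): the INT4/UINT4 support introduced above packs two 4-bit elements into one byte with the lower-indexed element in the low nibble, keeps the innermost dimension byte-aligned (see the sub-byte branches added to vsi_nn_GetStrideSizeBySize and vsi_nn_GetTensorSize), and sign-extends INT4 nibbles when unpacking (vsi_nn_Pack4bitData / vsi_nn_Unpack4bitData). The standalone C sketch below mirrors that nibble layout for a single row; the helper names pack_int4_row and unpack_int4_row are hypothetical and do not exist in ovxlib, and the patch's own row-boundary handling is more involved than shown here.

/* Minimal sketch of the 4-bit packing scheme, under the assumptions stated above. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Pack one row of 4-bit values (stored one value per byte in src) into dst.
 * Returns the number of packed bytes: (row_len * 4 + 7) / 8, i.e. two
 * elements per byte with the even-indexed element in the low nibble. */
static size_t pack_int4_row(const uint8_t *src, size_t row_len, uint8_t *dst)
{
    size_t i, j = 0;
    for (i = 0; i < row_len; i += 2)
    {
        uint8_t low  = (uint8_t)(src[i] & 0x0F);
        uint8_t high = (uint8_t)((i + 1 < row_len) ? (src[i + 1] & 0x0F) : 0);
        dst[j++] = (uint8_t)((high << 4) | low);
    }
    return j;
}

/* Unpack one packed row back to one value per byte; when is_signed is nonzero
 * (INT4), nibbles 0x8..0xF are sign-extended to -8..-1. */
static void unpack_int4_row(const uint8_t *src, size_t row_len,
                            uint8_t *dst, int is_signed)
{
    size_t i;
    for (i = 0; i < row_len; i++)
    {
        uint8_t nibble = (i & 1) ? (uint8_t)(src[i / 2] >> 4)
                                 : (uint8_t)(src[i / 2] & 0x0F);
        if (is_signed && nibble > 7)
        {
            nibble = (uint8_t)(nibble | 0xF0); /* sign-extend into int8 range */
        }
        dst[i] = nibble;
    }
}

int main(void)
{
    /* A 3-element INT4 row: 1, -2, 7 (two's-complement nibbles 0x1, 0xE, 0x7). */
    uint8_t row[3] = { 0x1, 0xE, 0x7 };
    uint8_t packed[2] = { 0 };
    uint8_t unpacked[3] = { 0 };

    size_t bytes = pack_int4_row(row, 3, packed);   /* packed: 0xE1, 0x07 */
    unpack_int4_row(packed, 3, unpacked, 1);        /* back to 0x01, 0xFE, 0x07 */

    printf("%zu bytes: %02X %02X -> %d %d %d\n", bytes,
           (unsigned)packed[0], (unsigned)packed[1],
           (int)(int8_t)unpacked[0], (int)(int8_t)unpacked[1],
           (int)(int8_t)unpacked[2]);
    return 0;
}

Compiled with any C99 compiler, this prints "2 bytes: E1 07 -> 1 -2 7": a 3-element row occupies ceil(3 * 4 / 8) = 2 packed bytes, matching the rounded-up stride the patch computes for sub-byte types.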