diff --git a/.gitignore b/.gitignore index 8def752..cef9f4c 100644 --- a/.gitignore +++ b/.gitignore @@ -336,3 +336,4 @@ ASALocalRun/ # IDE .settings/ build/ +*_build/ diff --git a/prebuilt-sdk/x86_64_linux/VERSION b/prebuilt-sdk/x86_64_linux/VERSION index 703a8d3..e5a66ba 100644 --- a/prebuilt-sdk/x86_64_linux/VERSION +++ b/prebuilt-sdk/x86_64_linux/VERSION @@ -1 +1 @@ -REL/6.4.10.2 +6.4.11 diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h index a8ea910..8b93beb 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h @@ -499,6 +499,8 @@ enum vx_kernel_e { VX_KERNEL_NN_DECONV_3D_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x31, + VX_KERNEL_STREAM_PROCESSOR = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x32, + VX_KERNEL_MAX_1_2, /*!< \internal Used for VX1.2 bounds checking in the conformance test. */ }; diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h index 74f3592..6cf283c 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h @@ -196,4 +196,45 @@ VX_DECONV_3D_API_SUPPORT is used to declare that vsi openvx driver can support d #define VX_TENSOR_STRIDE_X_BITS_SUPPORT 1 #endif +/* +VX_REMOVE_RESHAPE_SUPPORT is used to declare if graph opt support to remove reshape op, if support, it's not need to remove reshape in ovxlib. + 0: not support + 1: support +*/ +/* +#ifndef VX_REMOVE_RESHAPE_SUPPORT +#define VX_REMOVE_RESHAPE_SUPPORT 0 +#endif +*/ + +/* +VX_STREAM_PROCESSOR_SUPPORT is used to declare that vsi openvx driver can support vxStreamProcessorNode API + [value] + 0: not support + 1: support +*/ +#ifndef VX_STREAM_PROCESSOR_SUPPORT +#define VX_STREAM_PROCESSOR_SUPPORT 0 +#endif + +/* + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL is used to declare that this tensor connect to fixed DMA channel. + [value] + 0: not support + 1: support +*/ +#ifndef VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL +#define VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL 1 +#endif + +/* + VX_SCALE_EXTRA_PARAMETER_SUPPORT is used to declare that RESIZE can support align_cornor and half_pixel_center parameter + [value] + 0: not support + 1: support +*/ +#ifndef VX_SCALE_EXTRA_PARAMETER_SUPPORT +#define VX_SCALE_EXTRA_PARAMETER_SUPPORT 1 +#endif + #endif /* __VX_KHR_COMPATIBLE_H__ */ diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h index cca4338..623c541 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h @@ -57,6 +57,12 @@ enum vx_graph_attribute_internal_type_e VX_GRAPH_AXI_SRAM_PRE_LOAD = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x2, /*! \brief Queries a graph for its running priority (read-write. Use a \ref vx_uint32 parameter. 
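For orientation only (not part of the patch): a minimal sketch of how a client layer such as ovxlib is expected to gate its code paths on the vx_khr_compatible.h feature macros added above. The helper name is made up for illustration.

    #include <VX/vx_khr_compatible.h>

    /* Hypothetical helper: compile-time check for the stream processor path. */
    static int use_stream_processor_path(void)
    {
    #if VX_STREAM_PROCESSOR_SUPPORT
        return 1;   /* driver exports the stream processor API */
    #else
        return 0;   /* fall back to the existing shader/NN kernel paths */
    #endif
    }
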
*/ VX_GRAPH_PRIORITY_VALUE_VIV = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x3, + VX_GRAPH_PSI_EXTRATOR_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x4, + VX_GRAPH_PSI_FILLER_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x5, + VX_GRAPH_DENOISE_POSTPROCESS_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x6, + VX_GRAPH_DATA_COMPRESSION_RATIO = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x7, + VX_GRAPH_ISP_EMULATION_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x8, + VX_GRAPH_PROCESS_FPS = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x9, }; /*! \brief Size Alignment of User Memory @@ -209,7 +215,8 @@ enum vx_nn_activation_function_e VX_NN_ACTIVATION_LEAKYRELU_MAX_POOLING = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x4, VX_NN_ACTIVATION_SWISH = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x5, VX_NN_ACTIVATION_HSWISH = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x6, - VX_NN_ACTIVATION_NONE = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x7, + VX_NN_ACTIVATION_CUSTOM = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x7, + VX_NN_ACTIVATION_NONE = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x8, }; /*! \brief The Convolutional network type @@ -285,6 +292,59 @@ enum vx_tensor_rank_type_e VX_TENSOR_RANK_SN, }; +/*! \brief The attribute of tensor. + * \ingroup group_tensor + * \version 0.4 + */ +enum vx_tensor_priority_e +{ + /*! \brief no special requirement */ + VX_TENSOR_DEFAULT = 0, + + /*! \brief 2nd input(reference) */ + /*VX_TENSOR_2ND_INPUT_FOR = 1,*/ + VX_TENSOR_FOR_GRAPH_REFERENCE = 1, +}; + + +/*! \brief The attribute of tensor memory. + * \ingroup group_tensor + * \version 0.4 + */ +enum vx_tensor_memory_attribute_e +{ + /*! \brief no special requirement */ + VX_TENSOR_MEMORY_DEFAULT = 0, + + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_0 = (0x1 << 0), + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_1 = (0x1 << 1), + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_2 = (0x1 << 2), + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_3 = (0x1 << 3), + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_4 = (0x1 << 4), + /* + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_5 = (0x1 << VX_DMA5_IN_ISP_OCM_PSI), + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_6 = (0x1 << VX_DMA6_DDR_DECOMPRESS), + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_7 = (0x1 << VX_DMA7_POSTOUT_OCM_ISP), + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_8 = (0x1 << VX_DMA8_COMPRESS_DDR), + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_9 = (0x1 << VX_DMA9_ISP_PATTERN_GENERATOR), + VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_10 = (0x1 << VX_DMA10_ISP_CHECKSUM_GENERATOR), + */ + /*! \brief DMA transfer data to VIP and enable circular buffer */ +#if !VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL + VX_TENSOR_MEMORY_ENABLE_CIRCULAR_BY_DMA = 0xFFFFFFFF, +#endif +}; + +enum vx_dma_extrator_pad_mode_e +{ + /*! \brief no special requirement */ + VX_DMA_EXTRATOR_PAD_CONST = 0, + + /*! \brief DMA extrator pad with nearest edge */ + VX_DMA_EXTRATOR_PAD_WITH_NEAREAST_EDGE = 1, +}; + + /*! \brief The precision of tensor. * \ingroup group_tensor * \version 0.4 @@ -601,6 +661,19 @@ VX_API_ENTRY vx_tensor VX_API_CALL vxReshapeTensor(vx_tensor tensor, vx_int32* n */ VX_API_ENTRY vx_status VX_API_CALL vxSetTensorAttribute(vx_tensor tensor, vx_enum attribute, const void *ptr, vx_size size); +/*! \brief Creates an opaque reference to a tensor data buffer. + * \details The tensor is a dummy tensor which will not allocate any memory. 
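A minimal sketch (not part of the patch) of filling the new extended resize parameters. The scale-layer node creator that consumes vx_nn_scale_params_ext_t lives in vx_khr_nn.h but is outside this hunk, so the hypothetical helper below only populates the struct; the driver honors the extra fields only when VX_SCALE_EXTRA_PARAMETER_SUPPORT is 1.

    #include <VX/vx_khr_compatible.h>
    #include <VX/vx_khr_nn.h>

    /* Hypothetical helper: fill the extended scale params added above. */
    static void fill_scale_params_ext(vx_nn_scale_params_ext_t *p,
                                      vx_bool align_corners,
                                      vx_bool half_pixel_centers)
    {
        p->base.type          = VX_INTERPOLATION_BILINEAR; /* only bilinear is supported */
        /* Honored by the driver only when VX_SCALE_EXTRA_PARAMETER_SUPPORT == 1. */
        p->align_corners      = align_corners;
        p->half_pixel_centers = half_pixel_centers;
    }
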
And it cannot reshape or view. + * Not guaranteed to exist until the vx_graph containing it has been verified. + * \param [in] context The reference to the implementation context. + * \param [in] number_of_dims The number of dimensions. + * \param [in] dims Dimensions sizes in elements. + * \param [in] data_format The \ref vx_type_e that represents the data format of the tensor data elements. + * \return A tensor data reference or zero when an error is encountered. + * \ingroup group_tensor + * \version 0.3 + */ +VX_API_ENTRY vx_tensor VX_API_CALL vxCreateDummyTensor(vx_context context, vx_size number_of_dims, const vx_size *dims, vx_enum data_format); + /*! \brief The type enumeration lists all NN extension types. * \ingroup group_cnn @@ -1317,6 +1390,13 @@ typedef struct _vx_nn_scale_params_t vx_enum type; /*!< \brief The interpolation type, only support VX_INTERPOLATION_BILINEAR. */ } vx_nn_scale_params_t, * vx_nn_scale_params; +typedef struct _vx_nn_scale_params_ext_t +{ + vx_nn_scale_params_t base; + vx_bool align_corners; + vx_bool half_pixel_centers; +} vx_nn_scale_params_ext_t, * vx_nn_scale_params_ext; + /*! \brief [Graph] Creates a scale Layer Node. * \param [in] graph The reference to the parent graph. * \param [in] input The input tensor data to scale. @@ -2054,8 +2134,15 @@ typedef struct _vx_hardware_caps_params_ext_t vx_hardware_caps_params_t base; vx_uint32 subGroupSize; /*!< \brief shader sub-group size.*/ vx_bool supportVA40; /*!< \brief support 40bit virtual address.*/ + vx_uint32 supportStreamProcessor; /*!< \brief support stream processor.*/ } vx_hardware_caps_params_ext_t; +typedef struct _vx_hardware_caps_params_ext2_t +{ + vx_hardware_caps_params_ext_t base; + vx_uint32 streamProcessorExecCount; /*!< \brief streamprocess execution count. */ +} vx_hardware_caps_params_ext2_t; + /*! \brief Queries hardware caps information. * \param [in] context The reference to the context. * \param [in] hardware_caps_params \ref vx_hardware_caps_params_t . diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h index 506938f..66427cb 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h @@ -219,6 +219,15 @@ typedef struct _vx_nn_convolution_relu_pooling_params_ext4_t vx_bool enable_nn_tensor_add_relu; /*!< \brief Enable Relu function after tensor add. */ } vx_nn_convolution_relu_pooling_params_ext4_t, * vx_nn_convolution_relu_pooling_params_ext4; +typedef struct _vx_nn_convolution_relu_pooling_params_ext5_t +{ + vx_nn_convolution_relu_pooling_params_ext4_t ext4; /*!< \brief convolution relu pooling params \ref vx_nn_convolution_relu_pooling_params_ext_t */ + + vx_object_array inputs_list; + vx_object_array outputs_list; + vx_spinst spinst_obj; +} vx_nn_convolution_relu_pooling_params_ext5_t, * vx_nn_convolution_relu_pooling_params_ext5; + /*! \brief [Graph] Creates a Convolutional Network Convolution and Activation(Relu) and Pooling Layer Node, this fucntion match kronos NN Extension 1.2 verion. * \details This function implement Convolutional Network Convolution and Activation(Relu) and Pooling layer. * For fixed-point data types, a fixed point calculation is performed with round and saturate according to the number of accumulator bits. 
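A minimal usage sketch (not part of the patch) for the vxCreateDummyTensor() entry point declared above; the helper name and dimension values are only an example.

    #include <VX/vx.h>
    #include <VX/vx_khr_nn.h>

    /* Hypothetical helper: create a non-allocating placeholder tensor. */
    static vx_tensor make_dummy_fp16_tensor(vx_context context)
    {
        const vx_size dims[4] = { 224, 224, 3, 1 };
        vx_tensor t = vxCreateDummyTensor(context, 4, dims, VX_TYPE_FLOAT16);

        /* The dummy tensor owns no memory and cannot be reshaped or viewed;
         * check the reference before wiring it into a graph. */
        if (vxGetStatus((vx_reference)t) != VX_SUCCESS)
        {
            return NULL;
        }
        return t;
    }
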
The number of the accumulator bits are implementation defined, diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_nodes.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_nodes.h index bf513b5..64504ca 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_nodes.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_nodes.h @@ -963,6 +963,40 @@ VX_API_ENTRY vx_node VX_API_CALL vxBatchGemmNode(vx_graph graph, vx_scalar trans_c, vx_tensor output); +typedef struct _vx_lut_params_s +{ + vx_enum lut_function; /*!< \brief Set VX_NN_ACTIVATION_NONE to disable lut table or set VX_NN_ACTIVATION_CUSTOM to customize lut table or set others to use fixed lut table */ + vx_float32 float_values[4]; /*!< \brief Float parameters of fixed lut table */ + vx_uint32 fvalues_count; /*!< \brief Count of float_values */ + vx_int32 int_values[4]; /*!< \brief Int parameters of fixed lut table */ + vx_uint32 ivalues_count; /*!< \brief Count of int_values */ + vx_lut in_lut; /*!< \brief Only valid when lut_function is VX_NN_ACTIVATION_CUSTOM */ + vx_lut out_lut; /*!< \brief Only valid when lut_function is VX_NN_ACTIVATION_CUSTOM */ +} vx_lut_params_s, * vx_lut_params; + +/*! \brief Create a stream processor node. + * \param [in] graph The reference to the graph. + * \param [in] input_list The input tensor list. + * \param [in] input_count The input tensor count. + * \param [in] output_list The output tensor list. + * \param [in] output_count The output tensor count. + * \param [in] spinst_obj The stream processor instrunction object. Use vxCreateSPINST() to create. + * \param [in] lut_params The lut parameters. Refer to vx_lut_params_s. + * \return \ref vx_node. + * \retval vx_node A node reference. Any possible errors preventing a successful creation + * should be checked using \ref vxGetStatus + * \ingroup group_vision_function_sp + */ +VX_API_ENTRY vx_node VX_API_CALL vxStreamProcessorNode( + vx_graph graph, + vx_tensor* input_list, + vx_uint32 input_count, + vx_tensor* output_list, + vx_uint32 output_count, + vx_spinst spinst_obj, + vx_lut_params lut_params + ); + #ifdef __cplusplus } #endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h new file mode 100644 index 0000000..bcfe401 --- /dev/null +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_spinst.h @@ -0,0 +1,332 @@ +/**************************************************************************** +* +* Copyright 2017 - 2021 Vivante Corporation, Santa Clara, California. +* All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* 'Software'), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sub license, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject +* to the following conditions: +* +* The above copyright notice and this permission notice (including the +* next paragraph) shall be included in all copies or substantial +* portions of the Software. +* +* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+* IN NO EVENT SHALL VIVANTE AND/OR ITS SUPPLIERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VX_SPINST_H_ +#define _VX_SPINST_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum _vx_sp_inst_type_e +{ + VX_SP_INST_TYPE_FADD, + VX_SP_INST_TYPE_FMULT, + VX_SP_INST_TYPE_MOVE, + VX_SP_INST_TYPE_PWL, + + VX_SP_INST_TYPE_COUNT, +} +vx_sp_inst_type_e; + +typedef enum _vx_sp_inst_type_fadd_e +{ + VX_SP_INST_TYPE_FADD_IDLE, // FADD-IDLE + VX_SP_INST_TYPE_FADD_ADD, // dst = src0 + src1 + VX_SP_INST_TYPE_FADD_SUB, // dst = src0 - src1 + + VX_SP_INST_TYPE_FADD_COUNT, +} +vx_sp_inst_type_fadd_e; + +typedef enum _vx_sp_inst_type_fmult_e +{ + VX_SP_INST_TYPE_FMULT_IDLE, /* FMULT-IDLE */ + VX_SP_INST_TYPE_FMULT_MUL, /* dst = src0 * src1 */ + VX_SP_INST_TYPE_FMULT_MUL_CLAMP, /* dst = clamp (src0, src1, R6, R7) */ + + VX_SP_INST_TYPE_FMULT_COUNT, +} +vx_sp_inst_type_fmult_e; + +typedef enum _vx_sp_inst_type_move_e +{ + VX_SP_INST_TYPE_MOVE_IDLE, + VX_SP_INST_TYPE_MOVE_MOVE, // dst = src1 + VX_SP_INST_TYPE_MOVE_SEL0, // dst = (src0 > 0) ? src1[0] : src1[1] + VX_SP_INST_TYPE_MOVE_SEL1, // dst = (src0 > 0) ? src1 : FA-src0 // use FA's SRC0 + VX_SP_INST_TYPE_MOVE_IMMD, // dst = Constant assign immmediate + VX_SP_INST_TYPE_MOVE_ABS, // dst = abs(src1) + + VX_SP_INST_TYPE_MOVE_COUNT, +} +vx_sp_inst_type_move_e; + +typedef enum _vx_sp_inst_type_pwl_e +{ + VX_SP_INST_TYPE_PWL_IDLE, + VX_SP_INST_TYPE_PWL_SETUP_0, /* PWL ID = 0 */ + VX_SP_INST_TYPE_PWL_SETUP_1, /* Sigmode() */ + VX_SP_INST_TYPE_PWL_SETUP_2, /* Tanh() */ + + VX_SP_INST_TYPE_PWL_COUNT, +} +vx_sp_inst_type_pwl_e; + +typedef enum _vx_sp_inst_src_dst_e +{ + VX_SP_INST_SPINOUT, + VX_SP_INST_SR1, + VX_SP_INST_SR2, + VX_SP_INST_SR3, + VX_SP_INST_SR4, + VX_SP_INST_SR5, + VX_SP_INST_SR6, /* nn_clamp_min */ + VX_SP_INST_SR7, /* nn_clamp_max */ + VX_SP_INST_SR8, + VX_SP_INST_SR9, + VX_SP_INST_SR10, + VX_SP_INST_VR11, + VX_SP_INST_VR12, + VX_SP_INST_VR13, + VX_SP_INST_VR14, + VX_SP_INST_SETUPOUT, /* Input of PWL Mult and Add: FMInA, FMInB, FAInA, FAInB */ +} +vx_sp_inst_src_dst_e; + +typedef struct _vx_spinst_unit_param +{ + vx_enum op; /* vx_sp_inst_type_e */ + + struct + { + vx_enum op; /* vx_sp_inst_type_fadd/fmult/move/pwl_e */ + + struct + { + vx_uint8 src0; /* vx_sp_inst_src_dst_e */ + vx_uint8 src1; /* vx_sp_inst_src_dst_e */ + vx_uint8 dst; /* vx_sp_inst_src_dst_e */ + vx_float32 constant; + } var; + + } sub; + +} +vx_spinst_unit_param; + +/**********************************************************************************************/ + +typedef enum _vx_sp_attribute_e +{ + VX_SP_ATTRIBUTE_NONE, + + VX_SP_ATTRIBUTE_INPUT_TILE_MAPPING, + VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_X, + VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_Y, + VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_Z, + + VX_SP_ATTRIBUTE_PROG_INIT_INSTR_NUM, + VX_SP_ATTRIBUTE_PROG_LOOP_INSTR_NUM, + VX_SP_ATTRIBUTE_PROG_COMPLETE_INSTR_NUM, + VX_SP_ATTRIBUTE_PROG_ROUNDING_MODE, + VX_SP_ATTRIBUTE_INPUT_SETUP, + + VX_SP_ATTRIBUTE_IGNORED_LEADING_OUTPUTS, + VX_SP_ATTRIBUTE_FLUSH_CYCLE_NUM, + VX_SP_ATTRIBUTE_IGNORED_LEADING_V11_WR, + VX_SP_ATTRIBUTE_IGNORED_LEADING_V12_WR, + VX_SP_ATTRIBUTE_IGNORED_LEADING_V11_RD, + VX_SP_ATTRIBUTE_IGNORED_LEADING_V12_RD, + + VX_SP_ATTRIBUTE_CH0_POST_REDISTRIBUTE, + VX_SP_ATTRIBUTE_CH1_POST_REDISTRIBUTE, + 
VX_SP_ATTRIBUTE_V11_RESET_AT_START, + VX_SP_ATTRIBUTE_V12_RESET_AT_START, + VX_SP_ATTRIBUTE_V11_POP_CONFIG, + VX_SP_ATTRIBUTE_V12_POP_CONFIG, + VX_SP_ATTRIBUTE_ACCELERATOR_INPUT_SELECT, + VX_SP_ATTRIBUTE_IGNORED_LEADING_ACC_OUT, + VX_SP_ATTRIBUTE_SUM_ENGINE_RESET, + VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL, + VX_SP_ATTRIBUTE_SUM_ENGINE_NUM_CH_MINUS_ONE, + VX_SP_ATTRIBUTE_SUM_ENGINE_2D_ACCUM_STORAGE, + + VX_SP_ATTRIBUTE_GENERAL_COUNT, + + VX_SP_ATTRIBUTE_CONST0, /* NN post multiplier */ + VX_SP_ATTRIBUTE_CONST1, /* NN neg pos multiplier */ + VX_SP_ATTRIBUTE_CONST2, /* NN tensor add const */ + VX_SP_ATTRIBUTE_CONST3, /* NN clamp max */ + VX_SP_ATTRIBUTE_CONST4, /* NN clmap min */ + + VX_SP_ATTRIBUTE_TOTAL_COUNT, +} +vx_sp_attribute_e; + +typedef enum _vx_sp_attribute_input_tile_mapping_e +{ + VX_SP_ATTRIBUTE_INPUT_TILE_MAPPING_XYMERGE, + VX_SP_ATTRIBUTE_INPUT_TILE_MAPPING_YZMERGE, +} +vx_sp_attribute_input_tile_mapping_e; + +typedef enum _vx_sp_attribute_output_collapse_e +{ + VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_DISABLED, + VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_ENABLED, +} +vx_sp_attribute_output_collapse_e; + +typedef enum _vx_sp_attribute_rounding_mode_e +{ + VX_SP_ATTRIBUTE_PROG_ROUNDING_MODE_RTNE, + VX_SP_ATTRIBUTE_PROG_ROUNDING_MODE_STICKY, +} +vx_sp_attribute_rounding_mode_e; + +typedef enum _vx_sp_attribute_input_setup_e +{ + VX_SP_ATTRIBUTE_INPUT_SETUP_SINGLE_INPUT, + VX_SP_ATTRIBUTE_INPUT_SETUP_INTERLEAVE_TWO_INPUTS, + VX_SP_ATTRIBUTE_INPUT_SETUP_V11, + VX_SP_ATTRIBUTE_INPUT_SETUP_V12, +} +vx_sp_attribute_input_setup_e; + +typedef enum _vx_sp_attribute_ch_post_redistribute_e +{ + VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_DISABLED, + VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_SCALAR_GATHER, + VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_VECTOR_GATHER, + VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_VECTOR_SCATTER, +} +vx_sp_attribute_ch_post_redistribute_e; + +typedef enum _vx_sp_attribute_v_reset_at_start_e +{ + VX_SP_ATTRIBUTE_V_RESET_AT_START_NONE, + VX_SP_ATTRIBUTE_V_RESET_AT_START_RESET, +} +vx_sp_attribute_v_reset_at_start_e; + +typedef enum _vx_sp_attribute_v_pop_config_e +{ + VX_SP_ATTRIBUTE_V_POP_CONFIG_EVERY_READ, + VX_SP_ATTRIBUTE_V_POP_CONFIG_EVERY_ROW, +} +vx_sp_attribute_v_pop_config_e; + +typedef enum _vx_sp_attribute_accelerator_input_select_e +{ + VX_SP_ATTRIBUTE_ACCELERATOR_INPUT_SELECT_FROM_OUTPUT, + VX_SP_ATTRIBUTE_ACCELERATOR_INPUT_SELECT_FROM_ACCLERATOR, +} +vx_sp_attribute_accelerator_input_select_e; + +typedef enum _vx_sp_attribute_sum_engine_reset_e +{ + VX_SP_ATTRIBUTE_SUM_ENGINE_RESET_NONE, + VX_SP_ATTRIBUTE_SUM_ENGINE_RESET_RESET, +} +vx_sp_attribute_sum_engine_reset_e; + +typedef enum _vx_sp_attribute_sum_engine_control_e +{ + VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL_ACCUM_INTERNAL, + VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL_ACCUM_1D, + VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL_ACCUM_2D, +} +vx_sp_attribute_sum_engine_control_e; + +typedef enum _vx_sp_attribute_sum_engine_num_ch_minus_one_e +{ + VX_SP_ATTRIBUTE_SUM_ENGINE_NUM_CH_MINUS_ONE_ONE_CH, + VX_SP_ATTRIBUTE_SUM_ENGINE_NUM_CH_MINUS_ONE_TWO_CH, +} +vx_sp_attribute_sum_engine_num_ch_minus_one_e; + +typedef enum _vx_sp_attribute_sum_engine_2d_accum_storage_e +{ + VX_SP_ATTRIBUTE_SUM_ENGINE_2D_ACCUM_STORAGE_SAME, + VX_SP_ATTRIBUTE_SUM_ENGINE_2D_ACCUM_STORAGE_DIFFERENT, +} +vx_sp_attribute_sum_engine_2d_accum_storage_e; + +/**********************************************************************************************/ + +/*! \brief Creates an opaque reference to a spinst data. + * \param [in] context The reference to the implementation context. 
+ * \return A spinst data reference. + * \Any possible errors preventing a successful creation should be checked using \ref vxGetStatus. + * \ingroup group_object_spinst + */ +VX_API_ENTRY vx_spinst VX_API_CALL vxCreateSPINST( + vx_context context + ); + +/*! \brief Releases a reference to a spinst object. + * The object may not be garbage collected until its total reference count is zero. + * \param [in] spinst_obj The pointer to the spinst data to release. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors; all other values indicate failure + * \retval * An error occurred. See \ref vx_status_e. + * \ingroup group_object_spinst + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseSPINST( + vx_spinst *spinst_obj + ); + +/*! \brief Add a instruction to spinst object. + * \param [in] spinst_obj The reference to the spinst object. + * \param [in] inst_unit_array The units of one instruction. Use a \ref vx_spinst_unit_param. + * \param [in] inst_unit_count The count of instruction units. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If data is not a \ref spinst_obj. + * \retval VX_ERROR_INVALID_PARAMETERS If any of parameters is incorrect. + * \retval VX_ERROR_NO_MEMORY If fail to allocate internal instruction memory. + * \ingroup group_object_spinst + */ +VX_API_ENTRY vx_status VX_API_CALL vxAddOneInstToSPINST( + vx_spinst spinst_obj, + vx_spinst_unit_param* inst_unit_array, + vx_uint8 inst_unit_count + ); + +/*! \brief Set various attributes of a spinst data. + * \param [in] spinst_obj The reference to the vx_spinst object to set. + * \param [in] attribute The attribute to set. Use a \ref vx_sp_attribute_e. + * \param [in] value The value of attribute. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If data is not a \ref vx_spinst. + * \retval VX_ERROR_INVALID_PARAMETERS If any of attribute is incorrect. + * \ingroup group_object_spinst + */ +VX_API_ENTRY vx_status VX_API_CALL vxSetAttributeToSPINST( + vx_spinst spinst_obj, + vx_enum attribute, + vx_uint32 value + ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h index 0dbdcc8..e10a32e 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h @@ -342,6 +342,10 @@ typedef struct _vx_tensorpatch_addressing_t * vx_trensor_addressing; */ typedef struct _vx_weights_biases_parameter_s * vx_weights_biases_parameter; +/*! \brief The object for stream processor + * \ingroup group_spinst + */ +typedef struct _vx_spinst_s * vx_spinst; /*! \brief A Boolean value. * This allows 0 to be FALSE, as it is in C, and any non-zero to be TRUE. @@ -470,6 +474,7 @@ enum vx_type_e { /* \todo add new object types here */ VX_TYPE_BFLOAT16 = 0x81A,/*!< \brief A \ref vx_bfloat16. */ + VX_TYPE_SPINST = 0x81B,/*!< \brief A \ref vx_spinst. */ VX_TYPE_INT4 = 0x81C,/*!< \brief A \ref signed 4bits tensor.. */ VX_TYPE_UINT4 = 0x81D,/*!< \brief A \ref unsigned 4bits tensor.. */ }; @@ -1021,6 +1026,8 @@ enum vx_node_attribute_e { VX_NODE_ATTRIBUTE_CONST_TENSOR_CACHE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x9, + VX_NODE_ATTRIBUTE_FOR_HW_QUALITY = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0xA, + }; /*! 
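The SPINST and stream processor additions above fit together roughly as follows. This is an illustrative sketch, not code from the patch: the single FADD instruction, the loop-instruction count of 1, and the assumption that the node keeps its own reference to the spinst object are made up for the example.

    #include <VX/vx.h>
    #include <VX/vx_spinst.h>
    #include <VX/vx_nodes.h>
    #include <VX/vx_khr_nn.h>
    #include <string.h>

    /* Hypothetical helper: program "out = in0 + in1" on the FADD unit and
     * attach it to the graph as a stream processor node. */
    static vx_node add_stream_processor_node(vx_context context,
                                             vx_graph graph,
                                             vx_tensor in0, vx_tensor in1,
                                             vx_tensor out)
    {
        vx_tensor inputs[2]  = { in0, in1 };
        vx_tensor outputs[1] = { out };
        vx_spinst sp = vxCreateSPINST(context);
        vx_spinst_unit_param unit;
        vx_lut_params_s lut;
        vx_node node;

        memset(&unit, 0, sizeof(unit));
        unit.op           = VX_SP_INST_TYPE_FADD;      /* use the FADD unit */
        unit.sub.op       = VX_SP_INST_TYPE_FADD_ADD;  /* dst = src0 + src1 */
        unit.sub.var.src0 = VX_SP_INST_SR1;
        unit.sub.var.src1 = VX_SP_INST_SR2;
        unit.sub.var.dst  = VX_SP_INST_SPINOUT;
        vxAddOneInstToSPINST(sp, &unit, 1);

        /* One instruction in the loop body (attribute value assumed here). */
        vxSetAttributeToSPINST(sp, VX_SP_ATTRIBUTE_PROG_LOOP_INSTR_NUM, 1);

        memset(&lut, 0, sizeof(lut));
        lut.lut_function = VX_NN_ACTIVATION_NONE;      /* no LUT stage */

        node = vxStreamProcessorNode(graph, inputs, 2, outputs, 1, sp, &lut);
        vxReleaseSPINST(&sp);   /* assumed: the node holds its own reference */
        return node;
    }
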
\brief The parameter attributes list @@ -1290,6 +1297,9 @@ enum vx_tensor_attribute_e VX_TENSOR_LIFETIME = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x5, /*! \brief the value status of tensor. */ VX_TENSOR_VALUE = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x6, + /*XiaoMi project*/ + VX_TENSOR_INPUT_FOR_REFERENCE = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x7, + VX_TENSOR_MEMORY_ATTRIBUTE = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x8, }; /*! \brief The meta valid rectangle attributes. diff --git a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so index 5f9565c..793d7a3 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so and b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libCLC.so b/prebuilt-sdk/x86_64_linux/lib/libCLC.so index d278960..b86b927 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libCLC.so and b/prebuilt-sdk/x86_64_linux/lib/libCLC.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so index 213d250..1b79027 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so and b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libGAL.so b/prebuilt-sdk/x86_64_linux/lib/libGAL.so index 434ffc4..06d1f8a 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libGAL.so and b/prebuilt-sdk/x86_64_linux/lib/libGAL.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so index d88e0ce..62fca2c 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so and b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so deleted file mode 120000 index 664ae82..0000000 --- a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so +++ /dev/null @@ -1 +0,0 @@ -libOpenVX.so.1.3.0 \ No newline at end of file diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so new file mode 100755 index 0000000..6d83612 Binary files /dev/null and b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1 b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1 deleted file mode 120000 index 664ae82..0000000 --- a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1 +++ /dev/null @@ -1 +0,0 @@ -libOpenVX.so.1.3.0 \ No newline at end of file diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1 b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1 new file mode 100755 index 0000000..6d83612 Binary files /dev/null and b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1 differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 index ebea7d4..6d83612 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 and b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so index ee7b8f8..6658be6 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so and b/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so index 2339562..2c6a14b 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so and b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so differ diff 
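A minimal sketch (not part of the patch) of the intended use of the new VX_TENSOR_MEMORY_ATTRIBUTE together with the vx_tensor_memory_attribute_e flags added in vx_khr_nn.h. The vx_uint32 bitmask value type is an assumption; the hunks do not state the expected parameter size.

    #include <VX/vx.h>
    #include <VX/vx_types.h>
    #include <VX/vx_khr_nn.h>

    /* Hypothetical helper: bind a tensor to fixed DMA channel 2. */
    static vx_status bind_tensor_to_dma_channel2(vx_tensor tensor)
    {
        vx_uint32 mem_attr = VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_2;
        return vxSetTensorAttribute(tensor, VX_TENSOR_MEMORY_ATTRIBUTE,
                                    &mem_attr, sizeof(mem_attr));
    }
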
--git a/prebuilt-sdk/x86_64_linux/lib/libVSC.so b/prebuilt-sdk/x86_64_linux/lib/libVSC.so index bb370e9..3621c9a 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libVSC.so and b/prebuilt-sdk/x86_64_linux/lib/libVSC.so differ diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index fa4dc17..ae52716 100644 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -172,3 +172,10 @@ DEF_OP(PRE_PROCESS_RGB888_PLANAR) DEF_OP(GATHER_ELEMENTS) DEF_OP(SELU) DEF_OP(CELU) +DEF_OP(MAX_POOL3D) +DEF_OP(RCP) +DEF_OP(SIGN) +DEF_OP(SOFTSIGN) +DEF_OP(CUMSUM) +DEF_OP(MAXPOOLWITHARGMAX) +DEF_OP(MOD) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_gpu_config.h b/src/tim/vx/internal/include/kernel/vsi_nn_gpu_config.h index a7ce5e3..a5dcb34 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_gpu_config.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_gpu_config.h @@ -25,10 +25,13 @@ #ifndef _VSI_NN_GPU_CONFIG_H #define _VSI_NN_GPU_CONFIG_H -#define GPU_TENSOR_MAX_WIDTH (65536) +#ifdef VSI_40BIT_VA_SUPPORT +#define GPU_TENSOR_MAX_WIDTH (1 << 30) +#else +#define GPU_TENSOR_MAX_WIDTH (1 << 16) +#endif #define GPU_MAX_MULTIPLIER_NUM (65535) #define GPU_MAX_POST_SHIFT_BITS (31) #define GPU_TENSOR_DIM_2 (2) #endif - diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h index 501dd5d..7d75720 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h @@ -156,6 +156,8 @@ typedef struct vsi_nn_kernel_quant_asymm_t asymm; vsi_nn_kernel_quant_asymm_perchannel_t asymm_v; }; + float scale; + int32_t zero_point; } vsi_nn_kernel_tensor_attr_t; typedef struct @@ -411,7 +413,7 @@ vsi_status vsi_nn_kernel_node_pass_param size_t num ); -static inline void vsi_nn_kernel_node_release +static VSI_INLINE_API void vsi_nn_kernel_node_release ( vsi_nn_kernel_node_t * node ) @@ -422,7 +424,7 @@ static inline void vsi_nn_kernel_node_release } } -static inline void vsi_nn_kernel_node_pack_io +static VSI_INLINE_API void vsi_nn_kernel_node_pack_io ( vsi_nn_kernel_node_param_t * params, size_t param_num, @@ -476,7 +478,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector ); /** Map data type to gpu internal dtype. 
*/ -static inline vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype +static VSI_INLINE_API vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype ( vsi_nn_type_e dtype ) @@ -516,7 +518,7 @@ static inline vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype return I8; } /* vsi_nn_kernel_map_dtype() */ -static inline vsi_nn_type_e vsi_nn_dtype_map_kernel +static VSI_INLINE_API vsi_nn_type_e vsi_nn_dtype_map_kernel ( vsi_nn_kernel_dtype_e dtype ) @@ -556,7 +558,7 @@ static inline vsi_nn_type_e vsi_nn_dtype_map_kernel return VSI_NN_TYPE_INT8; } /* vsi_nn_kernel_map_dtype() */ -static inline size_t vsi_nn_kernel_dtype_get_bytes +static VSI_INLINE_API size_t vsi_nn_kernel_dtype_get_bytes ( vsi_nn_kernel_dtype_e dtype ) @@ -585,7 +587,7 @@ static inline size_t vsi_nn_kernel_dtype_get_bytes return 0; } /* vsi_nn_kernel_dtype_get_bytes() */ -static inline vsi_size_t vsi_nn_kernel_dtype_get_bits +static VSI_INLINE_API vsi_size_t vsi_nn_kernel_dtype_get_bits ( vsi_nn_kernel_dtype_e dtype ) @@ -617,7 +619,7 @@ static inline vsi_size_t vsi_nn_kernel_dtype_get_bits return 0; } /* vsi_nn_kernel_dtype_get_bits() */ -static inline vsi_nn_kernel_quant_type_e vsi_nn_kernel_map_quant_type +static VSI_INLINE_API vsi_nn_kernel_quant_type_e vsi_nn_kernel_map_quant_type ( vsi_nn_qnt_type_e quant_type ) { switch( quant_type ) @@ -658,7 +660,7 @@ vsi_nn_kernel_scalar_t vsi_nn_kernel_scalar_create const void * data ); -static inline void vsi_nn_kernel_scalar_release +static VSI_INLINE_API void vsi_nn_kernel_scalar_release ( vsi_nn_kernel_scalar_t * scalar ) { if( scalar && *scalar ) @@ -803,7 +805,7 @@ vsi_status vsi_nn_kernel_tensor_write size_t size ); -static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_size +static VSI_INLINE_API vsi_size_t vsi_nn_kernel_tensor_attr_get_size ( const vsi_nn_kernel_tensor_attr_t * attr ) { if( !attr ) @@ -813,7 +815,7 @@ static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_size return vsi_nn_shape_get_size( attr->shape->data, (vsi_size_t)attr->shape->size ); } /* vsi_nn_kernel_tensor_attr_get_size() */ -static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_bytes +static VSI_INLINE_API vsi_size_t vsi_nn_kernel_tensor_attr_get_bytes ( const vsi_nn_kernel_tensor_attr_t * attr ) { vsi_size_t i = 0; @@ -851,7 +853,7 @@ static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_bytes return bytes; } /* vsi_nn_kernel_tensor_attr_get_bytes() */ -static inline void vsi_nn_kernel_tensor_attr_get_stride +static VSI_INLINE_API void vsi_nn_kernel_tensor_attr_get_stride ( const vsi_nn_kernel_tensor_attr_t * attr, vsi_size_t * out_stride) { vsi_size_t type_bits; @@ -902,7 +904,7 @@ static inline void vsi_nn_kernel_tensor_attr_get_stride } } /* vsi_nn_kernel_tensor_attr_get_size() */ -static inline vsi_bool vsi_nn_kernel_tensor_attr_is_quantized +static VSI_INLINE_API vsi_bool vsi_nn_kernel_tensor_attr_is_quantized ( const vsi_nn_kernel_tensor_attr_t * attr ) { return ( attr && attr->quant > VSI_NN_KERNEL_QUANT_NONE @@ -1072,7 +1074,7 @@ OVXLIB_API vsi_status vsi_nn_KernelGpuConfig const gpu_param_t * gpu_param ); -static inline const char* vsi_nn_kernel_type_str +static VSI_INLINE_API const char* vsi_nn_kernel_type_str ( vsi_nn_kernel_type_e type ) @@ -1095,7 +1097,7 @@ static inline const char* vsi_nn_kernel_type_str return "None"; } /* vsi_nn_kernel_type_str() */ -static inline vsi_status vsi_nn_kernel_unpack_4bit_data +static VSI_INLINE_API vsi_status vsi_nn_kernel_unpack_4bit_data ( const vsi_nn_kernel_tensor_attr_t * attr, uint8_t * src, @@ -1162,7 +1164,7 @@ static inline vsi_status 
vsi_nn_kernel_unpack_4bit_data return status; } -static inline vsi_status vsi_nn_kernel_pack_4bit_data +static VSI_INLINE_API vsi_status vsi_nn_kernel_pack_4bit_data ( const vsi_nn_kernel_tensor_attr_t * attr, uint8_t * src, diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h index 53c4969..c872cca 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h @@ -46,6 +46,8 @@ typedef int32_t vsi_nn_kernel_lut_act_e; enum VSI_NN_KERNEL_LUT_CLIP = 12, VSI_NN_KERNEL_LUT_SQUARE = 13, VSI_NN_KERNEL_LUT_CELU = 14, + VSI_NN_KERNEL_LUT_RCP = 15, + VSI_NN_KERNEL_LUT_SOFTSIGN = 16, }; #define VSI_NN_KERNEL_LUT_MAX_SIZE (1024) diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_crop.h b/src/tim/vx/internal/include/ops/vsi_nn_op_crop.h index 5a74974..3a9e98b 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_crop.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_crop.h @@ -30,11 +30,20 @@ extern "C" { #endif +typedef struct _vsi_nn_crop_lcl_data +{ + vx_int32 begin_dims[VSI_NN_MAX_DIM_NUM]; + vx_int32 end_dims[VSI_NN_MAX_DIM_NUM]; + vx_int32 stride_dims[VSI_NN_MAX_DIM_NUM]; +} vsi_nn_crop_lcl_data; + typedef struct _vsi_nn_crop_param { int32_t axis; uint32_t dims; uint32_t offset[VSI_NN_MAX_DIM_NUM]; + + vsi_nn_crop_lcl_data *lcl_data; } vsi_nn_crop_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_cumsum.h b/src/tim/vx/internal/include/ops/vsi_nn_op_cumsum.h new file mode 100644 index 0000000..f016884 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_cumsum.h @@ -0,0 +1,45 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CUMSUM_H +#define _VSI_NN_OP_CUMSUM_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_cumsum_param +{ + int32_t axis; + vsi_bool exclusive; + vsi_bool reverse; +} vsi_nn_cumsum_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_max_pool3d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_max_pool3d.h new file mode 100644 index 0000000..043d9e0 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_max_pool3d.h @@ -0,0 +1,55 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
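A minimal sketch (not part of the patch) of how an ovxlib client is expected to instantiate the new CUMSUM operation and fill vsi_nn_cumsum_param. vsi_nn_AddNode() and the VSI_NN_OP_CUMSUM id come from the existing ovxlib graph API and the ops.def entry added above; the nn_param.cumsum union member is added later in this patch in vsi_nn_node_type.h.

    #include "vsi_nn_pub.h"

    /* Hypothetical helper: add a cumulative-sum node along axis 0. */
    static vsi_nn_node_t *add_cumsum_node(vsi_nn_graph_t *graph)
    {
        vsi_nn_node_t *node = vsi_nn_AddNode(graph, VSI_NN_OP_CUMSUM, 1, 1, NULL);
        if (node)
        {
            node->nn_param.cumsum.axis      = 0;      /* accumulate along the innermost axis */
            node->nn_param.cumsum.exclusive = FALSE;
            node->nn_param.cumsum.reverse   = FALSE;
        }
        return node;
    }
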
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_MAX_POOL3D_H +#define _VSI_NN_OP_MAX_POOL3D_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_max_pool3d_param +{ + struct _max_pool3d_local_data_t* local; + // Add parameters here + + /* round_type is used to calculate the output shape */ + vsi_nn_round_type_e round_type; + uint32_t ksize[3]; + uint32_t stride[3]; + /* Pad left, right, top, bottom */ + uint32_t pad[6]; + /* Pad type default value shall be AUTO */ + vsi_nn_pad_e pad_type; +} vsi_nn_max_pool3d_param; +_compiler_assert(offsetof(vsi_nn_max_pool3d_param, local) == 0, \ + vsi_nn_max_pool3d_h ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_mod.h b/src/tim/vx/internal/include/ops/vsi_nn_op_mod.h new file mode 100644 index 0000000..9d8d73c --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_mod.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_MOD_H +#define _VSI_NN_OP_MOD_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_mod_param +{ + int32_t fmod; +} vsi_nn_mod_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_rcp.h b/src/tim/vx/internal/include/ops/vsi_nn_op_rcp.h new file mode 100644 index 0000000..201762b --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_rcp.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
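Likewise, a minimal sketch (not part of the patch) of filling the new vsi_nn_max_pool3d_param on a node created through the existing ovxlib graph API. The VSI_NN_OP_MAX_POOL3D id comes from the DEF_OP(MAX_POOL3D) entry above; VSI_NN_ROUND_FLOOR and VSI_NN_PAD_AUTO are assumed from ovxlib's existing enums.

    #include "vsi_nn_pub.h"

    /* Hypothetical helper: add a 2x2x2 max pool3d node with no padding. */
    static vsi_nn_node_t *add_max_pool3d_node(vsi_nn_graph_t *graph)
    {
        vsi_nn_node_t *node = vsi_nn_AddNode(graph, VSI_NN_OP_MAX_POOL3D, 1, 1, NULL);
        if (node)
        {
            uint32_t i;
            for (i = 0; i < 3; i++)
            {
                node->nn_param.max_pool3d.ksize[i]  = 2;
                node->nn_param.max_pool3d.stride[i] = 2;
            }
            for (i = 0; i < 6; i++)
            {
                node->nn_param.max_pool3d.pad[i] = 0;
            }
            node->nn_param.max_pool3d.pad_type   = VSI_NN_PAD_AUTO;
            node->nn_param.max_pool3d.round_type = VSI_NN_ROUND_FLOOR;
        }
        return node;
    }
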
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_RCP_H +#define _VSI_NN_OP_RCP_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_rcp_param +{ + struct _rcp_local_data_t* local; + // Add parameters here +} vsi_nn_rcp_param; +_compiler_assert(offsetof(vsi_nn_rcp_param, local) == 0, \ + vsi_nn_rcp_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_sign.h b/src/tim/vx/internal/include/ops/vsi_nn_op_sign.h new file mode 100644 index 0000000..f596802 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_sign.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_SIGN_H +#define _VSI_NN_OP_SIGN_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_sign_param +{ + struct _sign_local_data_t* local; + // Add parameters here +} vsi_nn_sign_param; +_compiler_assert(offsetof(vsi_nn_sign_param, local) == 0, \ + vsi_nn_sign_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_softsign.h b/src/tim/vx/internal/include/ops/vsi_nn_op_softsign.h new file mode 100644 index 0000000..97bf611 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_softsign.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_SOFTSIGN_H +#define _VSI_NN_OP_SOFTSIGN_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_softsign_param +{ + struct _softsign_local_data_t* local; + // Add parameters here +} vsi_nn_softsign_param; +_compiler_assert(offsetof(vsi_nn_softsign_param, local) == 0, \ + vsi_nn_softsign_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h index 4586fa8..4e19fc0 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h @@ -32,7 +32,7 @@ extern "C" { #endif -static inline vsi_bool type_is_integer +static VSI_INLINE_API vsi_bool type_is_integer ( const vsi_nn_type_e type ) @@ -60,7 +60,7 @@ static inline vsi_bool type_is_integer return ret; } /* type_is_integer() */ -static inline vsi_bool type_is_signed +static VSI_INLINE_API vsi_bool type_is_signed ( const vsi_nn_type_e type ) @@ -86,7 +86,7 @@ static inline vsi_bool type_is_signed return ret; } /* type_is_signed() */ -static inline uint32_t type_get_bytes +static VSI_INLINE_API uint32_t type_get_bytes ( const vsi_nn_type_e type ) @@ -115,7 +115,7 @@ static inline uint32_t type_get_bytes } } /* type_get_bytes() */ -static inline uint32_t type_get_bits +static VSI_INLINE_API uint32_t type_get_bits ( const vsi_nn_type_e type ) @@ -147,7 +147,7 @@ static inline uint32_t type_get_bits } } /* type_get_bits() */ -static inline void type_get_range +static VSI_INLINE_API void type_get_range ( vsi_nn_type_e type, double * max_range, @@ -186,7 +186,24 @@ static inline void type_get_range } } /* type_get_range() */ -static inline int32_t fp32_to_affine +static VSI_INLINE_API vsi_bool fp32_is_inf + ( + float val + ) +{ + uint32_t u_value = *(uint32_t*)&val; + + if ((u_value & (uint32_t)VSI_NN_INT32_MAX) == (uint32_t)VSI_NN_FLOAT32_INF) + { + return TRUE; + } + else + { + return FALSE; + } +} + +static VSI_INLINE_API int32_t fp32_to_affine ( const float in, const float scale, @@ -200,10 +217,17 @@ static inline int32_t fp32_to_affine type_get_range( type, &max_range, &min_range ); data = (int32_t)(vsi_rint( in / scale ) + zero_point ); data = vsi_nn_max( (int32_t)min_range, vsi_nn_min( (int32_t)max_range , data ) ); + + if (fp32_is_inf(in) != 0) + { + uint32_t sign = (*(uint32_t*)&in) >> 31; + data = sign == 1 ? (int32_t)min_range : (int32_t)max_range; + } + return data; } /* fp32_to_affine() */ -static inline float affine_to_fp32 +static VSI_INLINE_API float affine_to_fp32 ( const int32_t val, const float scale, @@ -216,7 +240,7 @@ static inline float affine_to_fp32 return data; } /* affine_to_fp32() */ -static inline int32_t fp32_to_dfp +static VSI_INLINE_API int32_t fp32_to_dfp ( const float in, const int8_t fl, @@ -237,10 +261,17 @@ static inline int32_t fp32_to_dfp } data = vsi_nn_min( data, (int32_t)max_range ); data = vsi_nn_max( data, (int32_t)min_range ); + + if (fp32_is_inf(in) != 0) + { + uint32_t sign = (*(uint32_t*)&in) >> 31; + data = sign == 1 ? 
(int32_t)min_range : (int32_t) max_range; + } + return data; } /* fp32_to_dfp() */ -static inline float dfp_to_fp32 +static VSI_INLINE_API float dfp_to_fp32 ( const int32_t val, const int8_t fl, @@ -259,7 +290,7 @@ static inline float dfp_to_fp32 return result; } /* dfp_to_fp32() */ -static inline vsi_status integer_convert +static VSI_INLINE_API vsi_status integer_convert ( const void * src, vsi_nn_type_e src_type, @@ -303,7 +334,7 @@ typedef union float f; } _fp32_t; -static inline float fp16_to_fp32 +static VSI_INLINE_API float fp16_to_fp32 ( int16_t in ) @@ -323,7 +354,7 @@ static inline float fp16_to_fp32 return o.f; } /* fp16_to_fp32() */ -static inline float bfp16_to_fp32 +static VSI_INLINE_API float bfp16_to_fp32 ( int16_t in ) @@ -344,7 +375,7 @@ static inline float bfp16_to_fp32 return t3 == 0 ? 0 : out; } /* bfp16_to_fp32() */ -static inline uint16_t fp32_to_fp16 +static VSI_INLINE_API uint16_t fp32_to_fp16 ( float in ) @@ -370,7 +401,7 @@ static inline uint16_t fp32_to_fp16 return (uint16_t) fp16; } /* fp32_to_fp16() */ -static inline uint16_t fp32_to_bfp16 +static VSI_INLINE_API uint16_t fp32_to_bfp16 ( float in ) @@ -381,7 +412,7 @@ static inline uint16_t fp32_to_bfp16 return (uint16_t) t1; } /* fp32_to_bfp16() */ -static inline uint16_t fp32_to_bfp16_rtne +static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne ( float in ) @@ -409,7 +440,7 @@ static inline uint16_t fp32_to_bfp16_rtne return out; } /* fp32_to_bfp16_rtne */ -static inline vsi_status dtype_to_float32 +static VSI_INLINE_API vsi_status dtype_to_float32 ( uint8_t *src, float *dst, @@ -461,7 +492,7 @@ static inline vsi_status dtype_to_float32 return VSI_SUCCESS; } -static inline vsi_status float32_to_dtype +static VSI_INLINE_API vsi_status float32_to_dtype ( float src, uint8_t *dst, diff --git a/src/tim/vx/internal/include/utils/vsi_nn_math.h b/src/tim/vx/internal/include/utils/vsi_nn_math.h index 18ea5e8..b8a6d2a 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_math.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_math.h @@ -42,6 +42,8 @@ extern "C" { #define vsi_clamp(x, min, max) vsi_nn_clamp(x, min, max) #define vsi_rtne(x) vsi_rint(x) +#define VSI_NN_INT32_MAX (0x7FFFFFFF) + #define VSI_NN_FLOAT32_INF (0x7F800000) #define VSI_NN_FLOAT32_NAN (0x7FC00000) #define VSI_NN_FLOAT64_INF (0x7FF0000000000000) @@ -53,14 +55,14 @@ extern "C" { size_t size; \ TYPE data[0]; \ } vsi_##NAME##_array_t; \ - static inline vsi_##NAME##_array_t * vsi_##NAME##_array_create( size_t size ) { \ + static VSI_INLINE_API vsi_##NAME##_array_t * vsi_##NAME##_array_create( size_t size ) { \ vsi_##NAME##_array_t * array = (vsi_##NAME##_array_t *)malloc( \ sizeof(vsi_##NAME##_array_t) + sizeof(TYPE) * size ); \ if (array == NULL) return NULL; \ array->size = size; \ return array; \ } \ - static inline void vsi_##NAME##_array_release( vsi_##NAME##_array_t ** array ) \ + static VSI_INLINE_API void vsi_##NAME##_array_release( vsi_##NAME##_array_t ** array ) \ { \ if( array && *array ) { \ free( *array ); \ @@ -167,7 +169,7 @@ void vsi_nn_random_uniform_transform uint32_t len ); -static inline double copy_sign +static VSI_INLINE_API double copy_sign ( double number, double sign @@ -177,7 +179,7 @@ static inline double copy_sign return (sign > 0) ? 
value : (-value); } /* copy_sign() */ -static inline float simple_round +static VSI_INLINE_API float simple_round ( float x ) @@ -185,7 +187,7 @@ static inline float simple_round return (float) copy_sign(floorf(fabsf(x) + 0.5f), x); } /* simple_round() */ -static inline double vsi_rint +static VSI_INLINE_API double vsi_rint ( double x ) diff --git a/src/tim/vx/internal/include/utils/vsi_nn_util.h b/src/tim/vx/internal/include/utils/vsi_nn_util.h index 9fb03d9..77b3cb6 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h @@ -65,7 +65,7 @@ extern "C" { #define VSI_NN_DO_JOIN(X, Y) VSI_NN_DO_JOIN2(X,Y) #define VSI_NN_DO_JOIN2(X, Y) X##Y -#if defined(_MSC_VER) +#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) #define VSI_NN_DEPRECATED(symbol, hints) \ __declspec(deprecated(VSI_NN_STRINGIZE(hints))) symbol @@ -381,7 +381,7 @@ int32_t vsi_nn_partition * @param[in] num Number of tensors. * @param[out] out_tensors Ordered tensors * */ -static inline void vsi_nn_reorder_tensor +static VSI_INLINE_API void vsi_nn_reorder_tensor ( vsi_nn_tensor_t** tensors, const int32_t* order, @@ -417,6 +417,15 @@ vsi_bool vsi_nn_is_broadcast_operaton vsi_nn_tensor_t * output ); +vsi_bool vsi_nn_is_broadcast_axes_operaton + ( + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t * output, + int32_t * axis, + int32_t axis_num + ); + float vsi_nn_get_tensor_scale ( vsi_nn_tensor_t * tensor diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h index 20a4dd1..95591ca 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -66,6 +66,8 @@ typedef struct _vsi_nn_hw_config_t uint32_t use_40bits_va; uint32_t support_stream_processor; uint32_t sp_exec_count; + uint32_t sp_vector_depth; + uint32_t sp_per_core_vector_depth; } vsi_nn_hw_config_t; typedef struct _vsi_nn_runtime_option_t diff --git a/src/tim/vx/internal/include/vsi_nn_daemon.h b/src/tim/vx/internal/include/vsi_nn_daemon.h index e005466..4fad88c 100644 --- a/src/tim/vx/internal/include/vsi_nn_daemon.h +++ b/src/tim/vx/internal/include/vsi_nn_daemon.h @@ -35,7 +35,7 @@ struct f##_t_{ ~f##_t_(void) { f(); }}; static f##_t_ f##_; \ static void f(void) -#elif defined(_MSC_VER) +#elif (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) #pragma section(".CRT$XCU", read) #define _INITIALIZER2(f, p) \ static void f(void); \ diff --git a/src/tim/vx/internal/include/vsi_nn_feature.h b/src/tim/vx/internal/include/vsi_nn_feature.h index 2ebb367..7297269 100644 --- a/src/tim/vx/internal/include/vsi_nn_feature.h +++ b/src/tim/vx/internal/include/vsi_nn_feature.h @@ -27,7 +27,7 @@ #include "vsi_nn_types.h" #include "vsi_nn_prv.h" -static inline vsi_bool vsi_nn_feature_conv_max_kernel_size() +static VSI_INLINE_API vsi_bool vsi_nn_feature_conv_max_kernel_size() { return 11; } diff --git a/src/tim/vx/internal/include/vsi_nn_log.h b/src/tim/vx/internal/include/vsi_nn_log.h index d3afaa2..d8b5bad 100644 --- a/src/tim/vx/internal/include/vsi_nn_log.h +++ b/src/tim/vx/internal/include/vsi_nn_log.h @@ -31,7 +31,7 @@ extern "C"{ #endif -#ifdef _MSC_VER +#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) #define snprintf(buffer, count, format, ...) 
\ _snprintf_s(buffer, count, _TRUNCATE, format, ##__VA_ARGS__) #define vsnprintf(buffer, count, format, args) \ diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index 16f74fa..5c170df 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -190,6 +190,12 @@ #include "ops/vsi_nn_op_gather_elements.h" #include "ops/vsi_nn_op_selu.h" #include "ops/vsi_nn_op_celu.h" +#include "ops/vsi_nn_op_max_pool3d.h" +#include "ops/vsi_nn_op_rcp.h" +#include "ops/vsi_nn_op_sign.h" +#include "ops/vsi_nn_op_softsign.h" +#include "ops/vsi_nn_op_cumsum.h" +#include "ops/vsi_nn_op_mod.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" @@ -365,6 +371,12 @@ typedef union _vsi_nn_nn_param vsi_nn_gather_elements_param gather_elements; vsi_nn_selu_param selu; vsi_nn_celu_param celu; + vsi_nn_max_pool3d_param max_pool3d; + vsi_nn_rcp_param rcp; + vsi_nn_sign_param sign; + vsi_nn_softsign_param softsign; + vsi_nn_cumsum_param cumsum; + vsi_nn_mod_param mod; void* client_param; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h index 124ac48..5cc2a3e 100644 --- a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h +++ b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h @@ -243,6 +243,18 @@ OVXLIB_API vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam uint32_t enable_nodes_count ); +OVXLIB_API vsi_status vsi_nn_UpdateCropParamsForBinaryGraph + ( + vsi_nn_graph_t* graph, + uint32_t enabled_crop_input_idx, + uint32_t start_x, + uint32_t start_y, + uint32_t crop_w, + uint32_t crop_h, + uint32_t dst_w, + uint32_t dst_h + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_pub.h b/src/tim/vx/internal/include/vsi_nn_pub.h index 5e9194e..d36f570 100644 --- a/src/tim/vx/internal/include/vsi_nn_pub.h +++ b/src/tim/vx/internal/include/vsi_nn_pub.h @@ -26,7 +26,7 @@ #define _VSI_NN_PUB_H #if !defined(OVXLIB_API) - #if defined(_WIN32) + #if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) #define OVXLIB_API __declspec(dllimport) #else #define OVXLIB_API __attribute__((visibility("default"))) diff --git a/src/tim/vx/internal/include/vsi_nn_types.h b/src/tim/vx/internal/include/vsi_nn_types.h index 8aa3ca9..076f493 100644 --- a/src/tim/vx/internal/include/vsi_nn_types.h +++ b/src/tim/vx/internal/include/vsi_nn_types.h @@ -33,11 +33,13 @@ extern "C"{ #endif -#ifdef _WIN32 -#define inline __inline +#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) +#define VSI_INLINE_API __inline +#else +#define VSI_INLINE_API inline #endif -#if (defined(_MSC_VER) || defined(__MINGW32)) +#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) #define SIZE_T_SPECIFIER "Iu" #define SSIZE_T_SPECIFIER "Id" #ifdef VSI_40BIT_VA_SUPPORT @@ -59,7 +61,7 @@ extern "C"{ #endif #endif -#if defined(_MSC_VER) +#if (defined(_MSC_VER)) #include typedef SSIZE_T ssize_t; #else diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index faab685..711c498 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 1 -#define VSI_NN_VERSION_PATCH 43 +#define VSI_NN_VERSION_PATCH 50 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 
100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c index bf5b07c..5741690 100644 --- a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c @@ -188,7 +188,7 @@ static vsi_status _query_kernel input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if (input_dtype == I8) + if (input_dtype == I8 || input_dtype == I16) { input_dtype = I32; } @@ -269,7 +269,6 @@ static vsi_nn_kernel_node_t _setup /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); CHECK_STATUS_FAIL_GOTO( status, OnError ); - } } @@ -285,4 +284,3 @@ OnError: __END_DECLS REGISTER_BACKEND_CL( argmax, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/argmin_cl.c b/src/tim/vx/internal/src/kernel/cl/argmin_cl.c index 2911a84..b710fa1 100644 --- a/src/tim/vx/internal/src/kernel/cl/argmin_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/argmin_cl.c @@ -188,6 +188,11 @@ static vsi_status _query_kernel input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (input_dtype == I8 || input_dtype == I16) + { + input_dtype = I32; + } + if (output_dtype == I16) { output_dtype = I32; @@ -264,7 +269,6 @@ static vsi_nn_kernel_node_t _setup /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); CHECK_STATUS_FAIL_GOTO( status, OnError ); - } } diff --git a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c new file mode 100644 index 0000000..91746ab --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c @@ -0,0 +1,365 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ + +#define KERNEL_SOURCE_1 "cumsum" +#define KERNEL_SOURCE_2 "cumsum_2d" + +// Add kernel hashtable here +#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + +#define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ + KERNEL_SOURCE_1 }, + +#define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \ + KERNEL_SOURCE_2 }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } cumsum_map[] = +{ + HASH_CUMSUM_KERNELS(0, U8, U8) + HASH_CUMSUM_KERNELS(0, F32, F32) + HASH_CUMSUM_KERNELS(1, U8, U8) + HASH_CUMSUM_KERNELS(1, F32, F32) + HASH_CUMSUM_KERNELS(2, U8, U8) + HASH_CUMSUM_KERNELS(2, F32, F32) + HASH_CUMSUM_KERNELS_2D(0, U8, U8) + HASH_CUMSUM_KERNELS_2D(0, F32, F32) + HASH_CUMSUM_KERNELS_2D(1, U8, U8) + HASH_CUMSUM_KERNELS_2D(1, F32, F32) +}; + +/* + * Kernel params + */ +static vx_param_description_t _cumsum_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _CUMSUM_PARAM_NUM _cnt_of_array( _cumsum_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_cumsum_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_size_array_t * input_shape = NULL; + int32_t axis = 0; + int32_t width = 0; + int32_t height = 0; + int32_t channel = 0; + int32_t w = 1; + int32_t h = 1; + int32_t c = 1; + uint32_t dim = 1; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + input_shape = attr[0]->shape; + dim = (uint32_t)input_shape->size; + width = (int32_t)(input_shape->data[0]); + height = 
(int32_t)(input_shape->data[1]); + channel = (int32_t)(dim > 2 ? input_shape->data[2] : 1); + + if (axis == 0) + { + w = 1; + h = height; + c = channel; + } + else if (axis == 1) + { + w = width; + h = 1; + c = channel; + } + else if (axis == 2) + { + w = width; + h = height; + c = 1; + } + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = w; + gpu_param.global_size[1] = h; + gpu_param.global_size[2] = c; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _cumsum_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis, + int32_t is_2d + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (input0_dtype == U32) + { + input0_dtype = U8; + } + + if (input0_dtype == F16) + { + input0_dtype = F32; + } + + if (output_dtype == U32) + { + output_dtype = U8; + } + + if (output_dtype == F16) + { + output_dtype = F32; + } + + key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d); + + for ( i = 0; i < _cnt_of_array(cumsum_map); i ++ ) + { + if ( cumsum_map[i].key == key ) + { + break; + } + } + + if ( i < _cnt_of_array(cumsum_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", cumsum_map[i].function_name ); + kernel->info.parameters = _cumsum_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _cumsum_kernel_param_def ); + kernel->info.initialize = _cumsum_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + cumsum_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + cumsum_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CUMSUM_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_size_t shapes[1][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + int32_t exclusive = vsi_nn_kernel_param_get_int32( params, "exclusive" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + int32_t axis_new = 0; + int32_t is_2d = 0; + uint32_t rs_dim = 2; + int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float in_out_scale = input_scale * output_scale; + float in_out_zp_scale = in_out_scale * input_zp; + int32_t width = 0; + int32_t height = 0; + int32_t channel = 
1; + int32_t i = 0; + + vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rs_dim, &axis_new); + if (rs_dim > 3) + { + return NULL; + } + + width = (int32_t)shapes[0][0]; + height = (int32_t)shapes[0][1]; + + if (rs_dim == 2) + { + is_2d = 1; + } + else + { + channel = (int32_t)shapes[0][2]; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], (vsi_size_t)rs_dim ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[0], (vsi_size_t)rs_dim ); + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, axis_new, is_2d ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 2; + + /* Pass parameters to node. */ + vsi_nn_kernel_node_pack_io( node_params, _CUMSUM_PARAM_NUM, + reshape_tensors, 1, &reshape_tensors[1], 1 ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_new ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &exclusive ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &channel ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_out_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_out_zp_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CUMSUM_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + } + } + + for (i = 0; i < 2; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( cumsum, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c index 7bf6d36..7e1d681 100644 --- a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c @@ -53,6 +53,9 @@ typedef enum UNARY_HGELU, UNARY_SELU, UNARY_CELU, + UNARY_RCP, + UNARY_SIGN, + UNARY_SOFTSIGN, } unary_type_e; /* @@ -94,6 +97,13 @@ typedef enum #define HGELU_OPERATION hard_gelu #define SELU_OPERATION selu #define CELU_OPERATION celu +#define RCP_OPERATION rcp +#define SIGN_OPERATION sign +#define SOFTSIGN_OPERATION softsign + +#define ADD_UNARY_SH_KERNELS(name, src_type, dst_type) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, src_type, dst_type) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, src_type, dst_type) static const struct { uint32_t key; @@ -101,61 +111,39 @@ static const struct { const char* source_name; } kernel_map[] = { - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F32, F32) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F32, F32) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F32, F32) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F32, F32) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F32, F32) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F32, F32) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F32, F32) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F32, F32) - TENSOR_UNARY_KERNELS_3D(HGELU_OPERATION, UNARY_HGELU, F32, F32) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F32, F32) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F32, F32) + ADD_UNARY_SH_KERNELS(SIN, F32, F32) + ADD_UNARY_SH_KERNELS(COS, F32, F32) + ADD_UNARY_SH_KERNELS(EXP, F32, F32) + ADD_UNARY_SH_KERNELS(LOG, F32, F32) + ADD_UNARY_SH_KERNELS(NEG, F32, F32) + ADD_UNARY_SH_KERNELS(HSIGMOID, F32, F32) + ADD_UNARY_SH_KERNELS(MISH, F32, F32) + ADD_UNARY_SH_KERNELS(ROUND, F32, F32) + ADD_UNARY_SH_KERNELS(GELU, F32, F32) + ADD_UNARY_SH_KERNELS(HGELU, F32, F32) + ADD_UNARY_SH_KERNELS(SELU, F32, F32) + ADD_UNARY_SH_KERNELS(CELU, F32, F32) + ADD_UNARY_SH_KERNELS(RCP, F32, F32) + ADD_UNARY_SH_KERNELS(SIGN, F32, F32) + ADD_UNARY_SH_KERNELS(SOFTSIGN, F32, F32) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F32, F32) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F32, F32) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F32, F32) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F32, F32) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F32, F32) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F32, F32) - 
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F32, F32) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F32, F32) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F32, F32) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F32, F32) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F32, F32) + ADD_UNARY_SH_KERNELS(SIN, U8, U8) + ADD_UNARY_SH_KERNELS(COS, U8, U8) + ADD_UNARY_SH_KERNELS(EXP, U8, U8) + ADD_UNARY_SH_KERNELS(LOG, U8, U8) + ADD_UNARY_SH_KERNELS(NEG, U8, U8) + ADD_UNARY_SH_KERNELS(HSIGMOID, U8, U8) + ADD_UNARY_SH_KERNELS(MISH, U8, U8) + ADD_UNARY_SH_KERNELS(ROUND, U8, U8) + ADD_UNARY_SH_KERNELS(GELU, U8, U8) + ADD_UNARY_SH_KERNELS(HGELU, U8, U8) + ADD_UNARY_SH_KERNELS(SELU, U8, U8) + ADD_UNARY_SH_KERNELS(CELU, U8, U8) + ADD_UNARY_SH_KERNELS(RCP, U8, U8) + ADD_UNARY_SH_KERNELS(SIGN, U8, U8) + ADD_UNARY_SH_KERNELS(SOFTSIGN, U8, U8) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, U8, U8) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, U8, U8) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, U8, U8) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, U8, U8) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, U8, U8) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, U8, U8) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, U8, U8) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, U8, U8) - TENSOR_UNARY_KERNELS_3D(HGELU_OPERATION, UNARY_HGELU, U8, U8) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, U8, U8) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, U8, U8) - - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, U8, U8) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, U8, U8) - - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I32, I32) - - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I32, I32) + ADD_UNARY_SH_KERNELS(NEG, I32, I32) }; #undef SIN_OPERATION @@ -170,6 +158,9 @@ static const struct { #undef HGELU_OPERATION #undef SELU_OPERATION #undef CELU_OPERATION +#undef RCP_OPERATION +#undef SIGN_OPERATION +#undef SOFTSIGN_OPERATION /* * Kernel params */ @@ -458,4 +449,8 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( gelu, UNARY_GELU ) REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_gelu, UNARY_HGELU ) REGISTER_ELTWISE_UNARY_BACKEND_CL( selu, UNARY_SELU ) REGISTER_ELTWISE_UNARY_BACKEND_CL( celu, UNARY_CELU ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( rcp, UNARY_RCP ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( sign, UNARY_SIGN ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( softsign, UNARY_SOFTSIGN ) + __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/cl/gather_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_cl.c index fdeda2e..f04c62f 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_cl.c @@ -123,7 +123,7 @@ static vsi_status cal_gather_tensor_reshape_size uint32_t i = 0; vsi_size_t elementCnt = 1; 
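    /* Presumably this helper collapses the gather shapes into a 2D/3D image so the
       CL kernel can be dispatched; each flattened extent is kept under the driver's
       GPU_TENSOR_MAX_WIDTH limit, which now replaces the old hard-coded 65536. */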
vsi_size_t outerCnt = 1; -#define VSI_NN_MAX_IMAGE_WIDTH (65536) +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH for (i = 0; i < dims_num - batch_dims; ++i) { @@ -365,4 +365,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( gather, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c index 4612e4f..74dd993 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c @@ -111,7 +111,7 @@ static vsi_status cal_gather_nd_tensor_reshape_size vsi_size_t *input_size = inputs[0]->attr.size; uint32_t i = 0; vsi_size_t elementCnt = 1; -#define VSI_NN_MAX_IMAGE_WIDTH (65536) +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; for(i = 0; i < dims_num; ++i) @@ -336,4 +336,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( gather_nd, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c index 58eb2ee..892377b 100644 --- a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include #include @@ -44,21 +43,20 @@ __BEGIN_DECLS */ typedef enum { - INTERNAL_KERNEL_MEAN_VARI, + INTERNAL_KERNEL_SUMS, INTERNAL_KERNEL_NORM, } _internal_kernel_e; #define KERNEL_SOURCE_1 "instance_normalization_u8" -#define KERNEL_SOURCE_2 "instance_normalization_f16" +#define KERNEL_SOURCE_2 "instance_normalization_f32" #define KERNEL_SOURCE_3 "instance_normalization_i32" -#define KERNEL_SOURCE_4 "instance_normalization_f32" // Add kernel hashtable here -#define HASH_INSTANCENORM_MEAN_VARI_KERNEL_NAME(SRC0_TYPE) \ - CVIVANTE_NAMESPACE("cl.instance_norm_meanvari_"#SRC0_TYPE) +#define HASH_INSTANCENORM_SUMS_KERNEL_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("cl.instance_norm_sums_"#SRC0_TYPE) -#define HASH_INSTANCENORM_MEAN_VARI_KERNEL_2D_NAME(SRC0_TYPE) \ - CVIVANTE_NAMESPACE("cl.instance_norm_meanvari_"#SRC0_TYPE"_2D") +#define HASH_INSTANCENORM_SUMS_KERNEL_2D_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("cl.instance_norm_sums_"#SRC0_TYPE"_2D") #define HASH_INSTANCENORM_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("cl.instance_norm_"#SRC0_TYPE"to"#DST_TYPE) @@ -68,17 +66,17 @@ typedef enum // Add kernel hashtable here // mean vari -#define HASH_INSTANCENORM_MEAN_VARI_KEY(_input0_type, _output_type, _reshape_flag) \ +#define HASH_INSTANCENORM_SUMS_KEY(_input0_type, _output_type, _reshape_flag) \ ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) -#define TENSOR_INSTANCENORM_MEAN_VARI_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_INSTANCENORM_MEAN_VARI_KEY(IN0_TYPE, OUT_TYPE, 0), \ - HASH_INSTANCENORM_MEAN_VARI_KERNEL_NAME(IN0_TYPE), \ +#define TENSOR_INSTANCENORM_SUMS_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_INSTANCENORM_SUMS_KERNEL_NAME(IN0_TYPE), \ SOURCE }, -#define TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_INSTANCENORM_MEAN_VARI_KEY(IN0_TYPE, OUT_TYPE, 1), \ - HASH_INSTANCENORM_MEAN_VARI_KERNEL_2D_NAME(IN0_TYPE), \ +#define TENSOR_INSTANCENORM_SUMS_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_INSTANCENORM_SUMS_KERNEL_2D_NAME(IN0_TYPE), \ SOURCE }, // normalization @@ -102,17 +100,15 @@ 
typedef struct const char * source_name; } _kernel_map_type; -static const _kernel_map_type _instancenorm_mean_vari_kernel_map[] = +static const _kernel_map_type _instancenorm_sums_kernel_map[] = { // Register kernel here - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( U8, F32, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( U8, F32, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( F16, F32, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( F16, F32, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( I32, F32, KERNEL_SOURCE_3 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( F32, F32, KERNEL_SOURCE_4 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( F32, F32, KERNEL_SOURCE_4 ) + TENSOR_INSTANCENORM_SUMS_KERNELS( U8, F32, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_2D( U8, F32, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_SUMS_KERNELS( F32, F32, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_2D( F32, F32, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SUMS_KERNELS( I32, F32, KERNEL_SOURCE_3 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 ) }; static const _kernel_map_type _instancenorm_kernel_map[] = @@ -123,22 +119,19 @@ static const _kernel_map_type _instancenorm_kernel_map[] = TENSOR_INSTANCENORM_KERNELS( U8, F16, KERNEL_SOURCE_1 ) TENSOR_INSTANCENORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_KERNELS( F16, F16, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_KERNELS( F32, F32, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_KERNELS_2D( F32, F32, KERNEL_SOURCE_2 ) TENSOR_INSTANCENORM_KERNELS( I32, I32, KERNEL_SOURCE_3 ) TENSOR_INSTANCENORM_KERNELS_2D( I32, I32, KERNEL_SOURCE_3 ) TENSOR_INSTANCENORM_KERNELS( I32, F32, KERNEL_SOURCE_3 ) TENSOR_INSTANCENORM_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 ) - - TENSOR_INSTANCENORM_KERNELS( F32, F32, KERNEL_SOURCE_4 ) - TENSOR_INSTANCENORM_KERNELS_2D( F32, F32, KERNEL_SOURCE_4 ) }; /* * Kernel params */ -static vx_param_description_t _instancenorm_mean_vari_kernel_param_def[] = +static vx_param_description_t _instancenorm_sums_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -146,12 +139,9 @@ static vx_param_description_t _instancenorm_mean_vari_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; -#define _INSTANCENORM_MEAN_VARI_PARAM_NUM _cnt_of_array( _instancenorm_mean_vari_kernel_param_def ) +#define _INSTANCENORM_SUMS_PARAM_NUM _cnt_of_array( _instancenorm_sums_kernel_param_def ) static vx_param_description_t _instancenorm_kernel_param_def[] = { @@ -168,10 +158,6 @@ static vx_param_description_t _instancenorm_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // 
Add kererl parameters here }; #define _INSTANCENORM_PARAM_NUM _cnt_of_array( _instancenorm_kernel_param_def ) @@ -179,7 +165,7 @@ static vx_param_description_t _instancenorm_kernel_param_def[] = /* * Kernel initializer */ -DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) +DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -244,7 +230,7 @@ final: attr[1] = NULL; } return status; -} /* _instance_normalization_mean_vari_initializer() */ +} /* _instance_normalization_sums_initializer() */ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) ( @@ -334,12 +320,12 @@ static vsi_status _query_kernel switch( kernel_id ) { - case INTERNAL_KERNEL_MEAN_VARI: - initializer = _instancenorm_mean_vari_initializer; - kernel_map = _instancenorm_mean_vari_kernel_map; - kernel_map_size = _cnt_of_array( _instancenorm_mean_vari_kernel_map ); - param_def = _instancenorm_mean_vari_kernel_param_def; - param_size = _INSTANCENORM_MEAN_VARI_PARAM_NUM; + case INTERNAL_KERNEL_SUMS: + initializer = _instancenorm_sums_initializer; + kernel_map = _instancenorm_sums_kernel_map; + kernel_map_size = _cnt_of_array( _instancenorm_sums_kernel_map ); + param_def = _instancenorm_sums_kernel_param_def; + param_size = _INSTANCENORM_SUMS_PARAM_NUM; break; case INTERNAL_KERNEL_NORM: initializer = _instancenorm_initializer; @@ -392,9 +378,9 @@ static vsi_nn_kernel_node_t _setup ) { #define INTERNAL_KERNEL_SIZE (1) -#define MEAN_VARI_INDEX (0) +#define SUMS_INDEX (0) vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t mean_vari_node_params[_INSTANCENORM_MEAN_VARI_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t sums_node_params[_INSTANCENORM_SUMS_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_param_t node_params[_INSTANCENORM_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_dtype_e in0_dtype = U8; @@ -407,18 +393,17 @@ static vsi_nn_kernel_node_t _setup uint32_t hashkey = 0; int32_t i = 0; uint32_t rank = outputs[0]->attr.dim_num; - float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ) / + (input_scale * input_scale); size_t width = inputs[0]->attr.size[0]; size_t height = inputs[0]->attr.size[1]; int32_t reshape_flg = outputs[0]->attr.size[1] * outputs[0]->attr.size[2] < GPU_TENSOR_MAX_WIDTH && rank > 2; int32_t group_num = (int32_t)(width + 15) / 16; - int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]); - float input_scale = vsi_nn_get_tensor_scale(inputs[0]); int32_t output_zp = vsi_nn_get_tensor_zero_point(outputs[0]); float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); - float in_fl_scale = 1.0f, out_fl_scale = 1.0; - float dim_ratio = (float)1.0 / (float)(width * height); + float inv_multiplier = (float)1.0 / (float)(width * height); if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) @@ -443,15 +428,21 @@ static vsi_nn_kernel_node_t _setup attr.size[2] = 1; attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; attr.dim_num = 4; - tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + tensors[SUMS_INDEX] = vsi_nn_CreateTensor( graph, &attr ); in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + in0_dtype = in0_dtype == F16 ? F32 : in0_dtype; + in0_dtype = in0_dtype == I8 ? 
I32 : in0_dtype; + in0_dtype = in0_dtype == I16 ? I32 : in0_dtype; + out_dtype = out_dtype == F16 ? F32 : out_dtype; + out_dtype = out_dtype == I8 ? I32 : out_dtype; + out_dtype = out_dtype == I16 ? I32 : out_dtype; - hashkeys[MEAN_VARI_INDEX]= HASH_INSTANCENORM_MEAN_VARI_KEY( in0_dtype, F32, reshape_flg ); + hashkeys[SUMS_INDEX]= HASH_INSTANCENORM_SUMS_KEY( in0_dtype, F32, reshape_flg ); hashkey = HASH_INSTANCENORM_KEY( in0_dtype, out_dtype, reshape_flg ); - status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI ); + status = _query_kernel( ikernels[SUMS_INDEX], hashkeys[SUMS_INDEX], INTERNAL_KERNEL_SUMS ); if ( VSI_SUCCESS != status ) { goto final; @@ -497,37 +488,31 @@ static vsi_nn_kernel_node_t _setup } // Mean Vari { - node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] ); + node = vsi_nn_kernel_create_node( graph, ikernels[SUMS_INDEX] ); if (node) { uint32_t index = 0; if (reshape_flg) { - mean_vari_node_params[index++] = rs_input; + sums_node_params[index++] = rs_input; } else { - mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; + sums_node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; } - mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp ); - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_fl_scale ); - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + sums_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUMS_INDEX]->t; + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); - status = vsi_nn_kernel_node_pass_param( node, mean_vari_node_params, - _INSTANCENORM_MEAN_VARI_PARAM_NUM ); + status = vsi_nn_kernel_node_pass_param( node, sums_node_params, + _INSTANCENORM_SUMS_PARAM_NUM ); CHECK_STATUS(status); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[2] ); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[3] ); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[4] ); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[5] ); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[6] ); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[7] ); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[8] ); + vsi_nn_kernel_scalar_release( &sums_node_params[2] ); + vsi_nn_kernel_scalar_release( &sums_node_params[3] ); + vsi_nn_kernel_scalar_release( &sums_node_params[4] ); + vsi_nn_kernel_scalar_release( &sums_node_params[5] ); vsi_nn_kernel_node_release( &node ); } } @@ -562,7 +547,7 @@ static vsi_nn_kernel_node_t _setup { node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; } - node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; + node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUMS_INDEX]->t; if 
(reshape_flg) { node_params[index++] = rs_output; @@ -573,15 +558,11 @@ static vsi_nn_kernel_node_t _setup } node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_fl_scale ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &output_zp ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &out_fl_scale ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &dim_ratio ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inv_multiplier ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &group_num ); status = vsi_nn_kernel_node_pass_param( node, node_params, @@ -595,10 +576,6 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[10] ); vsi_nn_kernel_scalar_release( &node_params[11] ); vsi_nn_kernel_scalar_release( &node_params[12] ); - vsi_nn_kernel_scalar_release( &node_params[13] ); - vsi_nn_kernel_scalar_release( &node_params[14] ); - vsi_nn_kernel_scalar_release( &node_params[15] ); - vsi_nn_kernel_scalar_release( &node_params[16] ); } } diff --git a/src/tim/vx/internal/src/kernel/cl/maxpoolwithargmax_cl.c b/src/tim/vx/internal/src/kernel/cl/maxpoolwithargmax_cl.c new file mode 100644 index 0000000..2311810 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/maxpoolwithargmax_cl.c @@ -0,0 +1,312 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ + +#define KERNEL_SOURCE_1 "maxpoolwithargmax" +#define KERNEL_SOURCE_2 "maxpoolwithargmax_2d" + +// Add kernel hashtable here +#define MAXPOOLWITHARGMAX_HASH_KEY( IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1, _image_2d) \ + (( IN_DTYPE << 24 ) | ( OUT_DTYPE0 << 20) | ( OUT_DTYPE1 << 12) | (_image_2d)) + +#define HASH_MAXPOOLWITHARGMAX_KERNELS( IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1) \ + { MAXPOOLWITHARGMAX_HASH_KEY(IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1, 0), \ + CVIVANTE_NAMESPACE("cl.maxpoolwithargmax_"#IN_DTYPE"to"#OUT_DTYPE0"_"#OUT_DTYPE1), \ + KERNEL_SOURCE_1 }, + +#define HASH_MAXPOOLWITHARGMAX_KERNELS_2D( IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1) \ + { MAXPOOLWITHARGMAX_HASH_KEY(IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1, 1), \ + CVIVANTE_NAMESPACE("cl.maxpoolwithargmax_"#IN_DTYPE"to"#OUT_DTYPE0"_"#OUT_DTYPE1"_2D"), \ + KERNEL_SOURCE_2 }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } maxpoolwithargmax_map[] = +{ + HASH_MAXPOOLWITHARGMAX_KERNELS(F32, F32, I32) + HASH_MAXPOOLWITHARGMAX_KERNELS(BF16, BF16, I32) + HASH_MAXPOOLWITHARGMAX_KERNELS(U32, U32, I32) + HASH_MAXPOOLWITHARGMAX_KERNELS(I32, I32, I32) + HASH_MAXPOOLWITHARGMAX_KERNELS_2D(F32, F32, I32) + HASH_MAXPOOLWITHARGMAX_KERNELS_2D(BF16, BF16, I32) + HASH_MAXPOOLWITHARGMAX_KERNELS_2D(U32, U32, I32) + HASH_MAXPOOLWITHARGMAX_KERNELS_2D(I32, I32, I32) +}; + +/* + * Kernel params + */ +static vx_param_description_t _maxpoolwithargmax_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _MAXPOOLWITHARGMAX_PARAM_NUM _cnt_of_array( _maxpoolwithargmax_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_maxpoolwithargmax_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vx_status status = VX_FAILURE; + vx_tensor output = (vx_tensor)param[1]; + vsi_nn_kernel_tensor_attr_t * attr_out = NULL; + vsi_size_array_t * out_shape = NULL; + + attr_out = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + CHECK_PTR_FAIL_GOTO( attr_out, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + out_shape = attr_out->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = out_shape->data[1]; + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (attr_out) + { + vsi_nn_kernel_tensor_attr_release(&attr_out); + } + + return status; +} /* _maxpoolwithargmax_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t is_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input_dtype = U8; + vsi_nn_kernel_dtype_e output0_dtype = U8; + vsi_nn_kernel_dtype_e output1_dtype = I32; + uint32_t key = 0; + int32_t i = 0; + + input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output0_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + output1_dtype = vsi_nn_kernel_map_dtype( outputs[1]->attr.dtype.vx_type ); + + if (input_dtype == U8) + { + input_dtype = U32; + } + + if (input_dtype == I8 || input_dtype == I16) + { + input_dtype = I32; + } + + if (input_dtype == F16) + { + input_dtype = F32; + } + + if (output0_dtype == U8) + { + output0_dtype = U32; + } + + if (output0_dtype == I8 || output0_dtype == I16) + { + output0_dtype = I32; + } + + if (output0_dtype == F16) + { + output0_dtype = F32; + } + + key = MAXPOOLWITHARGMAX_HASH_KEY( input_dtype, output0_dtype, output1_dtype, is_2d); + + for ( i = 0; i < _cnt_of_array(maxpoolwithargmax_map); i ++ ) + { + if ( maxpoolwithargmax_map[i].key == key ) + { + break; + } + } + + if ( i < _cnt_of_array(maxpoolwithargmax_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", maxpoolwithargmax_map[i].function_name ); + kernel->info.parameters = _maxpoolwithargmax_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _maxpoolwithargmax_kernel_param_def ); + kernel->info.initialize = _maxpoolwithargmax_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + maxpoolwithargmax_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + maxpoolwithargmax_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_MAXPOOLWITHARGMAX_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x"); + int32_t ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y"); + int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x"); + int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y"); + int32_t pad_x = vsi_nn_kernel_param_get_int32(params, "pad_left"); + int32_t pad_y = vsi_nn_kernel_param_get_int32(params, "pad_top"); + int32_t image_2d = inputs[0]->attr.dim_num == 2 ? 
1 : 0; + int32_t width = (int32_t)inputs[0]->attr.size[0]; + int32_t height = (int32_t)inputs[0]->attr.size[1]; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float scale_value = 1.0f; + float tail_value = 0.0f; + + if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( outputs[1]->attr.size, + outputs[1]->attr.dim_num )) + { + return NULL; + } + + scale_value = inputScale / outputScale; + tail_value = outputTail - inputTail * inputScale / outputScale; + + status = _query_kernel( kernel, inputs, outputs, image_2d ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 3; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _MAXPOOLWITHARGMAX_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_value ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &tail_value ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _MAXPOOLWITHARGMAX_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( maxpoolwithargmax, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/mod_cl.c b/src/tim/vx/internal/src/kernel/cl/mod_cl.c new file mode 100644 index 0000000..1398823 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/mod_cl.c @@ -0,0 +1,303 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +#define MOD_KERNEL_SOURCE_NAME "mod" + +#define MOD_HASH_KEY(_input0_type, _input1_type, _output_type, _image_2d) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d)) + + +#define MOD_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ + { MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + CVIVANTE_NAMESPACE("cl.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE), \ + MOD_KERNEL_SOURCE_NAME}, + +#define MOD_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ + { MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + CVIVANTE_NAMESPACE("cl.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE"_2D"), \ + MOD_KERNEL_SOURCE_NAME }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _mod_kernel_map[] = +{ + +// Register kernel here + MOD_KERNELS( F32, F32, F32 ) + MOD_KERNELS( I32, I32, I32 ) + MOD_KERNELS( I32, I32, U8 ) + MOD_KERNELS( U8, U8, U8 ) + MOD_KERNELS( U8, I32, U8 ) + + MOD_KERNELS_2D( F32, F32, F32 ) + MOD_KERNELS_2D( I32, I32, I32 ) + MOD_KERNELS_2D( I32, I32, U8 ) + MOD_KERNELS_2D( U8, U8, U8 ) + MOD_KERNELS_2D( U8, I32, U8 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _mod_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _MOD_PARAM_NUM _cnt_of_array( _mod_kernel_param_def ) +#define MOD_QUANT_PARAM_NUM _cnt_of_array( _mod_kernel_param_def ) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_mod_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vx_tensor output = (vx_tensor)param[2]; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_size_array_t *output_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_shape = output_attr->shape; + + gpu_param.dim = output_shape->size < 3 ? 2 : 3; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = output_shape->size > 2 ? 
output_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + + return status; +} /* _mod_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _mod_kernel_map; + size_t kernel_map_size = _cnt_of_array( _mod_kernel_map ); + vx_param_description_t * param_def = _mod_kernel_param_def; + size_t param_def_size = _cnt_of_array( _mod_kernel_param_def ); + vx_kernel_initialize_f initializer = _mod_initializer; + + uint32_t key = 0; + uint32_t i = 0; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in0_dtype) + { + in0_dtype = F32; + } + else if (I16 == in0_dtype || I8 == in0_dtype) + { + in0_dtype = I32; + } + + if (F16 == in1_dtype) + { + in1_dtype = F32; + } + else if (I16 == in1_dtype || I8 == in1_dtype) + { + in1_dtype = I32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + else if (I16 == out_dtype || I8 == out_dtype) + { + out_dtype = I32; + } + + key = MOD_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_MOD_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]); + int32_t isfmod = vsi_nn_kernel_param_get_int32(params, "isfmod"); + + outputScale = 1.0f / outputScale; + input0Tail = -(input0Tail * input0Scale); + input1Tail = -(input1Tail * input1Scale); + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = (outputs[0]->attr.dim_num == 2); + + status = _query_kernel( kernel, inputs, outputs, image_2d); + 
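+    /* Assuming the CL kernel applies a plain affine dequantize/requantize, the
+     * scalars prepared above read as: x_f = x_q * inputScale + inputTail, where
+     * each inputTail already holds -(zero_point * scale); mod/fmod then runs in
+     * float, and the result is stored as out_q = out_f * outputScale + outputTail,
+     * with outputScale pre-inverted so the kernel only needs a multiply. */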
if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + size_t node_params_num = MOD_QUANT_PARAM_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _MOD_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &isfmod ); + node_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale ); + node_params[5] = vsi_nn_kernel_scalar_create(graph, F32, &input0Tail ); + node_params[6] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale ); + node_params[7] = vsi_nn_kernel_scalar_create(graph, F32, &input1Tail ); + node_params[8] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); + node_params[9] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( mod, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c index bed0f91..c36851e 100644 --- a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c @@ -48,7 +48,7 @@ __BEGIN_DECLS #define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE ) \ { ROI_ALIGN_HASH_KEY( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, 0 ), \ - CVIVANTE_NAMESPACE("cl.roi_align_"STR(IN0_DTYPE)"to"STR(OUT_DTYPE)), \ + CVIVANTE_NAMESPACE("cl.roi_align_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ _ROI_ALIGN_KERNEL_SOURCE(IN0_DTYPE) } typedef struct @@ -61,6 +61,7 @@ typedef struct static const _kernel_map_type _roi_align_kernel_map[] = { PACK_KERNEL_MAP(F32, F32, I32, F32), + PACK_KERNEL_MAP(U8, U16, I32, U8), }; @@ -82,20 +83,28 @@ static vx_param_description_t _roi_align_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _ROI_ALIGN_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def ) -#define SCALAR_SPATIAL_X_SCALE (4) -#define SCALAR_SPATIAL_Y_SCALE (5) -#define SCALAR_INPUT_WIDTH (6) -#define SCALAR_INPUT_HEIGHT (7) -#define SCALAR_RCP_OF_OUTPUT_WIDTH (8) -#define SCALAR_RCP_OF_OUTPUT_HEIGHT (9) -#define SCALAR_SAMPLING_X_RATIO (10) -#define SCALAR_SAMPLING_Y_RATIO (11) -#define SCALAR_DEPTH (12) +#define SCALAR_INPUT_SCALE (4) +#define SCALAR_INPUT_TAIL (5) +#define SCALAR_OUTPUT_SCALE (6) +#define SCALAR_OUTPUT_ZP (7) +#define SCALAR_SPATIAL_X_SCALE (8) +#define SCALAR_SPATIAL_Y_SCALE (9) +#define SCALAR_INPUT_WIDTH (10) +#define SCALAR_INPUT_HEIGHT (11) +#define SCALAR_RCP_OF_OUTPUT_WIDTH (12) +#define SCALAR_RCP_OF_OUTPUT_HEIGHT (13) +#define SCALAR_SAMPLING_X_RATIO (14) +#define SCALAR_SAMPLING_Y_RATIO (15) +#define 
SCALAR_DEPTH (16) -#define ROI_ALIGN_PARAM_NUM 13 +#define ROI_ALIGN_PARAM_NUM 17 #define ROI_ALIGN_QUANT_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def ) /* @@ -185,6 +194,7 @@ static vsi_status _query_kernel in0_dtype = in0_dtype == F16 ? F32 : in0_dtype; in1_dtype = in1_dtype == F16 ? F32 : in1_dtype; + out_dtype = out_dtype == F16 ? F32 : out_dtype; key = ROI_ALIGN_HASH_KEY( in0_dtype, in1_dtype, in2_dtype, out_dtype, image_2d ); @@ -241,8 +251,14 @@ static vsi_nn_kernel_node_t _setup float height_ratio = vsi_nn_kernel_param_get_float32( params, "height_ratio" ); int32_t width_sample_num = vsi_nn_kernel_param_get_int32( params, "width_sample_num" ); int32_t height_sample_num = vsi_nn_kernel_param_get_int32( params, "height_sample_num" ); - float width_scale = 1.0f / width_ratio; - float height_scale = 1.0f / height_ratio; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input_tail = -(input_zp * input_scale); + float roi_scale = vsi_nn_get_tensor_scale(inputs[1]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float width_scale = roi_scale / width_ratio; + float height_scale = roi_scale / height_ratio; float in_width = (float)(inputs[0]->attr.size[0]); float in_height = (float)(inputs[0]->attr.size[1]); float rcp_of_out_width = 1.0f / (float)(outputs[0]->attr.size[0]); @@ -287,6 +303,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_pack_io( node_params, _ROI_ALIGN_PARAM_NUM, reshape_tensors, input_num, &reshape_tensors[_INPUT_NUM], output_num ); + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input_tail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); node_params[SCALAR_SPATIAL_X_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &width_scale ); node_params[SCALAR_SPATIAL_Y_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &height_scale ); node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &in_width ); @@ -299,6 +319,10 @@ static vsi_nn_kernel_node_t _setup /* Pass parameters to node. 
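The new `width_scale`/`height_scale` above fold the ROI tensor's quantization scale into the spatial scale (`width_scale = roi_scale / width_ratio`), so a raw quantized ROI coordinate can be mapped onto the feature map with a single multiply. A small worked example with made-up numbers:

```c
#include <stdio.h>

int main(void)
{
    /* Illustrative values only: a U16 ROI tensor quantized with scale 1/8,
     * and a width_ratio of 4 between ROI coordinates and the feature map. */
    float roi_scale   = 0.125f;
    float width_ratio = 4.0f;
    float width_scale = roi_scale / width_ratio;   /* what _setup() now passes down */

    unsigned short qx1 = 160;                      /* raw quantized ROI x1 */
    printf("x1 on the feature map: %f\n", qx1 * width_scale);  /* (160 * 0.125) / 4 = 5 */
    return 0;
}
```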
*/ status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_SPATIAL_X_SCALE] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_SPATIAL_Y_SCALE] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] ); diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c index 1eba1c2..5ec59b1 100644 --- a/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c @@ -115,7 +115,7 @@ static vsi_status cal_scatter_nd_tensor_reshape_size return status; } -#define VSI_NN_MAX_IMAGE_WIDTH (65536) +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; for(i = 0; i < dims_num; ++i) @@ -333,4 +333,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( scatter_nd, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c index 2ab4a16..fd72a9d 100644 --- a/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c @@ -108,7 +108,7 @@ static vsi_status cal_scatter_nd_update_tensor_reshape_size return status; } -#define VSI_NN_MAX_IMAGE_WIDTH (65536) +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; for(i = 0; i < dims_num; ++i) @@ -373,4 +373,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( scatter_nd_update, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/topk_cl.c b/src/tim/vx/internal/src/kernel/cl/topk_cl.c index ad99bc6..a3d5428 100644 --- a/src/tim/vx/internal/src/kernel/cl/topk_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include #include @@ -49,6 +48,13 @@ __BEGIN_DECLS CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \ _TOPK_KERNEL_SOURCE } +#define TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) ) +#define PACK_ODD_EVEN_SORT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("cl.topk_odd_even_sort_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \ + "topk_odd_even_sort" } + typedef struct { uint32_t key; @@ -84,6 +90,14 @@ static const _kernel_map_type _topk_kernel_map[] = PACK_KERNEL_MAP( I32, I32, 6 ), }; +static const _kernel_map_type _topk_odd_even_sort_kernel_map[] = +{ + // Register kernel here + PACK_ODD_EVEN_SORT_KERNEL_MAP( F32, F32 ), + PACK_ODD_EVEN_SORT_KERNEL_MAP( U32, U32 ), + PACK_ODD_EVEN_SORT_KERNEL_MAP( I32, I32 ), +}; + /* * Kernel params */ @@ -99,6 +113,19 @@ static vx_param_description_t _topk_kernel_param_def[] = #define _TOPK_PARAM_NUM _cnt_of_array( _topk_kernel_param_def ) #define SCALAR_INPUT_NUM_STAGES (3) #define SCALAR_INPUT_WIDTH (4) + +static vx_param_description_t _topk_odd_even_sort_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + 
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _TOPK_ODD_EVEN_SORT_PARAM_NUM _cnt_of_array( _topk_odd_even_sort_kernel_param_def ) +#define SCALAR_INPUT_SIZE (5) /* * Kernel initializer */ @@ -140,9 +167,47 @@ DEF_KERNEL_INITIALIZER(_topk_initializer) final: #define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } SAFE_FREE_TENSOR_ATTR(input_attr); +#undef SAFE_FREE_TENSOR_ATTR return status; } /* _topk_initializer() */ +DEF_KERNEL_INITIALIZER(_topk_odd_even_sort_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_size_array_t * in_shape = NULL; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + + in_shape = input_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.local_size[0] = 32; + gpu_param.local_size[1] = 1; + gpu_param.global_size[0] = 32; + gpu_param.global_size[1] = in_shape->data[1]; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(input_attr); +#undef SAFE_FREE_TENSOR_ATTR + return status; +} /* _topk_odd_even_sort_initializer() */ /* * Query kernel @@ -215,6 +280,72 @@ static vsi_status _query_kernel return status; } /* _query_kernel() */ +static vsi_status _query_odd_even_sort_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _topk_odd_even_sort_kernel_map; + size_t kernel_map_size = _cnt_of_array( _topk_odd_even_sort_kernel_map ); + vx_param_description_t * param_def = _topk_odd_even_sort_kernel_param_def; + vx_kernel_initialize_f initializer = _topk_odd_even_sort_initializer; +#define _PACK_SELECT_KEY( in_type, out_type ) \ + ( (in_type) | (out_type << 8) ) + uint32_t key = 0; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + switch (_PACK_SELECT_KEY(in_dtype, out_dtype)) + { + case _PACK_SELECT_KEY(F32, F32): + case _PACK_SELECT_KEY(F16, F16): + key = TOPK_ODD_EVEN_SORT_HASH_KEY( F32, F32 ); + break; + case _PACK_SELECT_KEY(U32, U32): + case _PACK_SELECT_KEY(U16, U16): + case _PACK_SELECT_KEY(U8, U8): + key = TOPK_ODD_EVEN_SORT_HASH_KEY( U32, U32 ); + break; + case _PACK_SELECT_KEY(I32, I32): + case _PACK_SELECT_KEY(I16, I16): + case _PACK_SELECT_KEY(I8, I8): + key = TOPK_ODD_EVEN_SORT_HASH_KEY( I32, I32 ); + break; + default: + break; + } +#undef _PACK_SELECT_KEY + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _topk_odd_even_sort_kernel_param_def ); + 
kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ static vsi_nn_kernel_node_t _setup ( @@ -228,16 +359,19 @@ static vsi_nn_kernel_node_t _setup ) { vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_TOPK_PARAM_NUM]; + vsi_nn_kernel_node_param_t node_params[_TOPK_ODD_EVEN_SORT_PARAM_NUM]; vsi_nn_kernel_node_t node = NULL; vsi_size_t block_size = inputs[0]->attr.size[0]; vsi_size_t block_num = 1; uint32_t i = 0; - vsi_nn_tensor_t* rs_tensors[3] = { NULL }; + vsi_nn_tensor_t* rs_tensors[5] = { NULL }; + vsi_nn_tensor_attr_t attr; vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; int32_t width = (int32_t)block_size; int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k"); int32_t num_stages = (int32_t)ceil(log10(block_size / 2.0f) / log10(2.0f)); + vsi_bool is_odd_even_sort = FALSE; + size_t param_num = _TOPK_PARAM_NUM; for (i = 1; i < inputs[0]->attr.dim_num; i ++) { @@ -257,26 +391,58 @@ static vsi_nn_kernel_node_t _setup rs_tensors[0] = vsi_nn_reshape_tensor( graph, inputs[0], shape[0], 2 ); - rs_tensors[1] = vsi_nn_reshape_tensor( graph, - outputs[0], shape[1], 2 ); - rs_tensors[2] = vsi_nn_reshape_tensor( graph, - outputs[1], shape[1], 2 ); - status = _query_kernel( kernel, inputs, outputs, num_stages ); + if (num_stages < 7) + { + status = _query_kernel( kernel, inputs, outputs, num_stages ); + + rs_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shape[1], 2 ); + rs_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[1], shape[1], 2 ); + } + else + { + status = _query_odd_even_sort_kernel( kernel, inputs, outputs ); + is_odd_even_sort = TRUE; + param_num = _TOPK_ODD_EVEN_SORT_PARAM_NUM; + + memcpy( &attr, &(rs_tensors[0]->attr), sizeof(vsi_nn_tensor_attr_t) ); + rs_tensors[1] = vsi_nn_CreateTensor( graph, &attr ); + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + rs_tensors[2] = vsi_nn_CreateTensor( graph, &attr ); + + rs_tensors[3] = vsi_nn_reshape_tensor( graph, + outputs[0], shape[1], 2 ); + rs_tensors[4] = vsi_nn_reshape_tensor( graph, + outputs[1], shape[1], 2 ); + + input_num = 3; + } if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); if ( node ) { /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _TOPK_PARAM_NUM, - rs_tensors, input_num, &rs_tensors[1], output_num ); + vsi_nn_kernel_node_pack_io( node_params, param_num, + rs_tensors, input_num, &rs_tensors[input_num], output_num ); /* Pass parameters to node. 
*/ - node_params[SCALAR_INPUT_NUM_STAGES] = vsi_nn_kernel_scalar_create( - graph, I32, &num_stages ); - node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( - graph, I32, &width ); - status = vsi_nn_kernel_node_pass_param( node, node_params, _TOPK_PARAM_NUM ); + if (is_odd_even_sort) + { + node_params[SCALAR_INPUT_SIZE] = vsi_nn_kernel_scalar_create( + graph, I32, &width ); + } + else + { + node_params[SCALAR_INPUT_NUM_STAGES] = vsi_nn_kernel_scalar_create( + graph, I32, &num_stages ); + node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( + graph, I32, &width ); + } + + status = vsi_nn_kernel_node_pass_param( node, node_params, param_num ); CHECK_STATUS_FAIL_GOTO( status, final ); } } @@ -284,13 +450,25 @@ final: vsi_safe_release_tensor(rs_tensors[0]); vsi_safe_release_tensor(rs_tensors[1]); vsi_safe_release_tensor(rs_tensors[2]); - if (node_params[SCALAR_INPUT_NUM_STAGES]) + vsi_safe_release_tensor(rs_tensors[3]); + vsi_safe_release_tensor(rs_tensors[4]); + if (is_odd_even_sort) { - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_NUM_STAGES] ); + if (node_params[SCALAR_INPUT_SIZE]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SIZE] ); + } } - if (node_params[SCALAR_INPUT_WIDTH]) + else { - vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] ); + if (node_params[SCALAR_INPUT_NUM_STAGES]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_NUM_STAGES] ); + } + if (node_params[SCALAR_INPUT_WIDTH]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] ); + } } return node; diff --git a/src/tim/vx/internal/src/kernel/cpu/cumsum_cpu.c b/src/tim/vx/internal/src/kernel/cpu/cumsum_cpu.c new file mode 100644 index 0000000..29f333d --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/cumsum_cpu.c @@ -0,0 +1,260 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
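The topk branch above switches away from the staged bitonic kernel once `num_stages` reaches 7 (block sizes above 128 elements) and uses the dedicated odd-even sort kernel instead, creating intermediate value/index tensors on the fly. For reference, odd-even transposition sort is the simple pairwise scheme sketched below (plain C illustration in descending order, not the OpenCL kernel):

```c
#include <stddef.h>

static void odd_even_sort_desc(float *val, int *idx, size_t n)
{
    for (size_t pass = 0; pass < n; pass++)     /* n passes guarantee a sorted result */
    {
        size_t start = pass & 1;                /* alternate between odd and even pairs */
        for (size_t i = start; i + 1 < n; i += 2)
        {
            if (val[i] < val[i + 1])
            {
                float tv = val[i];  val[i] = val[i + 1];  val[i + 1] = tv;
                int   ti = idx[i];  idx[i] = idx[i + 1];  idx[i + 1] = ti;
            }
        }
    }
    /* After sorting, the first top_k entries of val/idx are the TopK result. */
}
```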
+ */ +#define _CPU_ARG_NUM (3) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.cumsum") + +DEF_KERNEL_EXECUTOR(_cumsum_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[2] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + int32_t i = 0; + int32_t axisSize = 1, innerSize = 1, outerSize = 1; + int32_t axis = 0, exclusive = 0, reverse = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &exclusive); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &reverse); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + { + int32_t dims_num = (int32_t)attr[1]->shape->size; + int32_t inner = 0; + int32_t outer = 0; + + for(i = 0; i < axis; ++i) + { + innerSize *= (int32_t)attr[0]->shape->data[i]; + } + + axisSize = (int32_t)attr[0]->shape->data[i++]; + + for(; i < dims_num; ++i) + { + outerSize *= (int32_t)attr[0]->shape->data[i]; + } + + for ( outer = 0; outer < outerSize; ++outer) + { + for ( inner = 0; inner < innerSize; ++inner) + { + float sum = .0f; + + if (exclusive && reverse) + { + int32_t idx_out = (outer * axisSize + axisSize - 1) * innerSize + inner; + buffer[1][idx_out] = sum; + for (i = axisSize - 1; i > 0; i--) + { + int32_t idx = (outer * axisSize + i) * innerSize + inner; + float value = buffer[0][idx]; + idx_out = (outer * axisSize + i - 1) * innerSize + inner; + sum += value; + buffer[1][idx_out] = sum; + } + } + else if (exclusive) + { + int32_t idx_out = outer * axisSize * innerSize + inner; + buffer[1][idx_out] = sum; + for (i = 0; i < axisSize - 1; ++i) + { + int32_t idx = (outer * axisSize + i) * innerSize + inner; + float value = buffer[0][idx]; + idx_out = (outer * axisSize + i + 1) * innerSize + inner; + sum += value; + buffer[1][idx_out] = sum; + } + } + else if (reverse) + { + for (i = axisSize - 1; i >= 0; i--) + { + int32_t idx = (outer * axisSize + i) * innerSize + inner; + float value = buffer[0][idx]; + sum += value; + buffer[1][idx] = sum; + } + } + else + { + for (i = 0; i < axisSize; ++i) + { + // i * innerSize + inner + outer * innerSize * axisSize + int32_t idx = (outer * axisSize + i) * innerSize + inner; + float value = buffer[0][idx]; + sum += value; + buffer[1][idx] = sum; + } + } + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + 
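The exclusive/reverse branches in the cumsum executor above are easiest to sanity-check on a single 1-D slice. A compact reference with hand-worked expectations (standalone sketch, not the kernel's indexing):

```c
/* Hand-worked expectations for {1, 2, 3, 4}:
 *   plain               {1, 3, 6, 10}
 *   reverse             {10, 9, 7, 4}
 *   exclusive           {0, 1, 3, 6}
 *   exclusive + reverse {9, 7, 4, 0}
 */
static void cumsum_1d(const float *in, float *out, int n, int exclusive, int reverse)
{
    float sum = 0.0f;
    int step  = reverse ? -1 : 1;
    int i     = reverse ? n - 1 : 0;
    int k;

    for (k = 0; k < n; k++, i += step)
    {
        if (exclusive) { out[i] = sum; sum += in[i]; }
        else           { sum += in[i]; out[i] = sum; }
    }
}
```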
buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for ( i = 0; i < 2; i ++ ) + { + if ( buffer[i] ) + { + free( buffer[i] ); + } + } + for ( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _cumsum_exec() */ +/* + * Kernel params + */ +static vx_param_description_t _cumsum_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _CUMSUM_PARAM_NUM _cnt_of_array( _cumsum_kernel_param_def ) + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _cumsum_exec; + kernel->info.parameters = _cumsum_kernel_param_def; + kernel->info.numParams = _CUMSUM_PARAM_NUM; + + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( inputs, outputs, kernel ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 2; + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + int32_t exclusive = vsi_nn_kernel_param_get_int32( params, "exclusive" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &exclusive ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[2] ); + vsi_nn_kernel_scalar_release( &backend_params[3] ); + vsi_nn_kernel_scalar_release( &backend_params[4] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( cumsum, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c index 7c6c480..061d5bc 100644 --- a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c @@ -50,6 +50,9 @@ typedef enum UNARY_HGELU, UNARY_SELU, UNARY_CELU, + UNARY_RCP, + UNARY_SIGN, + UNARY_SOFTSIGN, } unary_type_e; @@ -145,6 +148,21 @@ static float celu_eval(float x, float alpha) return positive + negative; } +static float rcp_eval(float x) +{ + return 1 / x; +} + +static float sign_eval(float x) +{ + return x > 0 ? 1.0f : x < 0 ? 
-1.0f : 0; +} + +static float softsign_eval(float x) +{ + return x / (1.0f + vsi_abs(x)); +} + DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) ( vsi_nn_kernel_node_t node, @@ -227,6 +245,15 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) case UNARY_CELU: data = celu_eval(data, alpha); break; + case UNARY_RCP: + data = rcp_eval(data); + break; + case UNARY_SIGN: + data = sign_eval(data); + break; + case UNARY_SOFTSIGN: + data = softsign_eval(data); + break; default: break; } @@ -360,4 +387,7 @@ REGISTER_ELTWISE_UNARY_BACKEND_CPU( round, UNARY_ROUND ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( gelu, UNARY_GELU ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_gelu, UNARY_HGELU ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( selu, UNARY_SELU ) -REGISTER_ELTWISE_UNARY_BACKEND_CPU( celu, UNARY_CELU ) \ No newline at end of file +REGISTER_ELTWISE_UNARY_BACKEND_CPU( celu, UNARY_CELU ) +REGISTER_ELTWISE_UNARY_BACKEND_CPU( rcp, UNARY_RCP ) +REGISTER_ELTWISE_UNARY_BACKEND_CPU( sign, UNARY_SIGN ) +REGISTER_ELTWISE_UNARY_BACKEND_CPU( softsign, UNARY_SOFTSIGN ) \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/cpu/maxpoolwithargmax_cpu.c b/src/tim/vx/internal/src/kernel/cpu/maxpoolwithargmax_cpu.c new file mode 100644 index 0000000..900451a --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/maxpoolwithargmax_cpu.c @@ -0,0 +1,284 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
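A few hand-worked values for the three unary references added above, useful as quick expectations in a unit test:

```c
/* Hand-worked expectations:
 *   rcp_eval(2.0f)      == 0.5f
 *   sign_eval(-3.2f)    == -1.0f,  sign_eval(0.0f) == 0.0f,  sign_eval(7.0f) == 1.0f
 *   softsign_eval(3.0f) == 3.0f / (1.0f + 3.0f) == 0.75f   (softsign stays inside (-1, 1))
 */
```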
+ */ +#define _CPU_ARG_NUM (8) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (2) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.maxpoolwithargmax") + +#define FP32_MIN -3.4e38 + +/* + * Kernel params + */ +static vx_param_description_t _maxpoolwithargmax_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} + // Add kererl parameters here +}; +#define _MAXPOOLWITHARGMAX_PARAM_NUM _cnt_of_array( _maxpoolwithargmax_kernel_param_def ) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_maxpoolwithargmax_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + int32_t ksize_x = 0, ksize_y = 0, stride_x = 0, stride_y = 0; + int32_t pad_left = 0, pad_right = 0, pad_top = 0, pad_bottom = 0; + int32_t i = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &ksize_x); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &ksize_y); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &stride_x); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &stride_y); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &pad_left); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &pad_right); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &pad_top); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &pad_bottom); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + buffer[2] = (float *)malloc( out_elements * sizeof(float) ); + 
CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); + memset( buffer[2], 0, out_elements * sizeof(float) ); + + { + int32_t dims_num = (int32_t)attr[1]->shape->size; + int32_t batch = dims_num > 3 ? (int32_t)attr[1]->shape->data[3] : 1; + int32_t depth = dims_num > 2 ? (int32_t)attr[1]->shape->data[2] : 1; + int32_t height_o = (int32_t)attr[1]->shape->data[1]; + int32_t width_o = (int32_t)attr[1]->shape->data[0]; + int32_t width = (int32_t)attr[0]->shape->data[0]; + int32_t height = (int32_t)attr[0]->shape->data[1]; + int32_t b = 0, d = 0, j = 0; + int32_t output_base = 0; + int32_t input_base = 0; + + for (b = 0; b < batch; b++) + { + for (d = 0; d < depth; d++) + { + output_base = b * depth * height_o * width_o + d * height_o * width_o; + input_base = b * depth * height * width + d * height * width; + for (j = 0; j < height_o; j++) + { + for (i = 0; i < width_o; i++) + { + int32_t hstart = j * stride_y - pad_top; + int32_t wstart = i * stride_x - pad_left; + int32_t hend = vsi_nn_min(hstart + ksize_y, height); + int32_t wend = vsi_nn_min(wstart + ksize_x, width); + int32_t pool_index = output_base + j * width_o + i; + int32_t h = 0, w = 0; + int32_t index_max = 0; + float value_max = (float)FP32_MIN; + + hstart = vsi_nn_max(hstart, 0); + wstart = vsi_nn_max(wstart, 0); + + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + int32_t index = input_base + h * width + w; + float data = buffer[0][index]; + + if (data > value_max) + { + value_max = data; + index_max = index; + } + } + } + buffer[1][pool_index] = value_max; + buffer[2][pool_index] = (float)index_max; + } + } + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + status |= vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], + buffer[2], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for ( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if ( buffer[i] ) + { + free( buffer[i] ); + } + } + for ( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _maxpoolwithargmax_exec() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _maxpoolwithargmax_exec; + kernel->info.parameters = _maxpoolwithargmax_kernel_param_def; + kernel->info.numParams = _MAXPOOLWITHARGMAX_PARAM_NUM; + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_MAXPOOLWITHARGMAX_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + + int32_t ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x"); + int32_t ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y"); + int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x"); + int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y"); + int32_t pad_left = vsi_nn_kernel_param_get_int32(params, "pad_left"); + int32_t pad_right = vsi_nn_kernel_param_get_int32(params, "pad_right"); + int32_t 
pad_top = vsi_nn_kernel_param_get_int32(params, "pad_top"); + int32_t pad_bottom = vsi_nn_kernel_param_get_int32(params, "pad_bottom"); + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + int32_t index = 3; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _MAXPOOLWITHARGMAX_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_left ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_right ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_top ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_bottom ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _MAXPOOLWITHARGMAX_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( maxpoolwithargmax, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/mod_cpu.c b/src/tim/vx/internal/src/kernel/cpu/mod_cpu.c new file mode 100644 index 0000000..b391edd --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/mod_cpu.c @@ -0,0 +1,247 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
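Since the argmax output written by the maxpoolwithargmax executor above is a flat index into the whole input tensor (`input_base + h * width + w`), downstream code usually needs to decode it back into coordinates. A sketch of that decoding, assuming the same WHCN dimension order used by the kernel:

```c
/* Decode a flat argmax index back to (w, h, d, b), assuming WHCN layout. */
static void decode_argmax(int index, int width, int height, int depth,
                          int *w, int *h, int *d, int *b)
{
    *w = index % width;   index /= width;
    *h = index % height;  index /= height;
    *d = index % depth;   index /= depth;
    *b = index;
}
```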
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.mod") + +/* + * Kernel params + */ +static vx_param_description_t _mod_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _MOD_PARAM_NUM _cnt_of_array( _mod_kernel_param_def ) + +static vsi_ssize_t _expand_offset + ( + vsi_ssize_t index, + vsi_size_t * shape, vsi_size_t rank, + vsi_size_t * strides, vsi_size_t * out_shape + ) +{ + vsi_size_t i; + vsi_ssize_t offset = 0; + + for( i = 0; i < rank && index; i ++ ) + { + if( shape[i] == out_shape[i] ) + { + offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); + } + index /= out_shape[i]; + } + return offset; +} + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + int32_t isfmod = 0; + vsi_nn_kernel_dtype_e input0_dtype = F16; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float* f32_in_buffer[_INPUT_NUM] = {NULL}; + float* f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t* in_attr[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t* out_attr[_OUTPUT_NUM] = {NULL}; + vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + + /* prepare data */ + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &isfmod); + for (i = 0; i < _INPUT_NUM; i++) { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create(input[i]); + vsi_nn_kernel_tensor_attr_get_stride(in_attr[i], in_stride_size[i]); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer(input[i], in_attr[i], TRUE); + CHECK_PTR_FAIL_GOTO(f32_in_buffer[i], "Create input0 buffer fail.", final); + } + + input0_dtype = in_attr[0]->dtype; + if (input0_dtype == F16 || input0_dtype == F32 || input0_dtype == BF16) { + isfmod = 1; + } + + for (i = 0; i < _OUTPUT_NUM; i++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create(output[i]); + vsi_nn_kernel_tensor_attr_get_stride(out_attr[i], out_stride_size[i]); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size(out_attr[i]); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float*)malloc(out_bytes[i]); + CHECK_PTR_FAIL_GOTO(f32_out_buffer[i], "Create output buffer fail.", final); + memset(f32_out_buffer[i], 0, out_bytes[i]); + } + + for (i = 0; i < out_elements[0]; i++) + { + vsi_ssize_t in0_offset = 0; + vsi_ssize_t in1_offset = 0; + float in0 = 0; + float in1 = 0; + + in0_offset = _expand_offset( i, in_attr[0]->shape->data, 
(vsi_size_t)in_attr[0]->shape->size, + in_stride_size[0], out_attr[0]->shape->data ); + in1_offset = _expand_offset( i, in_attr[1]->shape->data, (vsi_size_t)in_attr[1]->shape->size, + in_stride_size[1], out_attr[0]->shape->data ); + in0 = f32_in_buffer[0][in0_offset]; + in1 = f32_in_buffer[1][in1_offset]; + if (isfmod) + { + f32_out_buffer[0][i] = (float)fmod(in0,in1); + } + else + { + f32_out_buffer[0][i] = in0 - in1 * (float)floor(in0 / in1); + } + } + + /* save data */ + for (i = 0; i < _OUTPUT_NUM; i++) { + status = vsi_nn_kernel_tensor_write_from_float( + output[i], out_attr[i], f32_out_buffer[i], out_elements[i]); + CHECK_STATUS_FAIL_GOTO(status, final); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) { + if (f32_in_buffer[i]) { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + + if (in_attr[i]) { + vsi_nn_kernel_tensor_attr_release(&in_attr[i]); + } + } + + for (i = 0; i < _OUTPUT_NUM; i++) { + if (f32_out_buffer[i]) { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + + if (out_attr[i]) { + vsi_nn_kernel_tensor_attr_release(&out_attr[i]); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _mod_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _mod_kernel_param_def ); + + return VSI_SUCCESS; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_MOD_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t isfmod = vsi_nn_kernel_param_get_int32(params, "isfmod"); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _MOD_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &isfmod ); + /* Pass parameters to node. 
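The `isfmod` switch in `_compute()` above only changes the result for negative operands, and floating-point inputs are forced onto the fmod path by the dtype check. A hand-worked example of the difference:

```c
/* Hand-worked example of the two branches:
 *   fmod(-7, 3)                       = -1   (truncating division; sign follows the dividend)
 *   -7 - 3 * floor(-7 / 3) = -7 + 9   =  2   (floored division;    sign follows the divisor)
 */
```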
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _MOD_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( mod, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb888_planar_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb888_planar_cpu.c index 1af66f0..845c167 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb888_planar_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb888_planar_cpu.c @@ -55,8 +55,8 @@ __BEGIN_DECLS static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -90,12 +90,16 @@ DEF_KERNEL_EXECUTOR(_compute) uint32_t i = 0; int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0; float mean[3] = {0}, scale = 1; + vsi_bool is_rgb888 = tensors[1] == NULL; for (i = 0; i < _CPU_IO_NUM; i++) { tensors[i] = (vsi_nn_kernel_tensor_t)param[i]; - attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] ); - CHECK_PTR_FAIL_GOTO( attr[i], "Create tensor attr buffer fail.", final ); + if (tensors[i]) + { + attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] ); + CHECK_PTR_FAIL_GOTO( attr[i], "Create tensor attr buffer fail.", final ); + } } out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); @@ -113,8 +117,11 @@ DEF_KERNEL_EXECUTOR(_compute) for (i = 0; i < 3; i++) { - buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE ); - CHECK_PTR_FAIL_GOTO( buffer[i], "Create input0 buffer fail.", final ); + if (tensors[i]) + { + buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[i], "Create input0 buffer fail.", final ); + } buffer[i + 3] = (float *)malloc( out_elements * sizeof(float) ); CHECK_PTR_FAIL_GOTO( buffer[i + 3], "Create output buffer fail.", final ); @@ -125,12 +132,17 @@ DEF_KERNEL_EXECUTOR(_compute) int32_t line1[2], line2[2]; int32_t dx = 0, dy = 0, idx = 0; int32_t src_width = (int32_t)attr[0]->shape->data[0]; + int32_t src_height = (int32_t)attr[0]->shape->data[1]; int32_t dst_width = (int32_t)attr[3]->shape->data[0]; int32_t dst_height = (int32_t)attr[3]->shape->data[1]; uint8_t result = 0; + int32_t offset = 0; + int32_t index = 0; for ( idx = 0; idx < 3; idx ++) { + offset = is_rgb888 ? idx * src_width * src_height : 0; + index = is_rgb888 ? 
0 : idx; for ( dy = 0; dy < (int32_t)dst_height; dy ++) { for ( dx = 0; dx < (int32_t)dst_width; dx ++) @@ -170,10 +182,10 @@ DEF_KERNEL_EXECUTOR(_compute) sy += yOffset; source_index = (sx + sy * src_width); - line1[0] = (int32_t)buffer[idx][source_index]; - line1[1] = (int32_t)buffer[idx][source_index + 1]; - line2[0] = (int32_t)buffer[idx][source_index + src_width]; - line2[1] = (int32_t)buffer[idx][source_index + src_width + 1]; + line1[0] = (int32_t)buffer[index][source_index + offset]; + line1[1] = (int32_t)buffer[index][source_index + 1 + offset]; + line2[0] = (int32_t)buffer[index][source_index + src_width + offset]; + line2[1] = (int32_t)buffer[index][source_index + src_width + 1 + offset]; temp1 = fx * (line1[1] - line1[0]) + (line1[0] << 10); temp2 = fx * (line2[1] - line2[0]) + (line2[0] << 10); @@ -184,10 +196,10 @@ DEF_KERNEL_EXECUTOR(_compute) } else { - int32_t offset = xOffset + yOffset * src_width; - source_index = dx + dy * src_width + offset; - finalVal = (buffer[0][source_index] - mean[idx]) * scale; - buffer[1][output_index] = finalVal; + int32_t ofset = xOffset + yOffset * src_width; + source_index = dx + dy * src_width + ofset + offset; + finalVal = (buffer[index][source_index] - mean[idx]) * scale; + buffer[idx + 3][output_index] = finalVal; } } } diff --git a/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c b/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c index a5f0467..82e9c1a 100644 --- a/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c @@ -209,16 +209,15 @@ DEF_KERNEL_EXECUTOR(_compute) for (n = 0; n < num_rois; n++) { uint32_t batchId = (uint32_t)f32_in_buffer[2][n]; - float scale = (in_attr[1]->dtype == U16) ? 0.125f : 1.0f; float qx1 = f32_in_buffer[1][n * kRoiDim]; float qy1 = f32_in_buffer[1][n * kRoiDim + 1]; float qx2 = f32_in_buffer[1][n * kRoiDim + 2]; float qy2 = f32_in_buffer[1][n * kRoiDim + 3]; - float x1 = qx1 * scale; - float x2 = qx2 * scale; - float y1 = qy1 * scale; - float y2 = qy2 * scale; + float x1 = qx1; + float x2 = qx2; + float y1 = qy1; + float y2 = qy2; float roi_anchor_x = x1 * width_scale; float roi_anchor_y = y1 * height_scale; float roi_dims_x = vsi_nn_max((x2 - x1) * width_scale, 1.0f); diff --git a/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c new file mode 100644 index 0000000..cad8476 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c @@ -0,0 +1,770 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
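With inputs 1 and 2 now optional, the pre-process kernel above handles both three separate plane tensors and a single packed planar tensor whose R/G/B planes are consecutive width*height blocks; the `offset`/`index` pair selects between the two. A small sketch of that addressing (hypothetical helper, not part of the patch):

```c
#include <stddef.h>

/* Return the base pointer of one colour plane for either input layout. */
static const float *plane_base(const float *const bufs[3], int plane,
                               int src_width, int src_height, int packed_rgb888)
{
    if (packed_rgb888)
    {
        /* one tensor, planes stacked as consecutive width*height blocks */
        return bufs[0] + (size_t)plane * src_width * src_height;
    }
    return bufs[plane];   /* one tensor per plane */
}
```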
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ + +#define KERNEL_SOURCE_1 "cumsum" +#define KERNEL_SOURCE_2 "cumsum_2d" +#define KERNEL_SOURCE_3 "cumsum_bf16" +#define KERNEL_SOURCE_4 "cumsum_f16_u8" + +// Add kernel hashtable here +#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ + ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + +#define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ + SOURCE }, + +#define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } cumsum_map[] = +{ + HASH_CUMSUM_KERNELS(0, U8, U8, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(0, I8, I8, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(0, I16, I16, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(0, F16, F16, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(0, BF16, BF16, KERNEL_SOURCE_3) + HASH_CUMSUM_KERNELS(1, U8, U8, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(1, I8, I8, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(1, I16, I16, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(1, F16, F16, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(1, BF16, BF16, KERNEL_SOURCE_3) + HASH_CUMSUM_KERNELS(2, U8, U8, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(2, I8, I8, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(2, I16, I16, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(2, F16, F16, KERNEL_SOURCE_1) + HASH_CUMSUM_KERNELS(2, BF16, BF16, KERNEL_SOURCE_3) + HASH_CUMSUM_KERNELS_2D(0, U8, U8, KERNEL_SOURCE_2) + HASH_CUMSUM_KERNELS_2D(0, I8, I8, KERNEL_SOURCE_2) + HASH_CUMSUM_KERNELS_2D(0, I16, I16, KERNEL_SOURCE_2) + HASH_CUMSUM_KERNELS_2D(0, F16, F16, KERNEL_SOURCE_2) + HASH_CUMSUM_KERNELS_2D(0, BF16, BF16, KERNEL_SOURCE_3) + HASH_CUMSUM_KERNELS_2D(1, U8, U8, KERNEL_SOURCE_2) + HASH_CUMSUM_KERNELS_2D(1, I8, I8, KERNEL_SOURCE_2) + HASH_CUMSUM_KERNELS_2D(1, I16, I16, KERNEL_SOURCE_2) + HASH_CUMSUM_KERNELS_2D(1, F16, F16, KERNEL_SOURCE_2) + HASH_CUMSUM_KERNELS_2D(1, BF16, BF16, KERNEL_SOURCE_3) + HASH_CUMSUM_KERNELS(0, F16, U8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS(0, F16, I8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS(0, F16, I16, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS(1, F16, U8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS(1, F16, I8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS(1, F16, I16, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS(2, F16, U8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS(2, F16, I8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS(2, F16, I16, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS_2D(0, F16, U8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS_2D(0, F16, I8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS_2D(0, F16, I16, 
KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS_2D(1, F16, U8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS_2D(1, F16, I8, KERNEL_SOURCE_4) + HASH_CUMSUM_KERNELS_2D(1, F16, I16, KERNEL_SOURCE_4) +}; + +/* + * Kernel params + */ +static vx_param_description_t _cumsum_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _CUMSUM_PARAM_NUM _cnt_of_array( _cumsum_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_cumsum_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + int32_t axis = 0; + int32_t width = 0; + int32_t height = 0; + int32_t channel = 0; + int32_t w = 1; + int32_t h = 1; + int32_t c = 1; + uint32_t dim = 1; + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + vsi_size_array_t * input_shape = NULL; + int32_t input_zp = 0; + float input_scale = 1.0f; + float output_zp = 0; + float output_scale = 1.0f; + float in_out_zp_scale = 1.0f; + float in_out_scale = 1.0f; + + uint32_t pack_key = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[0]->dfp.fl > 0) + { + input_scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + input_scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + input_scale = attr[0]->asymm.scale; + input_zp = attr[0]->asymm.zero_point; + } + + if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + if (attr[1]->dfp.fl > 0) + { + output_scale = (float)((int64_t)1 << attr[1]->dfp.fl); + } + else + { + output_scale = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl)); + } + } + else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + output_scale = 1.0f / attr[1]->asymm.scale; + output_zp = (float)attr[1]->asymm.zero_point; + } + + in_out_scale = input_scale * output_scale; + in_out_zp_scale = (float)in_out_scale * input_zp; + + input_shape = attr[0]->shape; + dim = (uint32_t)input_shape->size; + width = (int32_t)(input_shape->data[0]); + height = (int32_t)(input_shape->data[1]); + channel = (int32_t)(dim > 2 ? 
input_shape->data[2] : 1); + + + if (axis == 0) + { + w = 1; + h = height; + c = channel; + } + else if (axis == 1) + { + w = width; + h = 1; + c = channel; + } + else if (axis == 2) + { + w = width; + h = height; + c = 1; + } + + shaderParam.global_scale[0] = 8; + if ((attr[0]->dtype == U8 || attr[0]->dtype == I8) + && (axis > 0)) + { + shaderParam.global_scale[0] = 16; + } + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = (w + shaderParam.global_scale[0] - 1) / shaderParam.global_scale[0]; + shaderParam.global_size[1] = h; + shaderParam.global_size[2] = c; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, AXIS, DIM) \ + (IN0_TYPE | (OUT_TYPE << 8) | (AXIS << 16) | (DIM << 24)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, axis, dim); + + { + uint16_t M0 = 0; + int32_t postShift = 0; + uint32_t multAndoutZP0[2] = {0}; + gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniAccSumVertF16toF16_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x33221100, 0x77665544, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAccSumVertU8toI32A_4x4 = {{ + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAccSumVertU8toI32B_4x4 = {{ + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00150004, 0x00370026, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAccSumVertU8toI32C_4x4 = {{ + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00190008, 0x003b002a, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAccSumVertU8toI32D_4x4 = {{ + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x001d000c, 0x003f002e, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniSumHorzF16toF16A_4x4 = {{ + 0x55150501, // TCfg + 0x00000000, // ASelt + 0x00100000, 0x32100210, // ABin + 0xaa2a0a02, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 
0x00003c00, 0x3c003c00, 0x3c003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSumHorzF16toF16B_4x4 = {{ + 0x55150501, // TCfg + 0x00000000, // ASelt + 0x00540004, 0x76540654, // ABin + 0xaa2a0a02, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00003c00, 0x3c003c00, 0x3c003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSumHorzF16toF16C_2x8 = {{ + 0x55551111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x37363534, // ABin + 0xaaaa2222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAccSumHorzF16toF16_2x8 = {{ + 0x55555555, // TCfg + 0x44444444, // ASelt + 0x73727170, 0x77767574, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniSumHorzU8toI16A_4x4 = {{ + 0x55150501, // TCfg + 0x00000000, // ASelt + 0x00100000, 0x32100210, // ABin + 0xaa2a0a02, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000700, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSumHorzU8toI16B_8x4 = {{ + 0x05550155, 0x55551555, // TCfg + 0x00418820, 0x41882000, 0x8820000a, 0x20018a41, 0x398a4188, // BinSelect + 0x00000700, // AccumType, ConstantType, and PostShift + 0x01010101, 0x00000001, 0x01010101, 0x00000101, + 0x01010101, 0x00010101, 0x01010101, 0x01010101 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSubZpI16toI16_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00020001, 0x00030001, 0x00040001, + 0x00050001, 0x00060001, 0x00070001, 0x00080001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAccSumHorzI16toI32A_4x4 = {{ + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00310030, 0x00330032, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniAccSumHorzI16toI32B_4x4 = {{ + 0x0d0d0d0d, // TCfg + 0x04040404, // ASelt + 0x00350034, 0x00370036, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 
0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniSetZeroF16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_quantize_multiplier_16bit( (double)input_scale * output_scale, &M0, &postShift); + multAndoutZP0[0] = (uint32_t)(M0); + multAndoutZP0[1] = (uint32_t)((attr[1]->asymm.zero_point << postShift) - input_zp * M0); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift ); + + status = vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + switch( pack_key ) + { + case _PACK_SELECT_KEY( U8, U8, 2, 3): + case _PACK_SELECT_KEY( I8, I8, 2, 3): + case _PACK_SELECT_KEY( I16, I16, 2, 3): + case _PACK_SELECT_KEY( F16, F16, 2, 3): + { + status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel); + status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "in_out_scale", &in_out_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "in_out_zp_scale", &in_out_zp_scale); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertF16toF16_2x8", &uniAccSumVertF16toF16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertU8toI32A_4x4", &uniAccSumVertU8toI32A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertU8toI32B_4x4", &uniAccSumVertU8toI32B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertU8toI32C_4x4", &uniAccSumVertU8toI32C_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertU8toI32D_4x4", &uniAccSumVertU8toI32D_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzF16toF16A_4x4", &uniSumHorzF16toF16A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzF16toF16B_4x4", &uniSumHorzF16toF16B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzF16toF16C_2x8", &uniSumHorzF16toF16C_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumHorzF16toF16_2x8", &uniAccSumHorzF16toF16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzU8toI16A_4x4", &uniSumHorzU8toI16A_4x4 ); + status |= 
vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzU8toI16B_8x4", &uniSumHorzU8toI16B_8x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSubZpI16toI16_2x8", &uniSubZpI16toI16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumHorzI16toI32A_4x4", &uniAccSumHorzI16toI32A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumHorzI16toI32B_4x4", &uniAccSumHorzI16toI32B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, U8, 0, 2): + case _PACK_SELECT_KEY( U8, U8, 1, 2): + case _PACK_SELECT_KEY( U8, U8, 0, 3): + case _PACK_SELECT_KEY( U8, U8, 1, 3): + case _PACK_SELECT_KEY( I8, I8, 0, 2): + case _PACK_SELECT_KEY( I8, I8, 1, 2): + case _PACK_SELECT_KEY( I8, I8, 0, 3): + case _PACK_SELECT_KEY( I8, I8, 1, 3): + case _PACK_SELECT_KEY( I16, I16, 0, 2): + case _PACK_SELECT_KEY( I16, I16, 1, 2): + case _PACK_SELECT_KEY( I16, I16, 0, 3): + case _PACK_SELECT_KEY( I16, I16, 1, 3): + case _PACK_SELECT_KEY( F16, F16, 0, 2): + case _PACK_SELECT_KEY( F16, F16, 1, 2): + case _PACK_SELECT_KEY( F16, F16, 0, 3): + case _PACK_SELECT_KEY( F16, F16, 1, 3): + { + status = vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "in_out_scale", &in_out_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "in_out_zp_scale", &in_out_zp_scale); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertF16toF16_2x8", &uniAccSumVertF16toF16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertU8toI32A_4x4", &uniAccSumVertU8toI32A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertU8toI32B_4x4", &uniAccSumVertU8toI32B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertU8toI32C_4x4", &uniAccSumVertU8toI32C_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumVertU8toI32D_4x4", &uniAccSumVertU8toI32D_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzF16toF16A_4x4", &uniSumHorzF16toF16A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzF16toF16B_4x4", &uniSumHorzF16toF16B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzF16toF16C_2x8", &uniSumHorzF16toF16C_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumHorzF16toF16_2x8", &uniAccSumHorzF16toF16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzU8toI16A_4x4", &uniSumHorzU8toI16A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumHorzU8toI16B_8x4", &uniSumHorzU8toI16B_8x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSubZpI16toI16_2x8", &uniSubZpI16toI16_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumHorzI16toI32A_4x4", &uniAccSumHorzI16toI32A_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniAccSumHorzI16toI32B_4x4", &uniAccSumHorzI16toI32B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( BF16, BF16, 0, 2): + case _PACK_SELECT_KEY( BF16, BF16, 1, 2): + case _PACK_SELECT_KEY( BF16, BF16, 0, 3): + case _PACK_SELECT_KEY( BF16, BF16, 1, 3): + case _PACK_SELECT_KEY( BF16, BF16, 2, 3): + { + status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel); + 
status |= vsi_nn_kernel_gpu_add_param( + node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniExtractOddData_2x8", &uniExtractOddData_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, U8, 0, 2): + case _PACK_SELECT_KEY( F16, U8, 1, 2): + case _PACK_SELECT_KEY( F16, U8, 0, 3): + case _PACK_SELECT_KEY( F16, U8, 1, 3): + case _PACK_SELECT_KEY( F16, U8, 2, 3): + case _PACK_SELECT_KEY( F16, I8, 0, 2): + case _PACK_SELECT_KEY( F16, I8, 1, 2): + case _PACK_SELECT_KEY( F16, I8, 0, 3): + case _PACK_SELECT_KEY( F16, I8, 1, 3): + case _PACK_SELECT_KEY( F16, I8, 2, 3): + case _PACK_SELECT_KEY( F16, I16, 0, 2): + case _PACK_SELECT_KEY( F16, I16, 1, 2): + case _PACK_SELECT_KEY( F16, I16, 0, 3): + case _PACK_SELECT_KEY( F16, I16, 1, 3): + case _PACK_SELECT_KEY( F16, I16, 2, 3): + { + status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniAccSumVertF16toF16_2x8", &uniAccSumVertF16toF16_2x8); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniSumHorzF16toF16A_4x4", &uniSumHorzF16toF16A_4x4); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniSumHorzF16toF16B_4x4", &uniSumHorzF16toF16B_4x4); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniSumHorzF16toF16C_2x8", &uniSumHorzF16toF16C_2x8); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniAccSumHorzF16toF16_2x8", &uniAccSumHorzF16toF16_2x8); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8); + status |= vsi_nn_kernel_gpu_add_param( + node, "multAndoutZP0", &multAndoutZP0); + status |= vsi_nn_kernel_gpu_add_param( + node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } +#undef _PACK_SELECT_KEY + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + const vsi_nn_kernel_param_t * params, + int32_t axis, + int32_t is_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d); + + for( i = 0; i < _cnt_of_array(cumsum_map); i ++ ) + { + if ( cumsum_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(cumsum_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", cumsum_map[i].function_name ); + kernel->info.parameters = _cumsum_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _cumsum_kernel_param_def ); + kernel->info.initialize = _cumsum_initializer; + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + cumsum_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + cumsum_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + 
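Editorial note, not part of the patch: both the initializer above and _query_kernel select behavior through a packed 32-bit key — the visible _PACK_SELECT_KEY macro packs input dtype, output dtype, axis and dim as (IN0_TYPE | (OUT_TYPE << 8) | (AXIS << 16) | (DIM << 24)), and the kernel map is then scanned linearly for an exact match. The standalone C sketch below only illustrates that dispatch pattern under those assumptions; the dtype codes, table contents, and the names PACK_KEY, kernel_entry_t and find_kernel are hypothetical and are not the driver's actual definitions.

/* Minimal sketch of the packed-key kernel dispatch used in this patch.
 * The bit layout mirrors _PACK_SELECT_KEY; everything else is illustrative. */
#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

enum { DT_U8 = 1, DT_I8, DT_I16, DT_F16 };   /* placeholder dtype codes */

#define PACK_KEY(in, out, axis, dim) \
    ((uint32_t)(in) | ((uint32_t)(out) << 8) | \
     ((uint32_t)(axis) << 16) | ((uint32_t)(dim) << 24))

typedef struct {
    uint32_t    key;          /* packed (dtype_in, dtype_out, axis, dim) */
    const char *kernel_name;  /* shader entry the driver would bind */
} kernel_entry_t;

static const kernel_entry_t table[] = {
    { PACK_KEY(DT_U8,  DT_U8,  2, 3), "cumsum_axis2_U8toU8"      },
    { PACK_KEY(DT_F16, DT_F16, 1, 2), "cumsum_2D_axis1_F16toF16" },
};

/* Linear scan for an exact key match, as _query_kernel does over cumsum_map. */
static const char *find_kernel(uint32_t key)
{
    size_t i;
    for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
        if (table[i].key == key) {
            return table[i].kernel_name;   /* first exact match wins */
        }
    }
    return NULL;                           /* unsupported combination */
}

int main(void)
{
    const char *name = find_kernel(PACK_KEY(DT_F16, DT_F16, 1, 2));
    printf("%s\n", name ? name : "no kernel");
    return 0;
}

A combination not present in the table simply yields no match, which corresponds to _query_kernel returning VSI_FAILURE and the op falling back or being rejected.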
+static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_CUMSUM_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + vsi_size_t shapes[1][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + int32_t exclusive = vsi_nn_kernel_param_get_int32( params, "exclusive" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + int32_t axis_new = 0; + int32_t is_2d = 0; + uint32_t rs_dim = 2; + int32_t i = 0; + + vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rs_dim, &axis_new); + if (exclusive || reverse || rs_dim > 3) + { + return NULL; + } + + if (rs_dim == 2) + { + is_2d = 1; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], (vsi_size_t)rs_dim ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[0], (vsi_size_t)rs_dim ); + + status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 2; + + /* Pass parameters to node. */ + vsi_nn_kernel_node_pack_io( tmp_params, _CUMSUM_PARAM_NUM, + reshape_tensors, 1, &reshape_tensors[1], 1 ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_new ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &exclusive ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _CUMSUM_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &tmp_params[2] ); + vsi_nn_kernel_scalar_release( &tmp_params[3] ); + vsi_nn_kernel_scalar_release( &tmp_params[4] ); + } + } + + for (i = 0; i < 2; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( cumsum, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c index 6e85e40..dbbfc6e 100644 --- a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c @@ -53,6 +53,9 @@ typedef enum UNARY_HGELU, UNARY_SELU, UNARY_CELU, + UNARY_RCP, + UNARY_SIGN, + UNARY_SOFTSIGN, } unary_type_e; /* @@ -94,6 +97,34 @@ typedef enum #define HGELU_OPERATION hard_gelu #define SELU_OPERATION selu #define CELU_OPERATION celu +#define RCP_OPERATION rcp +#define SIGN_OPERATION sign +#define SOFTSIGN_OPERATION softsign + +#define ADD_UNARY_SH_KERNELS(name, source) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, BF16, BF16, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, BF16, BF16, source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16, F16, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16, F16, source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16, I16, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16, I16, source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16, U8, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16, U8, 
source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16, I8, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16, I8, source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I16, I16, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I16, I16, source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I16, F16, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I16, F16, source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I8, I8, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I8, I8, source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I8, F16, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I8, F16, source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, U8, U8, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, U8, U8, source##_2D) \ + TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, U8, F16, source##_3D) \ + TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, U8, F16, source##_2D) \ + static const struct { uint32_t key; @@ -101,269 +132,22 @@ static const struct { const char* source_name; } _eltwise_unary_evis_kernel_map[] = { - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, U8, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, U8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I8, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, BF16, BF16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, U8, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, U8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I8, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, BF16, BF16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I16, F16, 
KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, U8, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, U8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I8, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, BF16, BF16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, U8, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, U8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I8, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, BF16, BF16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, U8, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, U8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I8, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, BF16, BF16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, U8, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, U8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I8, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, BF16, BF16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I16, I16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, 
UNARY_CELU, I16, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, U8, U8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, U8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I8, I8, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I8, F16, KERNEL_SOURCE1_3D) - TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, BF16, BF16, KERNEL_SOURCE1_3D) + ADD_UNARY_SH_KERNELS(SIN, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(COS, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(EXP, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(LOG, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(SELU, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(CELU, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(NEG, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(RCP, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(SIGN, KERNEL_SOURCE1) + ADD_UNARY_SH_KERNELS(SOFTSIGN, KERNEL_SOURCE1) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, BF16, BF16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, BF16, BF16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I8, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, BF16, BF16, KERNEL_SOURCE1_2D) 
- TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I8, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, BF16, BF16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, U8, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, U8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I8, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, BF16, BF16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I8, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, BF16, BF16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I16, I16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I16, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, U8, U8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, U8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I8, I8, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I8, F16, KERNEL_SOURCE1_2D) - TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, 
BF16, BF16, KERNEL_SOURCE1_2D) - - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, U8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, I16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, I8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, BF16, BF16, KERNEL_SOURCE0_3D) - - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, BF16, BF16, KERNEL_SOURCE0_2D) - - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, I16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, U8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, I8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I16, I16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I16, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, U8, U8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, U8, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I8, I8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I8, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, BF16, BF16, KERNEL_SOURCE0_3D) - - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I16, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, 
UNARY_MISH, U8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, BF16, BF16, KERNEL_SOURCE0_2D) - - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, I16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, U8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, I8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I16, I16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I16, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, U8, U8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, U8, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I8, I8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I8, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16, KERNEL_SOURCE0_3D) - - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16, KERNEL_SOURCE0_2D) - - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, I16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, U8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, I8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I16, I16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I16, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, U8, U8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, U8, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I8, I8, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I8, F16, KERNEL_SOURCE0_3D) - TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, BF16, BF16, KERNEL_SOURCE0_3D) - - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, F16, 
KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, BF16, BF16, KERNEL_SOURCE0_2D) - - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, I16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, I8, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, F16, KERNEL_SOURCE0_2D) - TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, BF16, BF16, KERNEL_SOURCE0_2D) + ADD_UNARY_SH_KERNELS(HSIGMOID, KERNEL_SOURCE0) + ADD_UNARY_SH_KERNELS(MISH, KERNEL_SOURCE0) + ADD_UNARY_SH_KERNELS(ROUND, KERNEL_SOURCE0) + ADD_UNARY_SH_KERNELS(GELU, KERNEL_SOURCE0) + ADD_UNARY_SH_KERNELS(HGELU, KERNEL_SOURCE0) }; #undef SIN_OPERATION @@ -378,6 +162,9 @@ static const struct { #undef GELU_OPERATION #undef HGELU_OPERATION #undef CELU_OPERATION +#undef RCP_OPERATION +#undef SIGN_OPERATION +#undef SOFTSIGN_OPERATION /* * Kernel params */ @@ -509,6 +296,9 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) case _PACK_SELECT_KEY( UNARY_GELU, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_HGELU, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_CELU, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_RCP, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_SIGN, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_SOFTSIGN, BF16, BF16 ): { gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ 0x11111111, // TCfg @@ -815,5 +605,8 @@ REGISTER_ELTWISE_UNARY_BACKEND_EVIS( gelu, UNARY_GELU ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_gelu, UNARY_HGELU ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( selu, UNARY_SELU ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( celu, UNARY_CELU ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( rcp, UNARY_RCP ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( sign, UNARY_SIGN ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( softsign, UNARY_SOFTSIGN ) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c index 3dc67d2..499bc5a 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c @@ -222,7 +222,7 @@ static vsi_status get_gather_tensor_reshape_size uint32_t i = 0; vsi_size_t elementCnt = 1; vsi_size_t outerCnt = 1; -#define VSI_NN_MAX_IMAGE_WIDTH (65536) +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH for(i = 0; i < dims_num - batch_dims; ++i) { @@ -751,7 +751,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_t * kernel ) { -#define VSI_NN_MAX_BLOCK_SIZE (65536) +#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_GATHER_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; @@ -795,12 
+795,6 @@ static vsi_nn_kernel_node_t _setup reshape_tensors[2] = vsi_nn_reshape_tensor( graph, outputs[0], shapes[2], rs_dim ); - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) - { - return NULL; - } - status = _query_kernel( inputs, outputs, kernel, params, axis0_flg, is_array, is_batch); if ( VSI_SUCCESS == status) { diff --git a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c index 0692c07..05362bb 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c @@ -136,7 +136,7 @@ static vsi_status get_gather_nd_tensor_reshape_size vsi_size_t *input_size = inputs[0]->attr.size; uint32_t i = 0; vsi_size_t elementCnt = 1; -#define VSI_NN_MAX_IMAGE_WIDTH (65536) +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; for(i = 0; i < dims_num; ++i) diff --git a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c index 9693c29..8a9971f 100644 --- a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c @@ -34,8 +34,8 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_dtype_util.h" __BEGIN_DECLS @@ -45,78 +45,57 @@ __BEGIN_DECLS typedef enum { - INTERNAL_KERNEL_SUM_SQR, - INTERNAL_KERNEL_MEAN_VARI, + INTERNAL_KERNEL_SUMS, + INTERNAL_KERNEL_MEANS, INTERNAL_KERNEL_NORM, } _internal_kernel_e; -#define KERNEL_SOURCE_1 "group_normalization_i8" -#define KERNEL_SOURCE_2 "group_normalization_u8" -#define KERNEL_SOURCE_3 "group_normalization_i16" -#define KERNEL_SOURCE_4 "group_normalization_f16" -#define KERNEL_SOURCE_5 "group_normalization_u8_f16" -#define KERNEL_SOURCE_6 "group_normalization_i8_scale" -#define KERNEL_SOURCE_7 "group_normalization_i16_scale" -#define KERNEL_SOURCE_8 "group_normalization_f16_scale" +#define KERNEL_SOURCE_0 "group_normalization_0" +#define KERNEL_SOURCE_1 "group_normalization_1" +#define KERNEL_SOURCE_2 "group_normalization_2" -#define HASH_GROUPNORM_SUM_SQR_SH_KERNEL_NAME(SRC0_TYPE) \ - CVIVANTE_NAMESPACE("evis.group_norm_sumsqr_"#SRC0_TYPE) +#define HASH_GROUPNORM_SUMS_SH_KERNEL_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("evis.group_norm_sums_"#SRC0_TYPE) -#define HASH_GROUPNORM_SUM_SQR_SH_KERNEL_2D_NAME(SRC0_TYPE) \ - CVIVANTE_NAMESPACE("evis.group_norm_sumsqr_"#SRC0_TYPE"_2D") +#define HASH_GROUPNORM_SUMS_SH_KERNEL_2D_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("evis.group_norm_sums_"#SRC0_TYPE"_2D") -#define HASH_GROUPNORM_MEAN_VARI_SH_KERNEL_NAME \ - CVIVANTE_NAMESPACE("evis.group_norm_meanvari") - -#define HASH_GROUPNORM_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE"to"#DST_TYPE) - -#define HASH_GROUPNORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") +#define HASH_GROUPNORM_MEANS_SH_KERNEL_NAME \ + CVIVANTE_NAMESPACE("evis.group_norm_means") #define HASH_GROUPNORM_SCALE_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE) + CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE"_"#SRC1_TYPE"to"#DST_TYPE) #define HASH_GROUPNORM_SCALE_SH_KERNEL_2D_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ - 
CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_2D") + CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE"_"#SRC1_TYPE"to"#DST_TYPE"_2D") // Add kernel hashtable here // Sum Sqr -#define HASH_GROUPNORM_SUM_SQR_KEY(_input0_type, _output_type, _reshape_flag) \ +#define HASH_GROUPNORM_SUMS_KEY(_input0_type, _output_type, _reshape_flag) \ ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) -#define TENSOR_GROUPNORM_SUM_SQR_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_GROUPNORM_SUM_SQR_KEY(IN0_TYPE, OUT_TYPE, 0), \ - HASH_GROUPNORM_SUM_SQR_SH_KERNEL_NAME(IN0_TYPE), \ +#define TENSOR_GROUPNORM_SUMS_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_GROUPNORM_SUMS_SH_KERNEL_NAME(IN0_TYPE), \ SOURCE }, -#define TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_GROUPNORM_SUM_SQR_KEY(IN0_TYPE, OUT_TYPE, 1), \ - HASH_GROUPNORM_SUM_SQR_SH_KERNEL_2D_NAME(IN0_TYPE), \ +#define TENSOR_GROUPNORM_SUMS_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_GROUPNORM_SUMS_SH_KERNEL_2D_NAME(IN0_TYPE), \ SOURCE }, -#define HASH_GROUPNORM_MEAN_VARI_KEY(_input0_type, _output_type) \ +#define HASH_GROUPNORM_MEANS_KEY(_input0_type, _output_type) \ ((_input0_type << 24) | (_output_type << 16)) #define TENSOR_GROUPNORM_MEAN_VARI_KERNELS(SOURCE) \ - { HASH_GROUPNORM_MEAN_VARI_KEY(F32, F32), \ - HASH_GROUPNORM_MEAN_VARI_SH_KERNEL_NAME, \ + { HASH_GROUPNORM_MEANS_KEY(F32, F32), \ + HASH_GROUPNORM_MEANS_SH_KERNEL_NAME, \ SOURCE }, // normalization #define HASH_GROUPNORM_KEY(_input0_type, _input1_type, _output_type, _reshape_flag) \ ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_reshape_flag << 4)) -#define TENSOR_GROUPNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_GROUPNORM_KEY(IN0_TYPE, F16, OUT_TYPE, 0), \ - HASH_GROUPNORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ - SOURCE }, - -#define TENSOR_GROUPNORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_GROUPNORM_KEY(IN0_TYPE, F16, OUT_TYPE, 1), \ - HASH_GROUPNORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ - SOURCE }, - #define TENSOR_GROUPNORM_SCALE_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ { HASH_GROUPNORM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ HASH_GROUPNORM_SCALE_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ @@ -134,73 +113,73 @@ typedef struct const char * source_name; } _kernel_map_type; -static const _kernel_map_type _groupnorm_sum_sqr_kernel_map[] = +static const _kernel_map_type _groupnorm_sums_kernel_map[] = { // Register kernel here - TENSOR_GROUPNORM_SUM_SQR_KERNELS( I8, F32, KERNEL_SOURCE_1 ) - TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( I8, F32, KERNEL_SOURCE_1 ) - TENSOR_GROUPNORM_SUM_SQR_KERNELS( U8, F32, KERNEL_SOURCE_2 ) - TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( U8, F32, KERNEL_SOURCE_2 ) - TENSOR_GROUPNORM_SUM_SQR_KERNELS( I16, F32, KERNEL_SOURCE_3 ) - TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( I16, F32, KERNEL_SOURCE_3 ) - TENSOR_GROUPNORM_SUM_SQR_KERNELS( F16, F32, KERNEL_SOURCE_4 ) - TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( F16, F32, KERNEL_SOURCE_4 ) + TENSOR_GROUPNORM_SUMS_KERNELS(I8, F32, KERNEL_SOURCE_0) + TENSOR_GROUPNORM_SUMS_KERNELS_2D( I8, F32, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SUMS_KERNELS( U8, F32, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SUMS_KERNELS_2D( U8, F32, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SUMS_KERNELS( I16, F32, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SUMS_KERNELS_2D( I16, F32, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SUMS_KERNELS( F16, F32, KERNEL_SOURCE_2 ) + 
TENSOR_GROUPNORM_SUMS_KERNELS_2D( F16, F32, KERNEL_SOURCE_2 ) }; static const _kernel_map_type _groupnorm_mean_vari_kernel_map[] = { // Register kernel here - TENSOR_GROUPNORM_MEAN_VARI_KERNELS( KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_MEAN_VARI_KERNELS( KERNEL_SOURCE_0 ) }; static const _kernel_map_type _groupnorm_kernel_map[] = { // Register kernel here - TENSOR_GROUPNORM_KERNELS( I8, I8, KERNEL_SOURCE_1 ) - TENSOR_GROUPNORM_KERNELS_2D( I8, I8, KERNEL_SOURCE_1 ) - TENSOR_GROUPNORM_KERNELS( I8, F16, KERNEL_SOURCE_1 ) - TENSOR_GROUPNORM_KERNELS_2D( I8, F16, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I8, F16, I8, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I8, F16, I8, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I8, F16, F16, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I8, F16, F16, KERNEL_SOURCE_0 ) - TENSOR_GROUPNORM_KERNELS( U8, U8, KERNEL_SOURCE_2 ) - TENSOR_GROUPNORM_KERNELS_2D( U8, U8, KERNEL_SOURCE_2 ) - TENSOR_GROUPNORM_KERNELS( U8, F16, KERNEL_SOURCE_5 ) - TENSOR_GROUPNORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_5 ) + TENSOR_GROUPNORM_SCALE_KERNELS( U8, F16, U8, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( U8, F16, U8, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS( U8, F16, F16, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( U8, F16, F16, KERNEL_SOURCE_1 ) - TENSOR_GROUPNORM_KERNELS( I16, I16, KERNEL_SOURCE_3 ) - TENSOR_GROUPNORM_KERNELS_2D( I16, I16, KERNEL_SOURCE_3 ) - TENSOR_GROUPNORM_KERNELS( I16, F16, KERNEL_SOURCE_3 ) - TENSOR_GROUPNORM_KERNELS_2D( I16, F16, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I16, F16, I16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I16, F16, I16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I16, F16, F16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I16, F16, F16, KERNEL_SOURCE_2 ) - TENSOR_GROUPNORM_KERNELS( F16, F16, KERNEL_SOURCE_4 ) - TENSOR_GROUPNORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_4 ) - TENSOR_GROUPNORM_KERNELS( F16, U8, KERNEL_SOURCE_4 ) - TENSOR_GROUPNORM_KERNELS_2D( F16, U8, KERNEL_SOURCE_4 ) + TENSOR_GROUPNORM_SCALE_KERNELS( F16, F16, F16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F16, F16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS( F16, F16, U8, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F16, U8, KERNEL_SOURCE_2 ) - TENSOR_GROUPNORM_SCALE_KERNELS( U8, F32, U8, KERNEL_SOURCE_2 ) - TENSOR_GROUPNORM_SCALE_KERNELS_2D( U8, F32, U8, KERNEL_SOURCE_2 ) - TENSOR_GROUPNORM_SCALE_KERNELS( U8, F32, F16, KERNEL_SOURCE_5 ) - TENSOR_GROUPNORM_SCALE_KERNELS_2D( U8, F32, F16, KERNEL_SOURCE_5 ) + TENSOR_GROUPNORM_SCALE_KERNELS( U8, F32, U8, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( U8, F32, U8, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS( U8, F32, F16, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( U8, F32, F16, KERNEL_SOURCE_1 ) - TENSOR_GROUPNORM_SCALE_KERNELS( I8, F32, I8, KERNEL_SOURCE_6 ) - TENSOR_GROUPNORM_SCALE_KERNELS_2D( I8, F32, I8, KERNEL_SOURCE_6 ) - TENSOR_GROUPNORM_SCALE_KERNELS( I8, F32, F16, KERNEL_SOURCE_6 ) - TENSOR_GROUPNORM_SCALE_KERNELS_2D( I8, F32, F16, KERNEL_SOURCE_6 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I8, F32, I8, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I8, F32, I8, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I8, F32, F16, KERNEL_SOURCE_0 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I8, F32, F16, KERNEL_SOURCE_0 ) - TENSOR_GROUPNORM_SCALE_KERNELS( I16, F32, I16, KERNEL_SOURCE_7 ) - TENSOR_GROUPNORM_SCALE_KERNELS_2D( I16, F32, I16, KERNEL_SOURCE_7 ) - 
TENSOR_GROUPNORM_SCALE_KERNELS( I16, F32, F16, KERNEL_SOURCE_7 ) - TENSOR_GROUPNORM_SCALE_KERNELS_2D( I16, F32, F16, KERNEL_SOURCE_7 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I16, F32, I16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I16, F32, I16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I16, F32, F16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I16, F32, F16, KERNEL_SOURCE_2 ) - TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, U8, KERNEL_SOURCE_8 ) - TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, U8, KERNEL_SOURCE_8 ) - TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, F16, KERNEL_SOURCE_8 ) - TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, F16, KERNEL_SOURCE_8 ) + TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, U8, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, U8, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, F16, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, F16, KERNEL_SOURCE_2 ) }; /* * Kernel params */ -static vx_param_description_t _groupnorm_sum_sqr_kernel_param_def[] = +static vx_param_description_t _groupnorm_sums_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -208,9 +187,9 @@ static vx_param_description_t _groupnorm_sum_sqr_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; -#define _GROUPNORM_SUM_SQR_PARAM_NUM _cnt_of_array( _groupnorm_sum_sqr_kernel_param_def ) +#define _GROUPNORM_SUMS_PARAM_NUM _cnt_of_array( _groupnorm_sums_kernel_param_def ) -static vx_param_description_t _groupnorm_mean_vari_kernel_param_def[] = +static vx_param_description_t _groupnorm_means_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -218,7 +197,7 @@ static vx_param_description_t _groupnorm_mean_vari_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; -#define _GROUPNORM_MEAN_VARI_PARAM_NUM _cnt_of_array( _groupnorm_mean_vari_kernel_param_def ) +#define _GROUPNORM_MEANS_PARAM_NUM _cnt_of_array( _groupnorm_means_kernel_param_def ) static vx_param_description_t _groupnorm_kernel_param_def[] = { @@ -238,7 +217,7 @@ static vx_param_description_t _groupnorm_kernel_param_def[] = /* * Kernel initializer */ -DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) +DEF_KERNEL_INITIALIZER(_groupnorm_sums_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -255,19 +234,17 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; vsi_size_array_t * input_shape = NULL; - float scaleIn = 1; - int32_t input_zp = 0; - vx_uint32 iter = 0; - int32_t sumInZp = 0; - int32_t tmpZp1 = 0; - float tmpZp2 = 0; - float e2InScale = 0; - float rowSumScale = 0; int32_t is2D = 0; int32_t width = 0; int32_t height = 0; int32_t chn = 0; - float in_scale_fl = 1, inFlScale_s2 = 1; + float input_scale = 1; + float input_scale2 = 1; + float input_zp = 1; + float sum_x_tail = 1; + float sum_x2_tail0 = 1; + float sum_x2_tail1 = 1; + float work_item_pixels = 1; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -277,26 +254,10 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &is2D); 
CHECK_STATUS_FAIL_GOTO(status, OnError ); - input_shape = attr[0]->shape; - - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - in_scale_fl = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - inFlScale_s2 = in_scale_fl * in_scale_fl; - } - + input_shape = attr[0]->shape; + input_scale = attr[0]->scale; + input_scale2 = input_scale * input_scale; + input_zp = (float)attr[0]->zero_point; width = (int32_t)(input_shape->data[0]); height = (int32_t)(input_shape->data[1]); chn = (int32_t)(attr[1]->shape->data[1]); @@ -304,16 +265,12 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) { height = 1; } - iter = height * 16; - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - sumInZp = input_zp * iter * (-1); - tmpZp1 = (-2) * input_zp; - e2InScale = scaleIn * scaleIn; - tmpZp2 = input_zp * input_zp * e2InScale; - rowSumScale = height * 16 * tmpZp2; - } + work_item_pixels = (float)height * 16; + + sum_x_tail = -work_item_pixels * input_zp * input_scale; + sum_x2_tail0 = work_item_pixels * input_zp * input_zp * input_scale2; + sum_x2_tail1 = -2 * input_zp * input_scale2; shaderParam.global_scale[0] = 1; shaderParam.global_scale[1] = 1; @@ -336,9 +293,9 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) status = vsi_nn_kernel_gpu_config( node, &shaderParam ); CHECK_STATUS_FAIL_GOTO(status, OnError); - if (attr[0]->dtype == U8) + if (attr[0]->dtype == U8 || attr[0]->dtype == I8) { - gpu_dp_inst_t uniSumU8_16x1 = {{ + gpu_dp_inst_t uniSumX_16x1 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0xfedcba98, // ABin @@ -347,7 +304,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) 0x00002400, // AccumType, ConstantType, and PostShift 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniSqrSum_16x1 = {{ + gpu_dp_inst_t uniSumX2_16x1 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0xfedcba98, // ABin @@ -356,70 +313,33 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) 0x00000400, // AccumType, ConstantType, and PostShift 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); - status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); - status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale); + status = vsi_nn_kernel_gpu_add_param(node, "uniSumX_16x1", &uniSumX_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSumX2_16x1", &uniSumX2_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale2", &input_scale2); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x_tail", &sum_x_tail); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail0", &sum_x2_tail0); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail1", &sum_x2_tail1); CHECK_STATUS_FAIL_GOTO(status, 
OnError ); } - else if (attr[0]->dtype == I8) + else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) { - gpu_dp_inst_t uniSumInt8_16x1 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0xfedcba98, // ABin - 0xaaaaaaaa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniSqrSumInt8_16x1 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0xfedcba98, // ABin - 0x55555555, // BSelt - 0x76543210, 0xfedcba98, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - - status = vsi_nn_kernel_gpu_add_param(node, "uniSumInt8_16x1", &uniSumInt8_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSumInt8_16x1", &uniSqrSumInt8_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); - status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - else if (attr[0]->dtype == I16) - { - gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{ + gpu_dp_inst_t uniSum_X_X2_8x2 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0x76543210, // ABin - 0x5555aaaa, // BSelt - 0x00000000, 0x76543210, // BBin - 0x00000300, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2", &uniInt16SumSqr_dp8x2); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); - status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - else if (attr[0]->dtype == F16) - { - gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0x76543210, // ABin - 0x5555aaaa, // BSelt + 0x0000aaaa, // BSelt 0x00000000, 0x76543210, // BBin 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + status = vsi_nn_kernel_gpu_add_param(node, "uniSum_X_X2_8x2", &uniSum_X_X2_8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale2", &input_scale2); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x_tail", &sum_x_tail); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail0", &sum_x2_tail0); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail1", &sum_x2_tail1); CHECK_STATUS_FAIL_GOTO(status, OnError ); } @@ -442,7 +362,7 @@ OnError: return status; } -DEF_KERNEL_INITIALIZER(_groupnorm_mean_vari_initializer) +DEF_KERNEL_INITIALIZER(_groupnorm_means_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -523,13 +443,10 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) vsi_nn_kernel_tensor_attr_t* attr[4] = {NULL, NULL, NULL, NULL}; vsi_size_array_t * input_shape = NULL; - float scaleIn = 1.0f; - float scaleOut = 1.0f; - float reScaleOut_u8 = 
1.0f; - float scale_inOut = 1.0f; - int32_t output_zp = 0; - int32_t input_zp = 0; - float in_scale_fl = 1, out_scale_fl = 1, inOut_fl_scale = 1; + float input_scale = 1; + float input_zp = 0; + float output_scale = 1.0f; + float output_zp = 0; int32_t height = 0, width = 0, chn = 0; int32_t is2D = 0; @@ -546,49 +463,10 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); input_shape = attr[0]->shape; - - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - in_scale_fl = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - input_zp = 0; - } - - if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - output_zp = attr[2]->asymm.zero_point; - scaleOut = attr[2]->asymm.scale; - reScaleOut_u8 = 1 / scaleOut; - } - else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[2]->dfp.fl > 0) - { - out_scale_fl = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - out_scale_fl = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - output_zp = 0; - } - - if ((attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - && (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP)) - { - inOut_fl_scale = in_scale_fl * out_scale_fl; - } + input_scale = attr[0]->scale; + input_zp = (float)attr[0]->zero_point; + output_scale = 1.0f / attr[2]->scale; + output_zp = (float)attr[2]->zero_point; width = (int32_t)(input_shape->data[0]); height = (int32_t)(input_shape->data[1]); @@ -624,149 +502,65 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError); { - gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + gpu_dp_inst_t uniDataToFP32_0_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00010000, 0x00030002, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertEndInt16Fp32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00050004, 0x00070006, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00050004, 0x00070006, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 
0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert3rdUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00090008, 0x000b000a, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert4thUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x000d000c, 0x000f000e, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt16Fp32Fst_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000300, // AccumType, ConstantType, and PostShift 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt16Fp32Secd_4x4 = {{ + gpu_dp_inst_t uniDataToFP32_1_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00050004, 0x00070006, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000300, // AccumType, ConstantType, and PostShift 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt32toInt16_2x8 = {{ - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertDirUint8Fp32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertEndUint8Fp32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00050004, 0x00070006, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertTrdUint8Fp32_4x4 = {{ + gpu_dp_inst_t uniDataToFP32_2_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00090008, 0x000b000a, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertFthUint8Fp32_4x4 = {{ + gpu_dp_inst_t uniDataToFP32_3_4x4 = {{ 0x01010101, // 
TCfg 0x00000000, // ASelt 0x000d000c, 0x000f000e, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{ + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ 0x11111111, // TCfg 0x11110000, // ASelt 0x06040200, 0x06040200, // ABin 0x22222222, // BSelt 0x00000000, 0x00000000, // BBin 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; uint32_t pack_key = 0; @@ -775,116 +569,67 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype ); - if (attr[3]->dtype != F32) - { - status = vsi_nn_kernel_gpu_add_param(node, "height", &height); - } - if (!(attr[3]->dtype == F32 && (attr[0]->dtype == I16 || attr[0]->dtype == I8))) - { - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); - } - CHECK_STATUS_FAIL_GOTO(status, OnError ); - switch( pack_key ) { case _PACK_SELECT_KEY( I8, I8 ): + case _PACK_SELECT_KEY( U8, U8 ): + case _PACK_SELECT_KEY( U8, F16 ): case _PACK_SELECT_KEY( I8, F16 ): { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertDirInt8Fp32_4x4", - &uniConvertDirUint8Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt8Fp32_4x4", - &uniConvertEndUint8Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertTrdInt8Fp32_4x4", - &uniConvertTrdUint8Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFthInt8Fp32_4x4", - &uniConvertFthUint8Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); - - status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &out_scale_fl); - status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &inOut_fl_scale); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( U8, U8 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", - &uniConvert3rdUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", - 
&uniConvert4thUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - - status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &reScaleOut_u8); - - scale_inOut = reScaleOut_u8 * scaleIn; - status |= vsi_nn_kernel_gpu_add_param(node, "scale_inOut", &scale_inOut); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( U8, F16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", - &uniConvert3rdUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", - &uniConvert4thUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + if (attr[2]->dtype == F16) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractInteger_2x8); + } + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4", + &uniDataToFP32_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4", + &uniDataToFP32_1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_2_4x4", + &uniDataToFP32_2_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_3_4x4", + &uniDataToFP32_3_4x4); + if (attr[2]->dtype != F16) + { + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + } CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; case _PACK_SELECT_KEY( I16, I16 ): case _PACK_SELECT_KEY( I16, F16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Fst_4x4", - &uniConvertInt16Fp32Fst_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Secd_4x4", - &uniConvertInt16Fp32Secd_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); - - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toInt16_2x8", - &uniConvertInt32toInt16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &out_scale_fl); - - status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &inOut_fl_scale); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; case _PACK_SELECT_KEY( F16, F16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt16Fp32_4x4", - &uniConvertEndInt16Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; + case _PACK_SELECT_KEY( F16, I16 ): case _PACK_SELECT_KEY( F16, U8 ): + case _PACK_SELECT_KEY( F16, I8 ): { - status = vsi_nn_kernel_gpu_add_param(node, 
"uniConvertEndInt16Fp32_4x4", - &uniConvertEndInt16Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &reScaleOut_u8); + if (attr[2]->dtype == F16) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractInteger_2x8); + } + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4", + &uniDataToFP32_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4", + &uniDataToFP32_1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; @@ -941,19 +686,19 @@ static vsi_status _query_kernel switch( kernel_id ) { - case INTERNAL_KERNEL_SUM_SQR: - initializer = _groupnorm_sum_sqr_initializer; - kernel_map = _groupnorm_sum_sqr_kernel_map; - kernel_map_size = _cnt_of_array( _groupnorm_sum_sqr_kernel_map ); - param_def = _groupnorm_sum_sqr_kernel_param_def; - param_size = _GROUPNORM_SUM_SQR_PARAM_NUM; + case INTERNAL_KERNEL_SUMS: + initializer = _groupnorm_sums_initializer; + kernel_map = _groupnorm_sums_kernel_map; + kernel_map_size = _cnt_of_array( _groupnorm_sums_kernel_map ); + param_def = _groupnorm_sums_kernel_param_def; + param_size = _GROUPNORM_SUMS_PARAM_NUM; break; - case INTERNAL_KERNEL_MEAN_VARI: - initializer = _groupnorm_mean_vari_initializer; + case INTERNAL_KERNEL_MEANS: + initializer = _groupnorm_means_initializer; kernel_map = _groupnorm_mean_vari_kernel_map; kernel_map_size = _cnt_of_array( _groupnorm_mean_vari_kernel_map ); - param_def = _groupnorm_mean_vari_kernel_param_def; - param_size = _GROUPNORM_MEAN_VARI_PARAM_NUM; + param_def = _groupnorm_means_kernel_param_def; + param_size = _GROUPNORM_MEANS_PARAM_NUM; break; case INTERNAL_KERNEL_NORM: initializer = _groupnorm_initializer; @@ -1008,8 +753,8 @@ static vsi_nn_kernel_node_t _setup #define SUM_SQR_INDEX (0) #define MEAN_VARI_INDEX (1) vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t sum_sqr_node_params[_GROUPNORM_SUM_SQR_PARAM_NUM] = { NULL }; - vsi_nn_kernel_node_param_t mean_vari_node_params[_GROUPNORM_MEAN_VARI_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t sums_node_params[_GROUPNORM_SUMS_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t means_node_params[_GROUPNORM_MEANS_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_param_t node_params[_GROUPNORM_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t tmp_node = NULL, tmp_node1 = NULL; vsi_nn_kernel_node_t node = NULL; @@ -1026,9 +771,9 @@ static vsi_nn_kernel_node_t _setup uint32_t hashkey = 0; int32_t i = 0; float rSpaceOrg = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1]); - float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); - int32_t group_num = vsi_nn_kernel_param_get_int32( params, "group_num" ); - vsi_size_t group_size = inputs[0]->attr.size[2] / group_num; + float eps = vsi_nn_kernel_param_get_float32(params, "eps"); + int32_t group_num = vsi_nn_kernel_param_get_int32( params, "group_num" ); + vsi_size_t group_size = inputs[0]->attr.size[2] / group_num; float group_ratio = 1.0f / 
(inputs[0]->attr.size[0] * inputs[0]->attr.size[1] * group_size); // Check if gpu can support the size @@ -1038,7 +783,7 @@ static vsi_nn_kernel_node_t _setup return NULL; } - status = vsi_nn_kernel_optimize_group_norm_shape( (const vsi_size_t*)inputs[0]->attr.size, + status = vsi_nn_kernel_optimize_group_norm_shape( (const vsi_size_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, group_num, 0, new_shape); if ( VSI_SUCCESS != status ) { @@ -1048,7 +793,7 @@ static vsi_nn_kernel_node_t _setup rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4); rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4); - for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + for ( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) { ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); // Assign unique_id @@ -1059,16 +804,16 @@ static vsi_nn_kernel_node_t _setup in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - hashkeys[SUM_SQR_INDEX]= HASH_GROUPNORM_SUM_SQR_KEY( in0_dtype, F32, is2D_flg ); - hashkeys[MEAN_VARI_INDEX]= HASH_GROUPNORM_MEAN_VARI_KEY( F32, F32 ); + hashkeys[SUM_SQR_INDEX]= HASH_GROUPNORM_SUMS_KEY( in0_dtype, F32, is2D_flg ); + hashkeys[MEAN_VARI_INDEX]= HASH_GROUPNORM_MEANS_KEY( F32, F32 ); hashkey = HASH_GROUPNORM_KEY( in0_dtype, in2_dtype, out_dtype, is2D_flg ); - status = _query_kernel( ikernels[SUM_SQR_INDEX], hashkeys[SUM_SQR_INDEX], INTERNAL_KERNEL_SUM_SQR ); + status = _query_kernel( ikernels[SUM_SQR_INDEX], hashkeys[SUM_SQR_INDEX], INTERNAL_KERNEL_SUMS ); if ( VSI_SUCCESS != status ) { goto final; } - status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI ); + status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEANS ); if ( VSI_SUCCESS != status ) { goto final; @@ -1103,26 +848,21 @@ static vsi_nn_kernel_node_t _setup if (tmp_node) { uint32_t index = 0; - sum_sqr_node_params[index++] = rs_input; - sum_sqr_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t; - sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); - sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is2D_flg ); + sums_node_params[index++] = rs_input; + sums_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t; + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is2D_flg ); - status = vsi_nn_kernel_node_pass_param( tmp_node, sum_sqr_node_params, - _GROUPNORM_SUM_SQR_PARAM_NUM ); + status = vsi_nn_kernel_node_pass_param( tmp_node, sums_node_params, + _GROUPNORM_SUMS_PARAM_NUM ); CHECK_STATUS(status); - vsi_nn_kernel_scalar_release( &sum_sqr_node_params[2] ); - vsi_nn_kernel_scalar_release( &sum_sqr_node_params[3] ); + vsi_nn_kernel_scalar_release( &sums_node_params[2] ); + vsi_nn_kernel_scalar_release( &sums_node_params[3] ); { // Set default border mode. 
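/* [Editorial note, not part of the patch] The border setup below changes behaviour slightly: the
 * removed code wrote the zero point into border.constant_value.U8 only for VSI_NN_TYPE_UINT8
 * inputs, whereas vsi_nn_Float32ToDtype(0, ...) encodes real-value 0.0 in whatever dtype and
 * quantization the input actually uses (for asymmetric U8/I8 that is again the zero point), so
 * constant padding is handled consistently for all supported input types. */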
vx_border_t border; border.mode = VX_BORDER_CONSTANT; - border.constant_value.U8 = 0; - border.constant_value.U16 = 0; - if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) - { - border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); - } + vsi_nn_Float32ToDtype(0, (uint8_t*)&border.constant_value.U32, &inputs[0]->attr.dtype); status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); } @@ -1133,26 +873,21 @@ static vsi_nn_kernel_node_t _setup if (tmp_node1) { uint32_t index = 0; - mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t; - mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &group_ratio ); + means_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t; + means_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; + means_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + means_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &group_ratio ); - status = vsi_nn_kernel_node_pass_param( tmp_node1, mean_vari_node_params, - _GROUPNORM_MEAN_VARI_PARAM_NUM ); + status = vsi_nn_kernel_node_pass_param( tmp_node1, means_node_params, + _GROUPNORM_MEANS_PARAM_NUM ); CHECK_STATUS(status); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[2] ); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[3] ); + vsi_nn_kernel_scalar_release( &means_node_params[2] ); + vsi_nn_kernel_scalar_release( &means_node_params[3] ); { // Set default border mode. vx_border_t border; border.mode = VX_BORDER_CONSTANT; - border.constant_value.U8 = 0; border.constant_value.U16 = 0; - if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) - { - border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); - } status = vxSetNodeAttribute( (vx_node)tmp_node1, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); } @@ -1186,19 +921,6 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[6] ); vsi_nn_kernel_scalar_release( &node_params[7] ); vsi_nn_kernel_scalar_release( &node_params[8] ); - { - // Set default border mode. - vx_border_t border; - border.mode = VX_BORDER_CONSTANT; - border.constant_value.U8 = 0; - border.constant_value.U16 = 0; - if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) - { - border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); - } - status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); - CHECK_STATUS(status); - } } /* Pass parameters to node. 
*/ diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c index 69057be..510069b 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c @@ -44,7 +44,7 @@ __BEGIN_DECLS typedef enum _grucell_nn_activation_type_e { SIGMOID = VSI_NN_ACT_SIGMOID, - HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, + HSIGMOID = VSI_NN_ACT_HARD_SIGMOID, }grucell_nn_activation_type_e; #define _GRUCELL_ACTIVATION_Z_H_KERNEL_SOURCE "grucell_activation_z_h" @@ -72,6 +72,10 @@ static const _kernel_map_type _grucell_activation_z_h_kernel_map[] = PACK_KERNEL_MAP( I8, F16, I8, SIGMOID ), PACK_KERNEL_MAP( I16, F16, I16, SIGMOID ), PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ), + PACK_KERNEL_MAP( U8, F16, U8, HSIGMOID ), + PACK_KERNEL_MAP( I8, F16, I8, HSIGMOID ), + PACK_KERNEL_MAP( I16, F16, I16, HSIGMOID ), + PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ), }; /* diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c index 5ba28e6..8522000 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include #include @@ -45,7 +44,7 @@ __BEGIN_DECLS typedef enum _grucell_nn_activation_type_e { SIGMOID = VSI_NN_ACT_SIGMOID, - HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, + HSIGMOID = VSI_NN_ACT_HARD_SIGMOID, }grucell_nn_activation_type_e; #define _GRUCELL_H_TIMES_ACTIVATION_R_KERNEL_SOURCE "grucell_h_times_activation_r" @@ -72,9 +71,12 @@ static const _kernel_map_type _grucell_h_times_activation_r_kernel_map[] = PACK_KERNEL_MAP( I8, F16, F16, SIGMOID ), PACK_KERNEL_MAP( I16, F16, F16, SIGMOID ), PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ), + PACK_KERNEL_MAP( U8, F16, F16, HSIGMOID ), + PACK_KERNEL_MAP( I8, F16, F16, HSIGMOID ), + PACK_KERNEL_MAP( I16, F16, F16, HSIGMOID ), + PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ), }; - /* * Kernel params */ @@ -256,8 +258,6 @@ final: return status; } /* _grucell_h_times_activation_r_initializer() */ - - /* * Query kernel */ @@ -313,7 +313,6 @@ static vsi_status _query_kernel return status; } /* _query_kernel() */ - static vsi_nn_kernel_node_t _setup ( vsi_nn_graph_t * graph, diff --git a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c index 4f3367e..f641e10 100644 --- a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c @@ -35,7 +35,8 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_dtype_util.h" __BEGIN_DECLS @@ -45,67 +46,46 @@ __BEGIN_DECLS typedef enum { - INTERNAL_KERNEL_MEAN_VARI, + INTERNAL_KERNEL_SUMS, INTERNAL_KERNEL_NORM, } _internal_kernel_e; -#define KERNEL_SOURCE_1 "instance_normalization_i8" -#define KERNEL_SOURCE_2 "instance_normalization_u8" -#define KERNEL_SOURCE_3 "instance_normalization_i16" -#define KERNEL_SOURCE_4 "instance_normalization_f16" -#define KERNEL_SOURCE_5 "instance_normalization_u8_f16" -#define KERNEL_SOURCE_6 "instance_normalization_scale_f32" -#define 
KERNEL_SOURCE_7 "instance_normalization_scale_f32_f16" -#define KERNEL_SOURCE_8 "instance_normalization_scale_f32_bf16" +#define KERNEL_SOURCE_0 "instance_normalization_0" +#define KERNEL_SOURCE_1 "instance_normalization_1" +#define KERNEL_SOURCE_2 "instance_normalization_2" +#define KERNEL_SOURCE_3 "instance_normalization_3" -#define HASH_INSTANCENORM_MEAN_VARI_SH_KERNEL_NAME(SRC0_TYPE) \ - CVIVANTE_NAMESPACE("evis.instance_norm_meanvari_"#SRC0_TYPE) +#define HASH_INSTANCENORM_SUMS_SH_KERNEL_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("evis.instance_norm_sums_"#SRC0_TYPE) -#define HASH_INSTANCENORM_MEAN_VARI_SH_KERNEL_2D_NAME(SRC0_TYPE) \ - CVIVANTE_NAMESPACE("evis.instance_norm_meanvari_"#SRC0_TYPE"_2D") - -#define HASH_INSTANCENORM_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"to"#DST_TYPE) - -#define HASH_INSTANCENORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") +#define HASH_INSTANCENORM_SUMS_SH_KERNEL_2D_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("evis.instance_norm_sums_"#SRC0_TYPE"_2D") #define HASH_INSTANCENORM_SCALE_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"F32to"#DST_TYPE) + CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"_F32to"#DST_TYPE) #define HASH_INSTANCENORM_SCALE_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"F32to"#DST_TYPE"_2D") + CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"_F32to"#DST_TYPE"_2D") // Add kernel hashtable here -// mean vari -#define HASH_INSTANCENORM_MEAN_VARI_KEY(_input0_type, _output_type, _reshape_flag) \ +#define HASH_INSTANCENORM_SUMS_KEY(_input0_type, _output_type, _reshape_flag) \ ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) -#define TENSOR_INSTANCENORM_MEAN_VARI_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_INSTANCENORM_MEAN_VARI_KEY(IN0_TYPE, OUT_TYPE, 0), \ - HASH_INSTANCENORM_MEAN_VARI_SH_KERNEL_NAME(IN0_TYPE), \ +#define TENSOR_INSTANCENORM_SUMS_KERNELS_3D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_INSTANCENORM_SUMS_SH_KERNEL_NAME(IN0_TYPE), \ SOURCE }, -#define TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_INSTANCENORM_MEAN_VARI_KEY(IN0_TYPE, OUT_TYPE, 1), \ - HASH_INSTANCENORM_MEAN_VARI_SH_KERNEL_2D_NAME(IN0_TYPE), \ +#define TENSOR_INSTANCENORM_SUMS_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_INSTANCENORM_SUMS_SH_KERNEL_2D_NAME(IN0_TYPE), \ SOURCE }, // normalization #define HASH_INSTANCENORM_KEY(_input0_type, _input1_type, _output_type, _reshape_flag) \ ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_reshape_flag << 4)) -#define TENSOR_INSTANCENORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_INSTANCENORM_KEY(IN0_TYPE, F16, OUT_TYPE, 0), \ - HASH_INSTANCENORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ - SOURCE }, - -#define TENSOR_INSTANCENORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_INSTANCENORM_KEY(IN0_TYPE, F16, OUT_TYPE, 1), \ - HASH_INSTANCENORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ - SOURCE }, - -#define TENSOR_INSTANCENORM_SCALE_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ +#define TENSOR_INSTANCENORM_SCALE_KERNELS_3D(IN0_TYPE, OUT_TYPE, SOURCE) \ { HASH_INSTANCENORM_KEY(IN0_TYPE, F32, OUT_TYPE, 0), \ HASH_INSTANCENORM_SCALE_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, @@ -122,62 +102,57 @@ typedef struct const char * source_name; 
} _kernel_map_type; -static const _kernel_map_type _instancenorm_mean_vari_kernel_map[] = +static const _kernel_map_type _instancenorm_sums_kernel_map[] = { // Register kernel here - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( I8, F32, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( I8, F32, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( U8, F32, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( U8, F32, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( I16, F32, KERNEL_SOURCE_3 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( I16, F32, KERNEL_SOURCE_3 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( F16, F32, KERNEL_SOURCE_4 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( F16, F32, KERNEL_SOURCE_4 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( BF16, F32, KERNEL_SOURCE_8 ) - TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( BF16, F32, KERNEL_SOURCE_8 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_3D( I8, F32, KERNEL_SOURCE_0 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_2D( I8, F32, KERNEL_SOURCE_0 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_3D( U8, F32, KERNEL_SOURCE_0 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_2D( U8, F32, KERNEL_SOURCE_0 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_3D( I16, F32, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_2D( I16, F32, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_3D( F16, F32, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_2D( F16, F32, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_3D( BF16, F32, KERNEL_SOURCE_3 ) + TENSOR_INSTANCENORM_SUMS_KERNELS_2D( BF16, F32, KERNEL_SOURCE_3 ) }; static const _kernel_map_type _instancenorm_kernel_map[] = { // Register kernel here - TENSOR_INSTANCENORM_KERNELS( I8, I8, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_KERNELS_2D( I8, I8, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_KERNELS( I8, F16, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_KERNELS_2D( I8, F16, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_KERNELS( U8, U8, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_KERNELS_2D( U8, U8, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_KERNELS( U8, F16, KERNEL_SOURCE_5 ) - TENSOR_INSTANCENORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_5 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( U8, U8, KERNEL_SOURCE_0 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( U8, U8, KERNEL_SOURCE_0 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( I8, I8, KERNEL_SOURCE_0 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I8, I8, KERNEL_SOURCE_0 ) - TENSOR_INSTANCENORM_KERNELS( I16, I16, KERNEL_SOURCE_3 ) - TENSOR_INSTANCENORM_KERNELS_2D( I16, I16, KERNEL_SOURCE_3 ) - TENSOR_INSTANCENORM_KERNELS( I16, F16, KERNEL_SOURCE_3 ) - TENSOR_INSTANCENORM_KERNELS_2D( I16, F16, KERNEL_SOURCE_3 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( U8, F16, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( U8, F16, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( I8, F16, KERNEL_SOURCE_1 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I8, F16, KERNEL_SOURCE_1 ) - TENSOR_INSTANCENORM_KERNELS( F16, F16, KERNEL_SOURCE_4 ) - TENSOR_INSTANCENORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_4 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( I16, I16, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I16, I16, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( F16, F16, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( F16, F16, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_SCALE_KERNELS( U8, U8, KERNEL_SOURCE_6 ) - TENSOR_INSTANCENORM_SCALE_KERNELS_2D( U8, U8, KERNEL_SOURCE_6 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( I16, F16, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I16, F16, KERNEL_SOURCE_2 ) + 
TENSOR_INSTANCENORM_SCALE_KERNELS_3D( F16, I16, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( F16, I16, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( F16, I8, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( F16, I8, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( F16, U8, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( F16, U8, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_SCALE_KERNELS( I8, I8, KERNEL_SOURCE_6 ) - TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I8, I8, KERNEL_SOURCE_6 ) - - TENSOR_INSTANCENORM_SCALE_KERNELS( I16, I16, KERNEL_SOURCE_6 ) - TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I16, I16, KERNEL_SOURCE_6 ) - - TENSOR_INSTANCENORM_SCALE_KERNELS( F16, F16, KERNEL_SOURCE_7 ) - TENSOR_INSTANCENORM_SCALE_KERNELS_2D( F16, F16, KERNEL_SOURCE_7 ) - - TENSOR_INSTANCENORM_SCALE_KERNELS( BF16, BF16, KERNEL_SOURCE_8 ) - TENSOR_INSTANCENORM_SCALE_KERNELS_2D( BF16, BF16, KERNEL_SOURCE_8 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_3D( BF16, BF16, KERNEL_SOURCE_3 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( BF16, BF16, KERNEL_SOURCE_3 ) }; /* * Kernel params */ -static vx_param_description_t _instancenorm_mean_vari_kernel_param_def[] = +static vx_param_description_t _instancenorm_sums_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -185,7 +160,7 @@ static vx_param_description_t _instancenorm_mean_vari_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; -#define _INSTANCENORM_MEAN_VARI_PARAM_NUM _cnt_of_array( _instancenorm_mean_vari_kernel_param_def ) +#define _INSTANCENORM_SUMS_PARAM_NUM _cnt_of_array( _instancenorm_sums_kernel_param_def ) static vx_param_description_t _instancenorm_kernel_param_def[] = { @@ -203,7 +178,7 @@ static vx_param_description_t _instancenorm_kernel_param_def[] = /* * Kernel initializer */ -DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) +DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -220,65 +195,44 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; vsi_size_array_t * input_shape = NULL; - float scaleIn = 1; - int32_t input_zp = 0; - vx_uint32 iter = 0; - int32_t sumInZp = 0; - int32_t tmpZp1 = 0; - float tmpZp2 = 0; - float e2InScale = 0; - float rowSumScale = 0; - int32_t rsFlg = 0; + int32_t rs_flag = 0; int32_t width = 0; int32_t height = 0; int32_t chn = 0; - float in_scale_fl = 1, inFlScale_s2 = 1; + float input_scale = 1; + float input_scale2 = 1; + float input_zp = 1; + float sum_x_tail = 1; + float sum_x2_tail0 = 1; + float sum_x2_tail1 = 1; + float work_item_pixels = 1; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &rsFlg); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &rs_flag); CHECK_STATUS_FAIL_GOTO(status, OnError ); - input_shape = attr[0]->shape; - - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if 
(attr[0]->dfp.fl > 0) - { - in_scale_fl = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - inFlScale_s2 = in_scale_fl * in_scale_fl; - } + input_shape = attr[0]->shape; + input_scale = attr[0]->scale; + input_scale2 = input_scale * input_scale; + input_zp = (float)attr[0]->zero_point; width = (int32_t)(input_shape->data[0]); height = (int32_t)(input_shape->data[1]); chn = (int32_t)(attr[1]->shape->data[1]); - if (rsFlg) + if (rs_flag) { height = height / chn; } - iter = height * 16; - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - sumInZp = input_zp * iter * (-1); - tmpZp1 = (-2) * input_zp; - e2InScale = scaleIn * scaleIn; - tmpZp2 = input_zp * input_zp * e2InScale; - rowSumScale = height * 16 * tmpZp2; - } + work_item_pixels = (float)height * 16; + + sum_x_tail = -work_item_pixels * input_zp * input_scale; + sum_x2_tail0 = work_item_pixels * input_zp * input_zp * input_scale2; + sum_x2_tail1 = -2 * input_zp * input_scale2; shaderParam.global_scale[0] = 1; shaderParam.global_scale[1] = 1; @@ -301,9 +255,9 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) status = vsi_nn_kernel_gpu_config( node, &shaderParam ); CHECK_STATUS_FAIL_GOTO(status, OnError); - if (attr[0]->dtype == U8) + if (attr[0]->dtype == U8 || attr[0]->dtype == I8) { - gpu_dp_inst_t uniSumU8_16x1 = {{ + gpu_dp_inst_t uniSumX_16x1 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0xfedcba98, // ABin @@ -312,36 +266,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) 0x00002400, // AccumType, ConstantType, and PostShift 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniSqrSum_16x1 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0xfedcba98, // ABin - 0x55555555, // BSelt - 0x76543210, 0xfedcba98, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); - status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); - status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - else if (attr[0]->dtype == I8) - { - gpu_dp_inst_t uniSumInt8_16x1 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0xfedcba98, // ABin - 0xaaaaaaaa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniSqrSumInt8_16x1 = {{ + gpu_dp_inst_t uniSumX2_16x1 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0xfedcba98, // ABin @@ -351,40 +276,33 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniSumInt8_16x1", &uniSumInt8_16x1); - status |= 
vsi_nn_kernel_gpu_add_param(node, "uniSqrSumInt8_16x1", &uniSqrSumInt8_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); - status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2); + status = vsi_nn_kernel_gpu_add_param(node, "uniSumX_16x1", &uniSumX_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSumX2_16x1", &uniSumX2_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale2", &input_scale2); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x_tail", &sum_x_tail); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail0", &sum_x2_tail0); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail1", &sum_x2_tail1); CHECK_STATUS_FAIL_GOTO(status, OnError ); } - else if (attr[0]->dtype == I16) + else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) { - gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{ + gpu_dp_inst_t uniSum_X_X2_8x2 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0x76543210, // ABin - 0x5555aaaa, // BSelt - 0x00000000, 0x76543210, // BBin - 0x00000300, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2", &uniInt16SumSqr_dp8x2); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); - status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - else if (attr[0]->dtype == F16) - { - gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0x76543210, // ABin - 0x5555aaaa, // BSelt + 0x0000aaaa, // BSelt 0x00000000, 0x76543210, // BBin 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + status = vsi_nn_kernel_gpu_add_param(node, "uniSum_X_X2_8x2", &uniSum_X_X2_8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale2", &input_scale2); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x_tail", &sum_x_tail); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail0", &sum_x2_tail0); + status |= vsi_nn_kernel_gpu_add_param(node, "sum_x2_tail1", &sum_x2_tail1); CHECK_STATUS_FAIL_GOTO(status, OnError ); } else if (attr[0]->dtype == BF16) @@ -450,15 +368,14 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) vsi_nn_kernel_tensor_attr_t* attr[4] = {NULL, NULL}; vsi_size_array_t * input_shape = NULL; - float scaleIn = 1.0f; - float scaleOut = 1.0f; - float scale_inOut = 1.0f; - int32_t output_zp = 0; - int32_t input_zp = 0; - float dimRatio = 0; + float input_scale = 1; + float output_scale = 1; + float input_zp = 0; + float output_zp = 0; + float inv_multiplier = 0; vx_uint32 group_num = 0; vx_int32 height = 0, width = 0, chn = 0; - int32_t rsFlg = 0; + int32_t rs_flag = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -469,59 +386,24 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) 
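/* [Editorial note, not part of the patch] The normalization-pass initializer below gets the same
 * treatment as the sums pass: the ASYMM/DFP branches collapse into attr->scale / attr->zero_point.
 * output_scale is stored as the reciprocal (1.0f / attr[3]->scale), presumably so the shader can
 * requantize the normalized result y with a single multiply, q_out = y * output_scale + output_zp,
 * and inv_multiplier (previously dimRatio) is the 1 / (width * height) averaging factor used to
 * turn the accumulated sums into mean and variance. */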
attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &rsFlg); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &rs_flag); CHECK_STATUS_FAIL_GOTO(status, OnError ); input_shape = attr[0]->shape; - - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - input_zp = 0; - } - - if (attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - output_zp = attr[3]->asymm.zero_point; - scaleOut = attr[3]->asymm.scale; - scaleOut = 1 / scaleOut; - } - else if (attr[3]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[3]->dfp.fl > 0) - { - scaleOut = (float)((int64_t)1 << attr[3]->dfp.fl); - } - else - { - scaleOut = (1.0f / (float)((int64_t)1 << -attr[3]->dfp.fl)); - } - output_zp = 0; - } - - scale_inOut = scaleIn * scaleOut; + input_scale = attr[0]->scale; + input_zp = (float)attr[0]->zero_point; + output_scale = 1.0f / attr[3]->scale; + output_zp = (float)attr[3]->zero_point; width = (int32_t)(input_shape->data[0]); height = (int32_t)(input_shape->data[1]); chn = (int32_t)(attr[2]->shape->data[1]); - if (rsFlg) + if (rs_flag) { height = height / chn; } - dimRatio = (float)(1.0 / (width * height)); + inv_multiplier = (float)(1.0 / (width * height)); group_num = (width + 255) / 256; @@ -544,151 +426,66 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError); { - gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + gpu_dp_inst_t uniDataToFP32_0_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00010000, 0x00030002, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertEndInt16Fp32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00050004, 0x00070006, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00050004, 0x00070006, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // 
AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert3rdUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00090008, 0x000b000a, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert4thUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x000d000c, 0x000f000e, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt16Fp32Fst_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000300, // AccumType, ConstantType, and PostShift 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt16Fp32Secd_4x4 = {{ + gpu_dp_inst_t uniDataToFP32_1_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00050004, 0x00070006, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000300, // AccumType, ConstantType, and PostShift 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt32toInt16_2x8 = {{ - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertDirUint8Fp32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertEndUint8Fp32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00050004, 0x00070006, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertTrdUint8Fp32_4x4 = {{ + gpu_dp_inst_t uniDataToFP32_2_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00090008, 0x000b000a, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertFthUint8Fp32_4x4 = {{ + 
gpu_dp_inst_t uniDataToFP32_3_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x000d000c, 0x000f000e, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{ + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ 0x11111111, // TCfg 0x11110000, // ASelt 0x06040200, 0x06040200, // ABin 0x22222222, // BSelt 0x00000000, 0x00000000, // BBin 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ 0x11111111, // TCfg 0x01010101, // ASelt @@ -721,151 +518,77 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) }, GPU_DP_TYPE_16}; uint32_t pack_key = 0; -#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ - (IN0_TYPE | (IN1_TYPE << 8) | (OUT_TYPE << 16)) +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ + (IN0_TYPE | (OUT_TYPE << 16)) - pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, attr[3]->dtype ); + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[3]->dtype ); status = vsi_nn_kernel_gpu_add_param(node, "height", &height); - status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio); + status |= vsi_nn_kernel_gpu_add_param(node, "inv_multiplier", &inv_multiplier); status |= vsi_nn_kernel_gpu_add_param(node, "group_num", &group_num); CHECK_STATUS_FAIL_GOTO(status, OnError ); switch( pack_key ) { - case _PACK_SELECT_KEY( I8, F16, I8 ): - case _PACK_SELECT_KEY( I8, F16, F16 ): + case _PACK_SELECT_KEY( U8, F16 ): + case _PACK_SELECT_KEY( I8, F16 ): + case _PACK_SELECT_KEY( U8, U8 ): + case _PACK_SELECT_KEY( I8, I8 ): { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertDirInt8Fp32_4x4", - &uniConvertDirUint8Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt8Fp32_4x4", - &uniConvertEndUint8Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertTrdInt8Fp32_4x4", - &uniConvertTrdUint8Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFthInt8Fp32_4x4", - &uniConvertFthUint8Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", - &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &scaleIn); - - status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &scaleOut); - status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &scale_inOut); + if (attr[3]->dtype == F16) + { + status = 
vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractInteger_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + } + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4", + &uniDataToFP32_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4", + &uniDataToFP32_1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_2_4x4", + &uniDataToFP32_2_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_3_4x4", + &uniDataToFP32_3_4x4); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( U8, F16, U8 ): + case _PACK_SELECT_KEY( I16, F16 ): + case _PACK_SELECT_KEY( F16, F16 ): + case _PACK_SELECT_KEY( I16, I16 ): + case _PACK_SELECT_KEY( F16, I16 ): + case _PACK_SELECT_KEY( F16, U8 ): + case _PACK_SELECT_KEY( F16, I8 ): { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", - &uniConvert3rdUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", - &uniConvert4thUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", - &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - - status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); - status |= vsi_nn_kernel_gpu_add_param(node, "scale_inOut", &scale_inOut); + if (attr[3]->dtype == F16) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractInteger_2x8); + } + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4", + &uniDataToFP32_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4", + &uniDataToFP32_1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( U8, F32, U8 ): - case _PACK_SELECT_KEY( I8, F32, I8 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", - 
&uniConvert3rdUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", - &uniConvert4thUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - - status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); - status |= vsi_nn_kernel_gpu_add_param(node, "scale_inOut", &scale_inOut); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( U8, F16, F16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", - &uniConvert3rdUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", - &uniConvert4thUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", - &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( I16, F16, I16 ): - case _PACK_SELECT_KEY( I16, F16, F16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Fst_4x4", - &uniConvertInt16Fp32Fst_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Secd_4x4", - &uniConvertInt16Fp32Secd_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &scaleIn); - - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toInt16_2x8", - &uniConvertInt32toInt16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", - &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &scaleOut); - status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &scale_inOut); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( I16, F32, I16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Fst_4x4", - &uniConvertInt16Fp32Fst_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Secd_4x4", - &uniConvertInt16Fp32Secd_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toInt16_2x8", - &uniConvertInt32toInt16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &scaleOut); - status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &scale_inOut); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( F16, F16, F16 ): - case _PACK_SELECT_KEY( F16, F32, F16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt16Fp32_4x4", - &uniConvertEndInt16Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", - &UniFP16toFP32Lo4_dp4x4); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( BF16, F32, BF16 ): + case _PACK_SELECT_KEY( BF16, BF16 ): { status = vsi_nn_kernel_gpu_add_param( node, "uniConvBF16toF32_Part0_2x8", 
&uniConvBF16toF32_Part0_2x8 ); @@ -927,14 +650,14 @@ static vsi_status _query_kernel size_t param_size = 0; uint32_t i = 0; - switch( kernel_id ) + switch ( kernel_id ) { - case INTERNAL_KERNEL_MEAN_VARI: - initializer = _instancenorm_mean_vari_initializer; - kernel_map = _instancenorm_mean_vari_kernel_map; - kernel_map_size = _cnt_of_array( _instancenorm_mean_vari_kernel_map ); - param_def = _instancenorm_mean_vari_kernel_param_def; - param_size = _INSTANCENORM_MEAN_VARI_PARAM_NUM; + case INTERNAL_KERNEL_SUMS: + initializer = _instancenorm_sums_initializer; + kernel_map = _instancenorm_sums_kernel_map; + kernel_map_size = _cnt_of_array( _instancenorm_sums_kernel_map ); + param_def = _instancenorm_sums_kernel_param_def; + param_size = _INSTANCENORM_SUMS_PARAM_NUM; break; case INTERNAL_KERNEL_NORM: initializer = _instancenorm_initializer; @@ -948,7 +671,7 @@ static vsi_status _query_kernel return VSI_FAILURE; } - for( i = 0; i < kernel_map_size; i ++ ) + for ( i = 0; i < kernel_map_size; i ++ ) { if ( kernel_map[i].key == hashkey ) { @@ -989,7 +712,7 @@ static vsi_nn_kernel_node_t _setup #define INTERNAL_KERNEL_SIZE (1) #define MEAN_VARI_INDEX (0) vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t mean_vari_node_params[_INSTANCENORM_MEAN_VARI_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t sums_node_params[_INSTANCENORM_SUMS_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_param_t node_params[_INSTANCENORM_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t tmp_node = NULL; vsi_nn_kernel_node_t node = NULL; @@ -1004,14 +727,53 @@ static vsi_nn_kernel_node_t _setup uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; uint32_t hashkey = 0; int32_t i = 0; + int32_t axis[VSI_NN_MAX_DIM_NUM] = {0, 1}; + int32_t axis_num = 2; + int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t new_shape[VSI_NN_MAX_DIM_NUM] = { 1 }; + uint32_t axis_size = 0; uint32_t rank = outputs[0]->attr.dim_num; + vsi_nn_tensor_t *reshape_tensor[2] = {NULL}; float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); - int32_t reshape_flg = outputs[0]->attr.size[1] * outputs[0]->attr.size[2] < GPU_TENSOR_MAX_WIDTH - && rank > 2; + int32_t reshape_flg = 0; + vsi_size_t batch = 1; + vsi_bool ret = FALSE; + ret = vsi_nn_kernel_optimize_tensor_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + axis, axis_num, new_shape, &rank, new_axis, &axis_size); + if ( ret == FALSE || axis_size > 2 ) + { + return NULL; + } + + for (i = 3; i < (int32_t)inputs[0]->attr.dim_num; i++) + { + batch *= inputs[0]->attr.size[i]; + } + + if (axis_size == 1) + { + for (i = rank; i > 1; i--) + { + new_shape[i] = new_shape[i - 1]; + } + new_shape[1] = 1; + rank ++; + } + new_shape[2] = rank == 2 ? 
1 : new_shape[2] / batch; + new_shape[3] = batch; + rank = 4; + + reshape_tensor[0] = vsi_nn_reshape_tensor( graph, + inputs[0], new_shape, rank ); + reshape_tensor[1] = vsi_nn_reshape_tensor( graph, + outputs[0], new_shape, rank ); + + reshape_flg = rank > 2 && new_shape[1] * new_shape[2] < GPU_TENSOR_MAX_WIDTH; // Check if gpu can support the size if ( !vsi_nn_kernel_gpu_check_shape( - outputs[0]->attr.size, outputs[0]->attr.dim_num ) || + reshape_tensor[1]->attr.size, reshape_tensor[1]->attr.dim_num ) || rank > 4 ) { return NULL; @@ -1024,14 +786,15 @@ static vsi_nn_kernel_node_t _setup ikernels[i]->unique_id = kernel->unique_id; } - in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in0_dtype = vsi_nn_kernel_map_dtype( reshape_tensor[0]->attr.dtype.vx_type ); in1_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); - out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( reshape_tensor[1]->attr.dtype.vx_type ); + in1_dtype = in1_dtype == F16 ? F32 : in1_dtype; - hashkeys[MEAN_VARI_INDEX]= HASH_INSTANCENORM_MEAN_VARI_KEY( in0_dtype, F32, reshape_flg ); + hashkeys[MEAN_VARI_INDEX]= HASH_INSTANCENORM_SUMS_KEY( in0_dtype, F32, reshape_flg ); hashkey = HASH_INSTANCENORM_KEY( in0_dtype, in1_dtype, out_dtype, reshape_flg ); - status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI ); + status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_SUMS ); if ( VSI_SUCCESS != status ) { goto final; @@ -1044,34 +807,27 @@ static vsi_nn_kernel_node_t _setup if (reshape_flg) { - shape[0] = inputs[0]->attr.size[0]; - shape[1] = inputs[0]->attr.size[1] * inputs[0]->attr.size[2]; + shape[0] = new_shape[0]; + shape[1] = new_shape[1] * new_shape[2]; shape[2] = 1; - shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; - rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); - rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); + shape[3] = reshape_tensor[0]->attr.dim_num > 3 ? new_shape[3] : 1; + rs_input = vsi_nn_kernel_tensor_reshape( reshape_tensor[0]->t, shape, 4 ); + rs_output = vsi_nn_kernel_tensor_reshape( reshape_tensor[1]->t, shape, 4 ); } - else if (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] < GPU_TENSOR_MAX_WIDTH) + else if (new_shape[0] < new_shape[1]) { - shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1]; - shape[1] = 1; - shape[2] = inputs[0]->attr.size[2]; - shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; - rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); - rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); - } - else if (inputs[0]->attr.size[0] < inputs[0]->attr.size[1]) - { - shape[0] = inputs[0]->attr.size[1]; - shape[1] = inputs[0]->attr.size[0]; - shape[2] = inputs[0]->attr.size[2]; - shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; - rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); - rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); + shape[0] = new_shape[1]; + shape[1] = new_shape[0]; + shape[2] = new_shape[2]; + shape[3] = inputs[0]->attr.dim_num > 3 ? 
new_shape[3] : 1; + rs_input = vsi_nn_kernel_tensor_reshape( reshape_tensor[0]->t, shape, 4 ); + rs_output = vsi_nn_kernel_tensor_reshape( reshape_tensor[1]->t, shape, 4 ); } else { - shape[0] = inputs[0]->attr.size[0]; + shape[0] = new_shape[0]; + rs_input = vsi_nn_kernel_tensor_reshape( reshape_tensor[0]->t, new_shape, rank ); + rs_output = vsi_nn_kernel_tensor_reshape( reshape_tensor[1]->t, new_shape, rank ); } memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); @@ -1091,58 +847,37 @@ static vsi_nn_kernel_node_t _setup attr.dim_num = 4; tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr ); - if (inputs[1]->attr.dim_num < 2) - { - shape[0] = inputs[1]->attr.size[0]; - shape[1] = 1; - shape[2] = 1; - shape[3] = 1; - rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 ); - } - if (inputs[2]->attr.dim_num < 2) - { - shape[0] = inputs[2]->attr.size[0]; - shape[1] = 1; - shape[2] = 1; - shape[3] = 1; - rs_gamma = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shape, 4 ); - } + shape[0] = 1; + shape[1] = rank > 2 ? new_shape[2] : 1; + rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 2 ); + rs_gamma = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shape, 2 ); + // Mean Vari { tmp_node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] ); if (tmp_node) { uint32_t index = 0; - if (rs_input) - { - mean_vari_node_params[index++] = rs_input; - vsi_nn_kernel_node_pack_io( &mean_vari_node_params[index], - _INSTANCENORM_MEAN_VARI_PARAM_NUM, NULL, 0, tensors, 1 ); - } - else - { - vsi_nn_kernel_node_pack_io( mean_vari_node_params, - _INSTANCENORM_MEAN_VARI_PARAM_NUM, inputs, 1, tensors, 1 ); - } - index = 2; - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); - mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); - status = vsi_nn_kernel_node_pass_param( tmp_node, mean_vari_node_params, - _INSTANCENORM_MEAN_VARI_PARAM_NUM ); + sums_node_params[index++] = rs_input; + vsi_nn_kernel_node_pack_io( &sums_node_params[index], + _INSTANCENORM_SUMS_PARAM_NUM, NULL, 0, tensors, 1 ); + index = 2; + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); + + status = vsi_nn_kernel_node_pass_param( tmp_node, sums_node_params, + _INSTANCENORM_SUMS_PARAM_NUM ); CHECK_STATUS(status); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[2] ); - vsi_nn_kernel_scalar_release( &mean_vari_node_params[3] ); + vsi_nn_kernel_scalar_release( &sums_node_params[2] ); + vsi_nn_kernel_scalar_release( &sums_node_params[3] ); { // Set default border mode. 
vx_border_t border; border.mode = VX_BORDER_CONSTANT; - border.constant_value.U8 = 0; - border.constant_value.U16 = 0; - if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) - { - border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); - } + + vsi_nn_Float32ToDtype(0, (uint8_t*)&border.constant_value.U32, &inputs[0]->attr.dtype); + status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); } @@ -1155,39 +890,11 @@ static vsi_nn_kernel_node_t _setup if (node) { uint32_t index = 0; - if (rs_input) - { - node_params[index++] = rs_input; - } - else - { - node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; - } - if (inputs[1]->attr.dim_num < 2) - { - node_params[index++] = rs_beta; - } - else - { - node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; - } - if (inputs[2]->attr.dim_num < 2) - { - node_params[index++] = rs_gamma; - } - else - { - node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; - } + node_params[index++] = rs_input; + node_params[index++] = rs_beta; + node_params[index++] = rs_gamma; node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; - if (rs_output) - { - node_params[index++] = rs_output; - } - else - { - node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t; - } + node_params[index++] = rs_output; node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); @@ -1196,24 +903,13 @@ static vsi_nn_kernel_node_t _setup CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &node_params[5] ); vsi_nn_kernel_scalar_release( &node_params[6] ); - { - // Set default border mode. - vx_border_t border; - border.mode = VX_BORDER_CONSTANT; - border.constant_value.U8 = 0; - border.constant_value.U16 = 0; - if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) - { - border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); - } - status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); - CHECK_STATUS(status); - } } } /* Pass parameters to node. 
*/ final: + vsi_safe_release_tensor(reshape_tensor[0]); + vsi_safe_release_tensor(reshape_tensor[1]); if (rs_beta) { vsi_nn_kernel_tensor_release( &rs_beta ); @@ -1230,16 +926,13 @@ final: { vsi_nn_kernel_tensor_release( &rs_output ); } - for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + for ( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) { if ( ikernels[i] ) { vsi_nn_kernel_release( &ikernels[i] ); } - if ( tensors[i] ) - { - vsi_nn_ReleaseTensor( &tensors[i] ); - } + vsi_safe_release_tensor(tensors[i]); } if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} return node; diff --git a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c index 34c51f8..be4a299 100644 --- a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c @@ -38,16 +38,24 @@ __BEGIN_DECLS + #define HASH_L2NORMALIZESCALE_HASH_KEY(AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, _image_2d) \ ((AXIS << 28) | (IN1_DTYPE << 20) | (IN0_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) - #define HASH_L2NORMALIZESCALE_KERNEL_SOURCE_NAME(AXIS) \ - "l2normalizescale_axis"#AXIS +#define KERNEL_SOURCE_1 "l2normalizescale_axis0" +#define KERNEL_SOURCE_2 "l2normalizescale_axis0_2d" +#define KERNEL_SOURCE_3 "l2normalizescale_axis1" -#define HASH_L2NORMALIZESCALE_KERNELS_2D( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + +#define HASH_L2NORMALIZESCALE_KERNELS_2D( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE) \ { HASH_L2NORMALIZESCALE_HASH_KEY(AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1), \ CVIVANTE_NAMESPACE("evis.l2normalizescale_axis"#AXIS"_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_2D"), \ - HASH_L2NORMALIZESCALE_KERNEL_SOURCE_NAME(AXIS) }, + SOURCE }, + +#define HASH_L2NORMALIZESCALE_KERNELS( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE) \ + { HASH_L2NORMALIZESCALE_HASH_KEY(AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("evis.l2normalizescale_axis"#AXIS"_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE), \ + SOURCE }, typedef struct { @@ -58,20 +66,27 @@ typedef struct static const _kernel_map_type _l2normalizescale_kernel_map[] = { - HASH_L2NORMALIZESCALE_KERNELS_2D( 0, F16, F16, F16 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, I8 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, F16 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, U8 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, F16 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, I16 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, F16 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 1, F16, F16, F16 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, I8 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, F16 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, U8 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, F16 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, I16 ) - HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, F16 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, F16, F16, F16, KERNEL_SOURCE_2 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, I8, KERNEL_SOURCE_2 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, F16, KERNEL_SOURCE_2 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, U8, KERNEL_SOURCE_2 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, F16, KERNEL_SOURCE_2 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, I16, KERNEL_SOURCE_2 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, F16, KERNEL_SOURCE_2 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, F16, F16, F16, KERNEL_SOURCE_3 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, I8, 
KERNEL_SOURCE_3 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, F16, KERNEL_SOURCE_3 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, U8, KERNEL_SOURCE_3 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, F16, KERNEL_SOURCE_3 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, I16, KERNEL_SOURCE_3 ) + HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, F16, KERNEL_SOURCE_3 ) + HASH_L2NORMALIZESCALE_KERNELS( 0, F16, F16, F16, KERNEL_SOURCE_1 ) + HASH_L2NORMALIZESCALE_KERNELS( 0, I8 , F16, I8, KERNEL_SOURCE_1 ) + HASH_L2NORMALIZESCALE_KERNELS( 0, I8 , F16, F16, KERNEL_SOURCE_1 ) + HASH_L2NORMALIZESCALE_KERNELS( 0, U8 , F16, U8, KERNEL_SOURCE_1 ) + HASH_L2NORMALIZESCALE_KERNELS( 0, U8 , F16, F16, KERNEL_SOURCE_1 ) + HASH_L2NORMALIZESCALE_KERNELS( 0, I16, F16, I16, KERNEL_SOURCE_1 ) + HASH_L2NORMALIZESCALE_KERNELS( 0, I16, F16, F16, KERNEL_SOURCE_1 ) }; /* @@ -119,6 +134,10 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) int32_t outputZP = 0; float outputScale = 1.0f; float r_inputScale = 1.0f; + float e2InScale = 1.0f; + float inOutScale = 1.0f; + int32_t axis2Dflg = 0; + int32_t inputWidth = 0; input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); @@ -168,7 +187,10 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) outputScale = 1.0f / output_attr->asymm.scale; } + e2InScale = inputScale * inputScale; r_inputScale = 1.0f / inputScale; + inOutScale = inputScale * outputScale; + inputWidth = (int32_t)(output_shape->data[0]); if (1 == axis) { @@ -190,6 +212,13 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) gpu_param.local_size[1] = 1; gpu_param.global_size[0] = 16; gpu_param.global_size[1] = output_shape->data[1]; + + if (output_shape->data[0] < GPU_TENSOR_MAX_WIDTH + && output_shape->data[1] < GPU_TENSOR_MAX_WIDTH + && (output_shape->size == 2 || (output_shape->size == 3 && output_shape->data[2] == 1))) + { + axis2Dflg = 1; + } } else { @@ -257,8 +286,105 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) 0x00000400, // AccumType, ConstantType, and PostShift 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertSecFp16Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, 
// AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; - if (1 == axis) + if (axis2Dflg) + { + float zP2x = 2 * (float)inputZP; + float zpSqr8x = 8 * (float)inputZP * (float)inputZP; + float output_ZP = (float)outputZP; + status = vsi_nn_kernel_gpu_add_param( node, "inputWidth", &inputWidth); + status |= vsi_nn_kernel_gpu_add_param( node, "zP2x", &zP2x); + status |= vsi_nn_kernel_gpu_add_param( node, "zpSqr8x", &zpSqr8x); + status |= vsi_nn_kernel_gpu_add_param( node, "e2InScale", &e2InScale); + status |= vsi_nn_kernel_gpu_add_param( node, "inOutScale", &inOutScale); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &output_ZP); + status |= vsi_nn_kernel_gpu_add_param( node, "inputZP", &inputZP); + status |= vsi_nn_kernel_gpu_add_param( node, "uniInt16SumSqr_dp8x2", &uniInt16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", &uniConvertSecFp16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", &uniConvertHalfToFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (1 == axis) { int32_t L2NorS_depth = (int32_t)(output_shape->data[1]); status = vsi_nn_kernel_gpu_add_param( node, "L2NorS_depth", &L2NorS_depth); @@ -277,8 +403,7 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) } else if (0 == axis) { - int32_t inputWidth, inputWidthCount, inputWidthRemain256; - inputWidth = 
(int32_t)(output_shape->data[0]); + int32_t inputWidthCount, inputWidthRemain256; inputWidthRemain256 = (int32_t)(output_shape->data[0] % 256); inputWidthCount = (int32_t)(output_shape->data[0] / 256); vsi_nn_kernel_gpu_add_param( node, "inputWidth", &inputWidth); @@ -298,7 +423,8 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) } } - { + if (axis2Dflg == 0) + { float IntergerScale = inputScale; float output_ZP = (float)outputZP; gpu_dp_inst_t uniExtact8Bin_2x8 = {{ @@ -473,7 +599,8 @@ static vsi_nn_kernel_node_t _setup return NULL; } - image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1) && + (inputs[0]->attr.size[0] < GPU_TENSOR_MAX_WIDTH && inputs[0]->attr.size[1] < GPU_TENSOR_MAX_WIDTH); status = _query_kernel( kernel, inputs, outputs, axis, image_2d ); if ( VSI_SUCCESS == status) { diff --git a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c index e6ecaa5..e525f5e 100644 --- a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c @@ -34,40 +34,21 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" __BEGIN_DECLS -/* - * Define kernel meta. - */ - typedef enum -{ - LAYERNORM_KERNEL, - LAYERNORM_2D_KERNEL, - SUMSQR_KERNEL, - SUMSQR_2D_KERNEL, - LAYERNORM_WH_KERNEL, - LAYERNORM_WH_2D_KERNEL, -} _kernel_type_e; +#define SOURCE_AXIS0_0 "layer_normalization_0" +#define SOURCE_AXIS0_1 "layer_normalization_1" +#define SOURCE_AXIS0_2 "layer_normalization_2" +#define SOURCE_AXIS0_3 "layer_normalization_3" +#define SOURCE_AXIS01 "layer_normalization_axis01" -#define KERNEL_SOURCE_1 "layer_normalization" -#define KERNEL_SOURCE_2 "layer_normalization_2d" -#define KERNEL_SOURCE_3 "layer_normalization_u8_f16" -#define KERNEL_SOURCE_4 "layer_normalization_wh_u8" -#define KERNEL_SOURCE_5 "layer_normalization_wh_f16" -#define KERNEL_SOURCE_6 "layer_normalization_i16" -#define KERNEL_SOURCE_7 "layer_normalization_wh_i16" -#define KERNEL_SOURCE_8 "layer_normalization_scale_f32" -#define KERNEL_SOURCE_9 "layer_normalization_scale_f32_2d" -#define KERNEL_SOURCE_10 "layer_normalization_scale_f32_bf16" +#define HASH_LAYERNORM_SH_KERNEL_NAME(SRC0_TYPE, SCALE_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layer_norm_axis0_"#SRC0_TYPE"_"#SCALE_TYPE"to"#DST_TYPE) -#define HASH_LAYERNORM_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"to"#DST_TYPE) - -#define HASH_LAYERNORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") +#define HASH_LAYERNORM_SH_KERNEL_2D_NAME(SRC0_TYPE, SCALE_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layer_norm_axis0_"#SRC0_TYPE"_"#SCALE_TYPE"to"#DST_TYPE"_2D") #define HASH_LAYERNORM_SH_KERNEL_SCALE_NAME(SRC0_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"F32to"#DST_TYPE) @@ -79,58 +60,43 @@ __BEGIN_DECLS #define HASH_LAYERNORM_KEY(_input0_type, _input2_type, _output_type, _reshape_flag) \ ((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | _reshape_flag) -#define TENSOR_LAYERNORM_KERNELS(IN0_TYPE, SCALE_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, SCALE_TYPE, OUT_TYPE, LAYERNORM_KERNEL), \ - HASH_LAYERNORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ +#define 
LAYERNORM_KERNELS_3D(IN0_TYPE, SCALE_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, SCALE_TYPE, OUT_TYPE, 0), \ + HASH_LAYERNORM_SH_KERNEL_NAME(IN0_TYPE, SCALE_TYPE, OUT_TYPE), \ SOURCE }, -#define TENSOR_LAYERNORM_KERNELS_2D(IN0_TYPE, SCALE_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, SCALE_TYPE, OUT_TYPE, LAYERNORM_2D_KERNEL), \ - HASH_LAYERNORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ +#define LAYERNORM_KERNELS_2D(IN0_TYPE, SCALE_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, SCALE_TYPE, OUT_TYPE, 1), \ + HASH_LAYERNORM_SH_KERNEL_2D_NAME(IN0_TYPE, SCALE_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_LAYERNORM_SCALE_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, F32, OUT_TYPE, LAYERNORM_KERNEL), \ + { HASH_LAYERNORM_KEY(IN0_TYPE, F32, OUT_TYPE, 0), \ HASH_LAYERNORM_SH_KERNEL_SCALE_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_LAYERNORM_SCALE_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, F32, OUT_TYPE, LAYERNORM_2D_KERNEL), \ + { HASH_LAYERNORM_KEY(IN0_TYPE, F32, OUT_TYPE, 1), \ HASH_LAYERNORM_SH_KERNEL_SCALE_2D_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, -// greater than max size -#define HASH_SUMSQR_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.layernorm_wh_sumSqr_"#SRC0_TYPE"to"#DST_TYPE) +// layer norm on aix 0 and 1 -#define HASH_SUMSQR_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.layernorm_wh_sumSqr_"#SRC0_TYPE"to"#DST_TYPE"_2D") +#define HASH_LN_AXIS01_SUMS_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layernorm_axis01_sums_"#SRC0_TYPE"to"#DST_TYPE) -#define HASH_LAYERNORM_WH_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.layernorm_wh_"#SRC0_TYPE"to"#DST_TYPE) +#define HASH_LN_AXIS01_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layernorm_axis01_"#SRC0_TYPE"_"#SRC1_TYPE"to"#DST_TYPE) -#define HASH_LAYERNORM_WH_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("evis.layernorm_wh_"#SRC0_TYPE"to"#DST_TYPE"_2D") +#define LN_AXIS01_SUMS_KERNELS(IN0_TYPE, OUT_TYPE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, U4, OUT_TYPE, 0), \ + HASH_LN_AXIS01_SUMS_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE_AXIS01 }, -#define TENSOR_SUMSQR_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, SUMSQR_KERNEL), \ - HASH_SUMSQR_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ - SOURCE }, - -#define TENSOR_SUMSQR_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, SUMSQR_2D_KERNEL), \ - HASH_SUMSQR_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ - SOURCE }, - -#define TENSOR_LAYERNORM_WH_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_WH_KERNEL), \ - HASH_LAYERNORM_WH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ - SOURCE }, - -#define TENSOR_LAYERNORM_WH_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_WH_2D_KERNEL), \ - HASH_LAYERNORM_WH_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ - SOURCE }, +#define LAYERNORM_AXIS01_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + HASH_LN_AXIS01_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE_AXIS01 }, typedef struct { @@ -142,50 +108,84 @@ typedef struct static const _kernel_map_type _layernorm_kernel_map[] = { // Register kernel here - TENSOR_LAYERNORM_KERNELS( U8, F16, U8, KERNEL_SOURCE_1 ) - TENSOR_LAYERNORM_KERNELS_2D( U8, F16, U8, KERNEL_SOURCE_2 ) - 
TENSOR_LAYERNORM_KERNELS( U8, F16, F16, KERNEL_SOURCE_3 ) - TENSOR_LAYERNORM_KERNELS_2D( U8, F16, F16, KERNEL_SOURCE_3 ) - TENSOR_LAYERNORM_KERNELS( U8, F32, F16, KERNEL_SOURCE_3 ) - TENSOR_LAYERNORM_KERNELS_2D( U8, F32, F16, KERNEL_SOURCE_3 ) + LAYERNORM_KERNELS_3D( U8, F16, U8, SOURCE_AXIS0_0 ) + LAYERNORM_KERNELS_2D( U8, F16, U8, SOURCE_AXIS0_0 ) + LAYERNORM_KERNELS_3D( U8, F16, F16, SOURCE_AXIS0_0 ) + LAYERNORM_KERNELS_2D( U8, F16, F16, SOURCE_AXIS0_0 ) + LAYERNORM_KERNELS_3D( I8, F16, I8, SOURCE_AXIS0_0 ) + LAYERNORM_KERNELS_2D( I8, F16, I8, SOURCE_AXIS0_0 ) + LAYERNORM_KERNELS_3D( I8, F16, F16, SOURCE_AXIS0_0 ) + LAYERNORM_KERNELS_2D( I8, F16, F16, SOURCE_AXIS0_0 ) - TENSOR_LAYERNORM_KERNELS( F16, F16, F16, KERNEL_SOURCE_1 ) - TENSOR_LAYERNORM_KERNELS_2D( F16, F16, F16, KERNEL_SOURCE_2 ) - TENSOR_LAYERNORM_KERNELS( F16, F16, U8, KERNEL_SOURCE_1 ) - TENSOR_LAYERNORM_KERNELS_2D( F16, F16, U8, KERNEL_SOURCE_2 ) - TENSOR_LAYERNORM_KERNELS( I16, F16, I16, KERNEL_SOURCE_6 ) - TENSOR_LAYERNORM_KERNELS_2D( I16, F16, I16, KERNEL_SOURCE_6 ) + LAYERNORM_KERNELS_3D( F16, F16, F16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( F16, F16, F16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( I16, F16, I16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( I16, F16, I16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( F16, F16, I16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( F16, F16, I16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( F16, F16, I8, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( F16, F16, I8, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( F16, F16, U8, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( F16, F16, U8, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( I16, F16, F16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( I16, F16, F16, SOURCE_AXIS0_1 ) - TENSOR_LAYERNORM_SCALE_KERNELS( U8, U8, KERNEL_SOURCE_8 ) - TENSOR_LAYERNORM_SCALE_KERNELS_2D( U8, U8, KERNEL_SOURCE_9 ) - TENSOR_LAYERNORM_SCALE_KERNELS( I8, I8, KERNEL_SOURCE_8 ) - TENSOR_LAYERNORM_SCALE_KERNELS_2D( I8, I8, KERNEL_SOURCE_9 ) - TENSOR_LAYERNORM_SCALE_KERNELS( I16, I16, KERNEL_SOURCE_8 ) - TENSOR_LAYERNORM_SCALE_KERNELS_2D( I16, I16, KERNEL_SOURCE_9 ) - TENSOR_LAYERNORM_SCALE_KERNELS( F16, F16, KERNEL_SOURCE_8 ) - TENSOR_LAYERNORM_SCALE_KERNELS_2D( F16, F16, KERNEL_SOURCE_9 ) - TENSOR_LAYERNORM_SCALE_KERNELS( BF16, BF16, KERNEL_SOURCE_10 ) - TENSOR_LAYERNORM_SCALE_KERNELS_2D( BF16, BF16, KERNEL_SOURCE_10 ) + LAYERNORM_KERNELS_3D( F16, F32, F16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( F16, F32, F16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( I16, F32, I16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( I16, F32, I16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( F16, F32, I16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( F16, F32, I16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( F16, F32, I8, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( F16, F32, I8, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( F16, F32, U8, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( F16, F32, U8, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_3D( I16, F32, F16, SOURCE_AXIS0_1 ) + LAYERNORM_KERNELS_2D( I16, F32, F16, SOURCE_AXIS0_1 ) + + LAYERNORM_KERNELS_3D( U8, F32, U8, SOURCE_AXIS0_2 ) + LAYERNORM_KERNELS_2D( U8, F32, U8, SOURCE_AXIS0_2 ) + LAYERNORM_KERNELS_3D( U8, F32, F16, SOURCE_AXIS0_2 ) + LAYERNORM_KERNELS_2D( U8, F32, F16, SOURCE_AXIS0_2 ) + LAYERNORM_KERNELS_3D( I8, F32, I8, SOURCE_AXIS0_2 ) + LAYERNORM_KERNELS_2D( I8, F32, I8, SOURCE_AXIS0_2 ) + LAYERNORM_KERNELS_3D( I8, F32, F16, SOURCE_AXIS0_2 ) + LAYERNORM_KERNELS_2D( I8, F32, F16, SOURCE_AXIS0_2 ) + + LAYERNORM_KERNELS_3D( BF16, F32, BF16, SOURCE_AXIS0_3 ) + LAYERNORM_KERNELS_2D( BF16, F32, 
BF16, SOURCE_AXIS0_3 ) }; -static const _kernel_map_type _sumsqr_kernel_map[] = +static const _kernel_map_type _layernorm_axis01_kernel_map[] = { // Register kernel here - TENSOR_SUMSQR_KERNELS( U8, F32, KERNEL_SOURCE_4 ) - TENSOR_SUMSQR_KERNELS_2D( U8, F32, KERNEL_SOURCE_4 ) - TENSOR_SUMSQR_KERNELS( F16, F32, KERNEL_SOURCE_5 ) - TENSOR_SUMSQR_KERNELS_2D( F16, F32, KERNEL_SOURCE_5 ) - TENSOR_SUMSQR_KERNELS( I16, F32, KERNEL_SOURCE_7 ) - TENSOR_SUMSQR_KERNELS_2D( I16, F32, KERNEL_SOURCE_7 ) + LN_AXIS01_SUMS_KERNELS( I8, F32 ) + LN_AXIS01_SUMS_KERNELS( U8, F32 ) + LN_AXIS01_SUMS_KERNELS( F16, F32 ) + LN_AXIS01_SUMS_KERNELS( I16, F32 ) + + LAYERNORM_AXIS01_KERNELS( U8, F16, U8 ) + LAYERNORM_AXIS01_KERNELS( U8, F16, F16 ) + LAYERNORM_AXIS01_KERNELS( I8, F16, I8 ) + LAYERNORM_AXIS01_KERNELS( I8, F16, F16 ) + LAYERNORM_AXIS01_KERNELS( F16, F16, F16 ) + LAYERNORM_AXIS01_KERNELS( F16, F16, I16 ) + LAYERNORM_AXIS01_KERNELS( F16, F16, I8 ) + LAYERNORM_AXIS01_KERNELS( F16, F16, U8 ) + LAYERNORM_AXIS01_KERNELS( I16, F16, I16 ) + LAYERNORM_AXIS01_KERNELS( I16, F16, F16 ) + + LAYERNORM_AXIS01_KERNELS( U8, F32, U8 ) + LAYERNORM_AXIS01_KERNELS( U8, F32, F16 ) + LAYERNORM_AXIS01_KERNELS( I8, F32, I8 ) + LAYERNORM_AXIS01_KERNELS( I8, F32, F16 ) + LAYERNORM_AXIS01_KERNELS( F16, F32, F16 ) + LAYERNORM_AXIS01_KERNELS( F16, F32, I16 ) + LAYERNORM_AXIS01_KERNELS( F16, F32, I8 ) + LAYERNORM_AXIS01_KERNELS( F16, F32, U8 ) + LAYERNORM_AXIS01_KERNELS( I16, F32, I16 ) + LAYERNORM_AXIS01_KERNELS( I16, F32, F16 ) - TENSOR_LAYERNORM_WH_KERNELS( U8, U8, KERNEL_SOURCE_4 ) - TENSOR_LAYERNORM_WH_KERNELS_2D( U8, U8, KERNEL_SOURCE_4 ) - TENSOR_LAYERNORM_WH_KERNELS( U8, F16, KERNEL_SOURCE_4 ) - TENSOR_LAYERNORM_WH_KERNELS_2D( U8, F16, KERNEL_SOURCE_4 ) - TENSOR_LAYERNORM_WH_KERNELS( F16, F16, KERNEL_SOURCE_5 ) - TENSOR_LAYERNORM_WH_KERNELS_2D( F16, F16, KERNEL_SOURCE_5 ) - TENSOR_LAYERNORM_WH_KERNELS( I16, I16, KERNEL_SOURCE_7 ) - TENSOR_LAYERNORM_WH_KERNELS_2D( I16, I16, KERNEL_SOURCE_7 ) }; /* @@ -202,14 +202,14 @@ static vx_param_description_t _layernorm_kernel_param_def[] = // Add kererl parameters here }; -static vx_param_description_t _sumSqr_kernel_param_def[] = +static vx_param_description_t _layernorm_axis01_sums_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; -static vx_param_description_t _layernorm_wh_kernel_param_def[] = +static vx_param_description_t _layernorm_axis01_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -221,8 +221,8 @@ static vx_param_description_t _layernorm_wh_kernel_param_def[] = }; #define _LAYERNORM_PARAM_NUM _cnt_of_array( _layernorm_kernel_param_def ) -#define _SUMSQR_PARAM_NUM _cnt_of_array( _sumSqr_kernel_param_def ) -#define _LAYERNORM_WH_PARAM_NUM _cnt_of_array( _layernorm_wh_kernel_param_def ) +#define _LAYERNORM_SUMS_PARAM_NUM _cnt_of_array( _layernorm_axis01_sums_param_def ) +#define _LAYERNORM_AXIS01_PARAM_NUM _cnt_of_array( _layernorm_axis01_kernel_param_def ) /* * Kernel initializer @@ -245,15 +245,9 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; vsi_size_array_t * input_shape = NULL; - float scaleIn = 1; - float scaleOut = 1; + float output_scale = 1; float output_zp = 0; - int32_t input_zp = 0; - int32_t iter = 0; - int32_t sumInZp = 0; - int32_t tmpZp1 = 0; - int32_t tmpZp2 = 0; - float e2InScale = 0; + float 
inv_multiplier = 0; int32_t height = 0, width = 0, chn = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); @@ -265,61 +259,14 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) input_shape = attr[0]->shape; - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - input_zp = 0; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) - { - scaleIn = 1; - input_zp = 0; - } - - if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - output_zp = (float)attr[2]->asymm.zero_point; - scaleOut = 1.0f / attr[2]->asymm.scale; - } - if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[2]->dfp.fl > 0) - { - scaleOut = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - scaleOut = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - output_zp = 0; - } - else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE) - { - scaleOut = 1; - output_zp = 0.0f; - } + output_scale = 1.0f / attr[2]->scale; + output_zp = (float)attr[2]->zero_point; width = (int32_t)(input_shape->data[0]); height = (int32_t)(input_shape->data[1]); chn = (int32_t)((input_shape->size <= 2) ? 1 : input_shape->data[2]); - iter = ((width + 15) / 16) * 16; - sumInZp = input_zp * iter * (-1); - tmpZp1 = (-2) * input_zp; - tmpZp2 = iter * input_zp * input_zp; - e2InScale = scaleIn * scaleIn; + inv_multiplier = 1.0f / (float)width; shaderParam.global_scale[0] = width; shaderParam.global_scale[1] = 1; @@ -332,125 +279,95 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError); { - float dimRatio = 1.0f / (float)width; - float dimRatio_scale = dimRatio * scaleIn; - gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0x76543210, // ABin - 0x5555aaaa, // BSelt - 0x00000000, 0x76543210, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + gpu_dp_inst_t uniDataToFP32_0_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00010000, 0x00030002, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniExtractHalf4_dp4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00020000, 0x00060004, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertSecFp16Fp32_4x4 = {{ + gpu_dp_inst_t uniDataToFP32_1_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00050004, 0x00070006, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 
0x00003c00, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniSumU8_16x1 = {{ - 0x55555555, // TCfg + gpu_dp_inst_t uniDataToFP32_2_4x4 = {{ + 0x01010101, // TCfg 0x00000000, // ASelt - 0x76543210, 0xfedcba98, // ABin - 0xaaaaaaaa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniSqrSum_16x1 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0xfedcba98, // ABin - 0x55555555, // BSelt - 0x76543210, 0xfedcba98, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00050004, 0x00070006, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert3rdUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt 0x00090008, 0x000b000a, // ABin - 0x0a0a0a0a, // BSelt + 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert4thUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt + gpu_dp_inst_t uniDataToFP32_3_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt 0x000d000c, 0x000f000e, // ABin - 0x0a0a0a0a, // BSelt + 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ 0x33333333, // TCfg 0x11110000, // ASelt 0x03020100, 0x03020100, // ABin 0x00000000, // BSelt 0x00000000, 0x00000000, // 
BBin 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t UniPackFP16even_2x8 = {{ - 0x11111111, // TCfg - 0x11110000, // ASelt - 0x06040200, 0x06040200, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + gpu_dp_inst_t uniSumX_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{ + gpu_dp_inst_t uniSumX2_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0x55555555, // BSelt + 0x76543210, 0xfedcba98, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSum_X_X2_8x2 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0x76543210, // ABin - 0x5555aaaa, // BSelt + 0x0000aaaa, // BSelt 0x00000000, 0x76543210, // BBin - 0x00000300, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ @@ -491,143 +408,75 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, attr[2]->dtype ); status = vsi_nn_kernel_gpu_add_param(node, "width", &width); - status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio); + status |= vsi_nn_kernel_gpu_add_param(node, "inv_multiplier", &inv_multiplier); CHECK_STATUS_FAIL_GOTO(status, OnError ); switch( pack_key ) { case _PACK_SELECT_KEY( U8, F16, F16 ): case _PACK_SELECT_KEY( U8, F32, F16 ): + case _PACK_SELECT_KEY( I8, F16, F16 ): + case _PACK_SELECT_KEY( I8, F32, F16 ): + case _PACK_SELECT_KEY( U8, F16, U8 ): + case _PACK_SELECT_KEY( U8, F32, U8 ): + case _PACK_SELECT_KEY( I8, F16, I8 ): + case _PACK_SELECT_KEY( I8, F32, I8 ): { - status = vsi_nn_kernel_gpu_add_param(node, "UniPackFP16even_2x8", - &UniPackFP16even_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", - &uniConvert3rdUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", - &uniConvert4thUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); 
- status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); - status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); - status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp2", &tmpZp2); + if (attr[2]->dtype == F16) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractInteger_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + } + status |= vsi_nn_kernel_gpu_add_param(node, "uniSumX_16x1", &uniSumX_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSumX2_16x1", &uniSumX2_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4", + &uniDataToFP32_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4", + &uniDataToFP32_1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_2_4x4", + &uniDataToFP32_2_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_3_4x4", + &uniDataToFP32_3_4x4); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( U8, F16, U8 ): + case _PACK_SELECT_KEY( I16, F16, F16 ): + case _PACK_SELECT_KEY( I16, F32, F16 ): case _PACK_SELECT_KEY( F16, F16, F16 ): - case _PACK_SELECT_KEY( F16, F16, U8 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", - &uniFp16SumSqr_dp8x2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_dp4x4", - &uniExtractHalf4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", - &uniConvert3rdUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", - &uniConvert4thUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", - &uniConvertSecFp16Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", - &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); - status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); - status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp2", &tmpZp2); - status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( I16, F16, I16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2", - &uniInt16SumSqr_dp8x2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - 
&uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", - &uniConvertSecFp16Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", - &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); - status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio_scale", &dimRatio_scale); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( U8, F32, U8 ): case _PACK_SELECT_KEY( F16, F32, F16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", - &uniFp16SumSqr_dp8x2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_dp4x4", - &uniExtractHalf4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", - &uniConvert3rdUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", - &uniConvert4thUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", - &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); - status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); - status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp2", &tmpZp2); - status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; + case _PACK_SELECT_KEY( I16, F16, I16 ): case _PACK_SELECT_KEY( I16, F32, I16 ): + case _PACK_SELECT_KEY( F16, F16, I16 ): + case _PACK_SELECT_KEY( F16, F32, I16 ): + case _PACK_SELECT_KEY( F16, F16, U8 ): + case _PACK_SELECT_KEY( F16, F32, U8 ): + case _PACK_SELECT_KEY( F16, F16, I8 ): + case _PACK_SELECT_KEY( F16, F32, I8 ): { - status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2", - &uniInt16SumSqr_dp8x2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", - &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, 
"UniFP16toFP32Lo4_dp4x4", - &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + if (attr[2]->dtype == F16) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractHalf8_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", + &uniExtractInteger_2x8); + } + status |= vsi_nn_kernel_gpu_add_param(node, "uniSum_X_X2_8x2", &uniSum_X_X2_8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4", + &uniDataToFP32_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4", + &uniDataToFP32_1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); - status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio_scale", &dimRatio_scale); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; @@ -669,7 +518,7 @@ OnError: return status; } -DEF_KERNEL_INITIALIZER(_sumsqr_initializer) +DEF_KERNEL_INITIALIZER(_layernorm_axis01_sums_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -686,14 +535,6 @@ DEF_KERNEL_INITIALIZER(_sumsqr_initializer) vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; vsi_size_array_t * input_shape = NULL; - float scaleIn = 1.0f; - int32_t input_zp = 0; - vx_uint32 iter = 0; - int32_t sumInZp = 0; - int32_t tmpZp1 = 0; - float tmpZp2 = 0; - float e2InScale = 0; - float rowSumScale = 0; int32_t width = 0; int32_t height = 0; int32_t chn = 0; @@ -705,37 +546,9 @@ DEF_KERNEL_INITIALIZER(_sumsqr_initializer) input_shape = attr[0]->shape; - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - input_zp = 0; - } - width = (int32_t)(input_shape->data[0]); height = (int32_t)(input_shape->data[1]); chn = (int32_t)(attr[1]->shape->data[1]); - iter = height * 16; - - e2InScale = scaleIn * scaleIn; - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - sumInZp = input_zp * iter * (-1); - tmpZp1 = (-2) * input_zp; - tmpZp2 = input_zp * input_zp * e2InScale; - rowSumScale = height * 16 * tmpZp2; - } shaderParam.global_scale[0] = 1; shaderParam.global_scale[1] = 1; @@ -758,9 +571,9 @@ DEF_KERNEL_INITIALIZER(_sumsqr_initializer) status = vsi_nn_kernel_gpu_config( node, &shaderParam ); CHECK_STATUS_FAIL_GOTO(status, OnError); - if (attr[0]->dtype == U8) + if (attr[0]->dtype == U8 || attr[0]->dtype == I8) { - gpu_dp_inst_t uniSumU8_16x1 = {{ + gpu_dp_inst_t uniSumX_16x1 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0xfedcba98, // ABin @@ -769,7 +582,7 @@ DEF_KERNEL_INITIALIZER(_sumsqr_initializer) 0x00002400, // AccumType, ConstantType, and PostShift 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniSqrSum_16x1 = {{ + gpu_dp_inst_t uniSumX2_16x1 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0xfedcba98, // ABin @@ -778,43 +591,23 @@ DEF_KERNEL_INITIALIZER(_sumsqr_initializer) 0x00000400, // AccumType, 
ConstantType, and PostShift 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); - status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); - status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); - status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale); + status = vsi_nn_kernel_gpu_add_param(node, "uniSumX_16x1", &uniSumX_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSumX2_16x1", &uniSumX2_16x1); CHECK_STATUS_FAIL_GOTO(status, OnError ); } - else if (attr[0]->dtype == F16) + else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) { - gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ + gpu_dp_inst_t uniSum_X_X2_8x2 = {{ 0x55555555, // TCfg 0x00000000, // ASelt 0x76543210, 0x76543210, // ABin - 0x5555aaaa, // BSelt + 0x0000aaaa, // BSelt 0x00000000, 0x76543210, // BBin 0x00000100, // AccumType, ConstantType, and PostShift - 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - else if (attr[0]->dtype == I16) - { - gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{ - 0x55555555, // TCfg - 0x00000000, // ASelt - 0x76543210, 0x76543210, // ABin - 0x5555aaaa, // BSelt - 0x00000000, 0x76543210, // BBin - 0x00000300, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2", &uniInt16SumSqr_dp8x2); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + status = vsi_nn_kernel_gpu_add_param(node, "uniSum_X_X2_8x2", &uniSum_X_X2_8x2); CHECK_STATUS_FAIL_GOTO(status, OnError ); } @@ -837,7 +630,7 @@ OnError: return status; } -DEF_KERNEL_INITIALIZER(_layernorm_wh_initializer) +DEF_KERNEL_INITIALIZER(_layernorm_axis01_initializer) ( vsi_nn_kernel_node_t node, const vsi_nn_kernel_node_param_t * param, @@ -854,13 +647,11 @@ DEF_KERNEL_INITIALIZER(_layernorm_wh_initializer) vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; vsi_size_array_t * input_shape = NULL; - float scaleIn = 1.0f; - float scaleOut = 1.0f; + float output_scale = 1.0f; float output_zp = 0; - int32_t input_zp = 0; - float dimRatio = 0; + float inv_multiplier = 0; vx_uint32 group_num = 0; - vx_int32 height = 0, width = 0, chn = 0, height_chn_org = 0; + vx_int32 height = 0, width = 0, chn = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -870,49 +661,14 @@ DEF_KERNEL_INITIALIZER(_layernorm_wh_initializer) CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); input_shape = attr[0]->shape; - - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; 
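/* Dequantization convention assumed by these initializers (a sketch, not the
 * full driver logic): DFP maps a fractional length fl to
 *     scale = (fl > 0) ? 1.0f / (float)(1 << fl) : (float)(1 << -fl), zero_point = 0,
 * while ASYMM supplies scale and zero_point directly; in both cases
 *     real = ((float)q - zero_point) * scale. */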
- } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - input_zp = 0; - } - - if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - output_zp = (float)attr[2]->asymm.zero_point; - scaleOut = 1.0f / attr[2]->asymm.scale; - } - else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[2]->dfp.fl > 0) - { - scaleOut = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - scaleOut = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - output_zp = 0; - } + output_scale = 1.0f / attr[2]->scale; + output_zp = (float)attr[2]->zero_point; width = (int32_t)(input_shape->data[0]); height = (int32_t)(input_shape->data[1]); chn = (int32_t)(attr[1]->shape->data[1]); - height_chn_org = (int32_t)((input_shape->size > 2 ? input_shape->data[2] : 1) / chn); - dimRatio = (float)(1.0 / (width * height)); + inv_multiplier = (float)(1.0 / (width * height)); group_num = (width + 255) / 256; if (attr[0]->dtype == I16 || attr[0]->dtype == F16) @@ -933,25 +689,37 @@ DEF_KERNEL_INITIALIZER(_layernorm_wh_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError); { - gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + gpu_dp_inst_t uniDataToFP32_0_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00010000, 0x00030002, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertSecFp16Fp32_4x4 = {{ + gpu_dp_inst_t uniDataToFP32_1_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt 0x00050004, 0x00070006, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ 0x33333333, // TCfg 0x11110000, // ASelt 0x03020100, 0x03020100, // ABin @@ -961,91 +729,26 @@ DEF_KERNEL_INITIALIZER(_layernorm_wh_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ - 0x05050505, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{ - 0x05050505, // 
TCfg - 0x04040404, // ASelt - 0x00050004, 0x00070006, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{ - 0x11111111, // TCfg - 0x11110000, // ASelt - 0x06040200, 0x06040200, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant - }, GPU_DP_TYPE_16 }; - uint32_t pack_key = 0; -#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ - (IN0_TYPE | (OUT_TYPE << 8)) - - pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype ); status = vsi_nn_kernel_gpu_add_param(node, "height", &height); - status |= vsi_nn_kernel_gpu_add_param(node, "height_depth", &height_chn_org); - status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio); + status |= vsi_nn_kernel_gpu_add_param(node, "inv_multiplier", &inv_multiplier); status |= vsi_nn_kernel_gpu_add_param(node, "group_num", &group_num); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", &uniConvertSecFp16Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4", &uniDataToFP32_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4", &uniDataToFP32_1_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - - switch( pack_key ) + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + if (attr[2]->dtype == F16) { - case _PACK_SELECT_KEY( U8, U8 ): - case _PACK_SELECT_KEY( U8, F16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( F16, F16 ): - case _PACK_SELECT_KEY( F16, U8 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( I16, I16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", - &uniConvert1stUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", - &uniConvert2ndUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - default: - VSI_ASSERT( FALSE ); - return VSI_FAILURE; + status |= vsi_nn_kernel_gpu_add_param( + node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); } -#undef _PACK_SELECT_KEY + else + { 
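/* Integer outputs are packed with uniExtractInteger_2x8 after a requantization
 * of roughly q = round(x * output_scale) + output_zp, where output_scale is the
 * reciprocal of the output tensor scale set a few lines above. */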
+ status |= vsi_nn_kernel_gpu_add_param( + node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); + } + CHECK_STATUS_FAIL_GOTO(status, OnError ); } OnError: @@ -1076,7 +779,7 @@ static vsi_status _query_kernel vsi_nn_tensor_t* const* const inputs, vsi_nn_tensor_t* const* const outputs, vsi_nn_kernel_t* kernel, - int32_t reshape2D + int32_t is_img2d_input ) { vsi_status status = VSI_FAILURE; @@ -1084,19 +787,13 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e input2_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; - int i = 0; - _kernel_type_e kernel_type = LAYERNORM_KERNEL; - - if (reshape2D) - { - kernel_type = LAYERNORM_2D_KERNEL; - } + int32_t i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, output_dtype, kernel_type ); + key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, output_dtype, is_img2d_input ); for( i = 0; i < _cnt_of_array(_layernorm_kernel_map); i ++ ) { @@ -1122,14 +819,12 @@ static vsi_status _query_kernel return status; } /* _query_kernel() */ -static vsi_status _query_kernel_wh +static vsi_status _query_kernel_axis01 ( vsi_nn_tensor_t* const* const inputs, vsi_nn_tensor_t* const* const outputs, - vsi_nn_kernel_t* kernel_sumSqr, - vsi_nn_kernel_t* kernel, - _kernel_type_e is2D_sumsqr, - _kernel_type_e is2D_wh + vsi_nn_kernel_t* kernel_sums, + vsi_nn_kernel_t* kernel ) { vsi_status status = VSI_FAILURE; @@ -1143,56 +838,56 @@ static vsi_status _query_kernel_wh input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, F32, is2D_sumsqr ); + key = HASH_LAYERNORM_KEY( input0_dtype, U4, F32, 0 ); - for( i = 0; i < _cnt_of_array(_sumsqr_kernel_map); i ++ ) + for( i = 0; i < _cnt_of_array(_layernorm_axis01_kernel_map); i ++ ) { - if ( _sumsqr_kernel_map[i].key == key ) + if ( _layernorm_axis01_kernel_map[i].key == key ) { break; } } - if ( i < _cnt_of_array(_sumsqr_kernel_map) ) + if ( i < _cnt_of_array(_layernorm_axis01_kernel_map) ) { - snprintf( kernel_sumSqr->info.name, VX_MAX_KERNEL_NAME, "%s", _sumsqr_kernel_map[i].function_name ); - kernel_sumSqr->info.parameters = _sumSqr_kernel_param_def; - kernel_sumSqr->info.numParams = _SUMSQR_PARAM_NUM; - kernel_sumSqr->info.initialize = _sumsqr_initializer; + snprintf( kernel_sums->info.name, VX_MAX_KERNEL_NAME, "%s", _layernorm_axis01_kernel_map[i].function_name ); + kernel_sums->info.parameters = _layernorm_axis01_sums_param_def; + kernel_sums->info.numParams = _LAYERNORM_SUMS_PARAM_NUM; + kernel_sums->info.initialize = _layernorm_axis01_sums_initializer; - vsi_nn_kernel_add_source( kernel_sumSqr, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + vsi_nn_kernel_add_source( kernel_sums, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", - _sumsqr_kernel_map[i].source_name ); - vsi_nn_kernel_add_source( kernel_sumSqr, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - _sumsqr_kernel_map[i].source_name ); + _layernorm_axis01_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_sums, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _layernorm_axis01_kernel_map[i].source_name ); } - key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, output_dtype, is2D_wh ); + key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, output_dtype, 0 ); - for( i = 0; i < 
_cnt_of_array(_sumsqr_kernel_map); i ++ ) + for ( i = 0; i < _cnt_of_array(_layernorm_axis01_kernel_map); i ++ ) { - if ( _sumsqr_kernel_map[i].key == key ) + if ( _layernorm_axis01_kernel_map[i].key == key ) { break; } } - if ( i < _cnt_of_array(_sumsqr_kernel_map) ) + if ( i < _cnt_of_array(_layernorm_axis01_kernel_map) ) { - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _sumsqr_kernel_map[i].function_name ); - kernel->info.parameters = _layernorm_wh_kernel_param_def; - kernel->info.numParams = _LAYERNORM_WH_PARAM_NUM; - kernel->info.initialize = _layernorm_wh_initializer; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _layernorm_axis01_kernel_map[i].function_name ); + kernel->info.parameters = _layernorm_axis01_kernel_param_def; + kernel->info.numParams = _LAYERNORM_AXIS01_PARAM_NUM; + kernel->info.initialize = _layernorm_axis01_initializer; vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, "vsi_nn_kernel_header", - _sumsqr_kernel_map[i].source_name ); + _layernorm_axis01_kernel_map[i].source_name ); vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - _sumsqr_kernel_map[i].source_name ); + _layernorm_axis01_kernel_map[i].source_name ); status = VSI_SUCCESS; } return status; -} /* _query_kernel_wh() */ +} /* _query_kernel_axis01() */ -static vsi_nn_kernel_node_t _setup_wh +static vsi_nn_kernel_node_t _setup_axis01 ( vsi_nn_graph_t * graph, vsi_nn_tensor_t ** inputs, @@ -1205,30 +900,22 @@ static vsi_nn_kernel_node_t _setup_wh { vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL; - vsi_nn_kernel_node_param_t sumSqr_node_params[_SUMSQR_PARAM_NUM] = { NULL }; - vsi_nn_kernel_node_param_t node_params[_LAYERNORM_WH_PARAM_NUM] = { NULL }; - vsi_nn_kernel_node_t tmp_node = NULL; + vsi_nn_kernel_node_param_t sums_node_params[_LAYERNORM_SUMS_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t node_params[_LAYERNORM_AXIS01_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t sums_node = NULL; vsi_nn_kernel_node_t node = NULL; vsi_nn_tensor_attr_t attr; - _kernel_type_e is2D_sumsqr = SUMSQR_2D_KERNEL; - _kernel_type_e is2D_wh = LAYERNORM_WH_2D_KERNEL; - vsi_nn_kernel_t * kernel_sumSqr = NULL; - vsi_nn_tensor_t * tensor_sumSqr = NULL; - float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); - + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + vsi_nn_kernel_t * kernel_sums = NULL; + vsi_nn_tensor_t * tensor_sums = NULL; + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ) / + (input_scale * input_scale); int32_t axis[VSI_NN_MAX_DIM_NUM] = {0}; int32_t axis_num = 1; int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; vsi_size_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }}; uint32_t axis_size = 0; uint32_t rank_in = 0, rank_para = 0; - vsi_size_t outer_size = 1; - uint32_t i = 0; - - for(i = 1; i < inputs[0]->attr.dim_num; i++) - { - outer_size *= inputs[0]->attr.size[i]; - } status = vsi_nn_kernel_optimize_tensor_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, @@ -1254,15 +941,9 @@ static vsi_nn_kernel_node_t _setup_wh rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[0], rank_in); - if (rank_in > 2) - { - is2D_sumsqr = SUMSQR_KERNEL; - is2D_wh = LAYERNORM_WH_KERNEL; - } - - kernel_sumSqr = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + kernel_sums = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); // Assign unique_id - kernel_sumSqr->unique_id = kernel->unique_id; + kernel_sums->unique_id = kernel->unique_id; memset( &attr, 0, 
sizeof(vsi_nn_tensor_attr_t) ); attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; @@ -1275,76 +956,59 @@ static vsi_nn_kernel_node_t _setup_wh { attr.size[0] = ((new_shape[0][0] + 127) / 128) * 4; } - attr.size[1] = outer_size; + attr.size[1] = new_shape[0][2]; attr.size[2] = 1; - attr.size[3] = 1; - attr.dim_num = 4; - tensor_sumSqr = vsi_nn_CreateTensor( graph, &attr ); + attr.size[3] = new_shape[0][3]; + attr.dim_num = rank_in; + tensor_sums = vsi_nn_CreateTensor( graph, &attr ); - status = _query_kernel_wh(inputs, outputs, kernel_sumSqr, kernel, is2D_sumsqr, is2D_wh); + status = _query_kernel_axis01(inputs, outputs, kernel_sums, kernel); if ( VSI_SUCCESS != status ) { goto final; } + /* + ** sum(x) and sumsq(x*x) + */ + sums_node = vsi_nn_kernel_create_node(graph, kernel_sums); + if (sums_node) { - tmp_node = vsi_nn_kernel_create_node( graph, kernel_sumSqr ); - if (tmp_node) - { - sumSqr_node_params[0] = rs_input; - sumSqr_node_params[1] = (vsi_nn_kernel_node_param_t)tensor_sumSqr->t; + sums_node_params[0] = rs_input; + sums_node_params[1] = (vsi_nn_kernel_node_param_t)tensor_sums->t; - status = vsi_nn_kernel_node_pass_param( tmp_node, sumSqr_node_params, - _SUMSQR_PARAM_NUM ); + status = vsi_nn_kernel_node_pass_param( + sums_node, sums_node_params, _LAYERNORM_SUMS_PARAM_NUM); + CHECK_STATUS(status); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U16 = 0; + status = vxSetNodeAttribute( + (vx_node)sums_node, VX_NODE_BORDER, &border, sizeof(border)); CHECK_STATUS(status); - { - // Set default border mode. - vx_border_t border; - border.mode = VX_BORDER_CONSTANT; - border.constant_value.U8 = 0; - border.constant_value.U16 = 0; - if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) - { - border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); - } - status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); - CHECK_STATUS(status); - } } } + node = vsi_nn_kernel_create_node( graph, kernel ); + if (node) { - node = vsi_nn_kernel_create_node( graph, kernel ); - if (node) - { - uint32_t index = 0; - node_params[index++] = rs_input; - node_params[index++] = rs_beta; - node_params[index++] = rs_gamma; - node_params[index++] = (vsi_nn_kernel_node_param_t)tensor_sumSqr->t; - node_params[index++] = rs_output; - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + uint32_t index = 0; + node_params[index++] = rs_input; + node_params[index++] = rs_beta; + node_params[index++] = rs_gamma; + node_params[index++] = (vsi_nn_kernel_node_param_t)tensor_sums->t; + node_params[index++] = rs_output; + node_params[index++] = vsi_nn_kernel_scalar_create(graph, F32, &eps); - status = vsi_nn_kernel_node_pass_param( node, node_params, - _LAYERNORM_WH_PARAM_NUM ); - CHECK_STATUS(status); - vsi_nn_kernel_scalar_release( &node_params[5] ); - { - // Set default border mode. 
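/* Note on the eps scaling above: eps is divided by input_scale * input_scale on
 * the assumption that the sums kernel accumulates raw quantized values, so the
 * variance it produces is expressed in 1/scale^2 units; adding eps/scale^2 there
 * is arithmetically the same as adding eps after rescaling, since
 *     (var_q + eps / scale^2) * scale^2 == var_real + eps. */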
- vx_border_t border; - border.mode = VX_BORDER_CONSTANT; - border.constant_value.U8 = 0; - border.constant_value.U16 = 0; - if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) - { - border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); - } - status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); - CHECK_STATUS(status); - } - } + status = vsi_nn_kernel_node_pass_param( + node, node_params, _LAYERNORM_AXIS01_PARAM_NUM); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release(&node_params[5]); } + final: if (rs_beta) { @@ -1362,20 +1026,20 @@ final: { vsi_nn_kernel_tensor_release( &rs_output ); } - if ( kernel_sumSqr ) + if ( kernel_sums ) { - vsi_nn_kernel_release( &kernel_sumSqr ); + vsi_nn_kernel_release( &kernel_sums ); } - if ( tensor_sumSqr ) + if ( tensor_sums ) { - vsi_nn_ReleaseTensor( &tensor_sumSqr ); + vsi_nn_ReleaseTensor( &tensor_sums ); } - if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} + if (sums_node) {vsi_nn_kernel_node_release( &sums_node );} return node; } -static vsi_nn_kernel_node_t _setup +static vsi_nn_kernel_node_t _setup_axis0 ( vsi_nn_graph_t * graph, vsi_nn_tensor_t ** inputs, @@ -1389,104 +1053,48 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t node_params[_LAYERNORM_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; - vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL; - float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); - vsi_size_t *input_size = inputs[0]->attr.size; - uint32_t dims_num = inputs[0]->attr.dim_num; - int32_t rs_flg = 0; - int32_t optFlg = 0; + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + vsi_nn_tensor_t* rs_tensors[4] = { NULL }; + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ) / + (input_scale * input_scale); + int32_t axis[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t axis_num = 1; + int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }}; + uint32_t axis_size = 0; + uint32_t rank_in = 0; + int32_t is_img2d_input = 0; - if (input_size[0] >= GPU_TENSOR_MAX_WIDTH) + status = vsi_nn_kernel_optimize_tensor_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + axis, axis_num, new_shape[0], &rank_in, new_axis, &axis_size); + if ( status == FALSE) { - node = _setup_wh(graph, inputs, input_num, outputs, output_num, params, kernel); - goto final; + return NULL; } - if ((input_size[1] * input_size[2] < GPU_TENSOR_MAX_WIDTH) - && dims_num > 2) - { - rs_flg = 1; - } - optFlg = rs_flg || (outputs[0]->attr.dim_num < 3); + is_img2d_input = rank_in < 3 || (new_shape[0][2] == 1); - status = _query_kernel( inputs, outputs, kernel, optFlg); + status = _query_kernel( inputs, outputs, kernel, is_img2d_input); if (VSI_SUCCESS != status) { goto final; } - if (rs_flg) - { - vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; - shape[0] = inputs[0]->attr.size[0]; - shape[1] = inputs[0]->attr.size[1] * inputs[0]->attr.size[2]; - shape[2] = 1; - shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; - rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); - - shape[0] = outputs[0]->attr.size[0]; - shape[1] = outputs[0]->attr.size[1] * outputs[0]->attr.size[2]; - shape[2] = 1; - shape[3] = outputs[0]->attr.dim_num > 3 ? 
outputs[0]->attr.size[3] : 1; - rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); - } - if (inputs[1]->attr.dim_num < 2) - { - vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; - shape[0] = inputs[1]->attr.size[0]; - shape[1] = 1; - shape[2] = 1; - shape[3] = 1; - rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 4 ); - } - if (inputs[2]->attr.dim_num < 2) - { - vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; - shape[0] = inputs[2]->attr.size[0]; - shape[1] = 1; - shape[2] = 1; - shape[3] = 1; - rs_gamma = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shape, 4 ); - } + new_shape[1][0] = new_shape[0][0]; + new_shape[1][1] = 1; + rs_tensors[0] = vsi_nn_reshape_tensor(graph, inputs[0], new_shape[0], rank_in); + rs_tensors[1] = vsi_nn_reshape_tensor(graph, inputs[1], new_shape[1], 2); + rs_tensors[2] = vsi_nn_reshape_tensor(graph, inputs[2], new_shape[1], 2); + rs_tensors[3] = vsi_nn_reshape_tensor(graph, outputs[0], new_shape[0], rank_in); // Nomalization node = vsi_nn_kernel_create_node( graph, kernel ); if (node) { - uint32_t index = 0; - if (rs_flg) - { - node_params[index++] = rs_input; - } - else - { - node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; - } - if (inputs[1]->attr.dim_num < 2) - { - node_params[index++] = rs_beta; - } - else - { - node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; - } - if (inputs[2]->attr.dim_num < 2) - { - node_params[index++] = rs_gamma; - } - else - { - node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; - } - if (rs_flg) - { - node_params[index++] = rs_output; - } - else - { - node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t; - } - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + vsi_nn_kernel_node_pack_io(node_params, _LAYERNORM_PARAM_NUM, + rs_tensors, 3, &rs_tensors[3], 1); + node_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); status = vsi_nn_kernel_node_pass_param( node, node_params, _LAYERNORM_PARAM_NUM ); @@ -1496,12 +1104,7 @@ static vsi_nn_kernel_node_t _setup // Set default border mode. vx_border_t border; border.mode = VX_BORDER_CONSTANT; - border.constant_value.U8 = 0; border.constant_value.U16 = 0; - if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) - { - border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); - } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); } @@ -1509,19 +1112,37 @@ static vsi_nn_kernel_node_t _setup /* Pass parameters to node. 
*/ final: - if (rs_beta) + vsi_safe_release_tensor(rs_tensors[0]); + vsi_safe_release_tensor(rs_tensors[1]); + vsi_safe_release_tensor(rs_tensors[2]); + vsi_safe_release_tensor(rs_tensors[3]); + + return node; +} /* _setup() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_nn_kernel_node_t node = NULL; + vsi_size_t *input_size = inputs[0]->attr.size; + + if (input_size[0] >= GPU_TENSOR_MAX_WIDTH) { - vsi_nn_kernel_tensor_release( &rs_beta ); + node = _setup_axis01(graph, inputs, input_num, outputs, output_num, params, kernel); } - if (rs_gamma) + else { - vsi_nn_kernel_tensor_release( &rs_gamma ); - } - if (rs_flg) - { - vsi_nn_kernel_tensor_release( &rs_input ); - vsi_nn_kernel_tensor_release( &rs_output ); + node = _setup_axis0(graph, inputs, input_num, outputs, output_num, params, kernel); } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c index c03e942..5825491 100644 --- a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c @@ -910,6 +910,7 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) status |= vsi_nn_kernel_gpu_add_param( node, "uniGemmU8U8MulZptoFp32_8x4", &uniGemmU8U8MulZptoFp32_8x4 ); status |= vsi_nn_kernel_gpu_add_param( node, "input01Scale", &inScaleMul ); + status |= vsi_nn_kernel_gpu_add_param( node, "mulKIn0In1Zp", &mulKIn0In1Zp ); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; diff --git a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c index 317e8a0..460ad87 100644 --- a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c @@ -202,7 +202,7 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) { - int32_t fl = (uint8_t)attr[2]->dfp.fl; + int32_t fl = attr[2]->dfp.fl; if (fl > 0) { output_scale = (float) ((int64_t)1 << fl); diff --git a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c index 30dfc93..11478f5 100644 --- a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c @@ -202,7 +202,7 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) { - int32_t fl = (uint8_t)attr[2]->dfp.fl; + int32_t fl = attr[2]->dfp.fl; if (fl > 0) { output_scale = (float) ((int64_t)1 << fl); diff --git a/src/tim/vx/internal/src/kernel/evis/mod_evis.c b/src/tim/vx/internal/src/kernel/evis/mod_evis.c new file mode 100644 index 0000000..fe7edd7 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/mod_evis.c @@ -0,0 +1,444 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice 
and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +#define MOD_HASH_KEY(_input0_type, _input1_type, _output_type, _image_2d) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d)) + + #define MOD_KERNEL_SOURCE_NAME "mod" + +#define MOD_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ + { MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + CVIVANTE_NAMESPACE("evis.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE), \ + MOD_KERNEL_SOURCE_NAME }, + +#define MOD_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ + { MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + CVIVANTE_NAMESPACE("evis.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE"_2D"), \ + MOD_KERNEL_SOURCE_NAME }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _mod_kernel_map[] = +{ + // Register kernel here + MOD_KERNELS( F16, F16, F16 ) + MOD_KERNELS( F16, F16, I16 ) + MOD_KERNELS( F16, F16, I8 ) + MOD_KERNELS( F16, F16, U8 ) + MOD_KERNELS( I16, I16, I16 ) + MOD_KERNELS( I8, I8, I8 ) + MOD_KERNELS( U8, U8, U8 ) + MOD_KERNELS( I16, I16, F16 ) + MOD_KERNELS( I8, I8, F16 ) + MOD_KERNELS( U8, U8, F16 ) + MOD_KERNELS( BF16, BF16, BF16 ) + + MOD_KERNELS_2D( F16, F16, F16 ) + MOD_KERNELS_2D( F16, F16, I16 ) + MOD_KERNELS_2D( F16, F16, I8 ) + MOD_KERNELS_2D( F16, F16, U8 ) + MOD_KERNELS_2D( I16, I16, I16 ) + MOD_KERNELS_2D( I8, I8, I8 ) + MOD_KERNELS_2D( U8, U8, U8 ) + MOD_KERNELS_2D( I16, I16, F16 ) + MOD_KERNELS_2D( I8, I8, F16 ) + MOD_KERNELS_2D( U8, U8, F16 ) + MOD_KERNELS_2D( BF16, BF16, BF16 ) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _mod_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _MOD_PARAM_NUM _cnt_of_array( _mod_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_mod_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vx_status status = VX_FAILURE; + vx_tensor input0 = (vx_tensor)param[0]; + vx_tensor input1 = (vx_tensor)param[1]; + vx_tensor output = (vx_tensor)param[2]; + vsi_nn_kernel_tensor_attr_t *input0_attr = NULL; + vsi_nn_kernel_tensor_attr_t *input1_attr = NULL; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_size_array_t *output_shape = NULL; + 
vsi_nn_kernel_dtype_e input0_dtype = F16; + int32_t input0_fl = 0; + int32_t input1_fl = 0; + int32_t output_fl = 0; + float inScale0 = 1.0f; + float inScale1 = 1.0f; + float outScale = 1.0f; + float in0Tail = 0; + float in1Tail = 0; + float outZp = 0; + + input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0 ); + CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input1 ); + CHECK_PTR_FAIL_GOTO( input1_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_shape = output_attr->shape; + input0_dtype = input0_attr->dtype; + + gpu_param.dim = output_shape->size < 3 ? 2 : 3; + gpu_param.global_offset[0] = 0; + gpu_param.global_offset[1] = 0; + gpu_param.global_offset[2] = 0; + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = output_shape->size > 2 ? + (output_shape->data[2] + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2] : 1; + + if (input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP) + { + input0_fl = input0_attr->dfp.fl; + if (input0_fl > 0) + { + inScale0 = 1.0f / (float) ((int64_t)1 << input0_fl); + } + else + { + inScale0 = (float)((int64_t)1 << -input0_fl); + } + } + else if (input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + inScale0 = input0_attr->asymm.scale; + in0Tail = -inScale0 * ((float)input0_attr->asymm.zero_point); + } + + if (input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP) + { + input1_fl = input1_attr->dfp.fl; + if (input1_fl > 0) + { + inScale1 = 1.0f / (float) ((int64_t)1 << input1_fl); + } + else + { + inScale1 = (float)((int64_t)1 << -input1_fl); + } + } + else if (input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + inScale1 = input1_attr->asymm.scale; + in1Tail = -inScale1 * ((float)input1_attr->asymm.zero_point); + } + + if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP) + { + output_fl = output_attr->dfp.fl; + if (output_fl > 0) + { + outScale = (float) ((int64_t)1 << output_fl); + } + else + { + outScale = 1.0f / (float)((int64_t)1 << -output_fl); + } + } + else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + outScale = 1.0f / output_attr->asymm.scale; + outZp = (float)(output_attr->asymm.zero_point); + } + + if (BF16 == input0_dtype) + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t 
uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniConvertFstToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniConvertSecToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertFstToFp32_4x4", &uniConvertFstToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertSecToFp32_4x4", &uniConvertSecToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "in_scale0", &inScale0 ); + status |= vsi_nn_kernel_gpu_add_param( node, "in0Tail", &in0Tail ); + status |= vsi_nn_kernel_gpu_add_param( node, "in_scale1", &inScale1 ); + status |= vsi_nn_kernel_gpu_add_param( node, "in1Tail", &in1Tail ); + status |= vsi_nn_kernel_gpu_add_param( node, "out_scale", &outScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "out_zp", &outZp ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (input0_attr) + { + vsi_nn_kernel_tensor_attr_release(&input0_attr); + } + if (input1_attr) + { + vsi_nn_kernel_tensor_attr_release(&input1_attr); + } + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + return status; +} /* _mod_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _mod_kernel_map; + size_t kernel_map_size = _cnt_of_array( _mod_kernel_map ); + vx_param_description_t * param_def = _mod_kernel_param_def; + size_t param_def_size = _cnt_of_array( 
_mod_kernel_param_def ); + vx_kernel_initialize_f initializer = _mod_initializer; + + uint32_t key = 0; + uint32_t i = 0; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = MOD_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d); + + for (i = 0; i < kernel_map_size; i ++) + { + if (kernel_map[i].key == key) + { + break; + } + } + + if (i < kernel_map_size) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_MOD_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + int32_t isfmod = vsi_nn_kernel_param_get_int32(params, "isfmod"); + + if (!vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num )) + { + return NULL; + } + + image_2d = (outputs[0]->attr.dim_num == 2 || outputs[0]->attr.size[2] == 1); + if (vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type) == F16 || + vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type) == F16 || + vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type) == BF16 || + vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type) == BF16) + { + isfmod = 1; + } + status = _query_kernel( kernel, inputs, outputs, image_2d); + if (VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if (node) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _MOD_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &isfmod ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _MOD_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( mod, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/pow_evis.c b/src/tim/vx/internal/src/kernel/evis/pow_evis.c index 0ffd627..b4d4f21 100644 --- a/src/tim/vx/internal/src/kernel/evis/pow_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pow_evis.c @@ -38,69 +38,20 @@ __BEGIN_DECLS -#define VX_KERNEL_NAME_POW_F16F16TOF16 CVIVANTE_NAMESPACE("evis.pow_F16F16toF16") -#define VX_KERNEL_NAME_POW_F16F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toF16_2D") -#define VX_KERNEL_NAME_POW_F16F16TOU8 CVIVANTE_NAMESPACE("evis.pow_F16F16toU8") -#define VX_KERNEL_NAME_POW_F16F16TOU8_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toU8_2D") -#define VX_KERNEL_NAME_POW_F16F16TOI8 CVIVANTE_NAMESPACE("evis.pow_F16F16toI8") -#define VX_KERNEL_NAME_POW_F16F16TOI8_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toI8_2D") -#define VX_KERNEL_NAME_POW_F16F16TOI16 CVIVANTE_NAMESPACE("evis.pow_F16F16toI16") -#define VX_KERNEL_NAME_POW_F16F16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toI16_2D") -#define VX_KERNEL_NAME_POW_F16U8TOF16 CVIVANTE_NAMESPACE("evis.pow_F16U8toF16") -#define VX_KERNEL_NAME_POW_F16U8TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16U8toF16_2D") -#define VX_KERNEL_NAME_POW_F16I8TOF16 CVIVANTE_NAMESPACE("evis.pow_F16I8toF16") -#define VX_KERNEL_NAME_POW_F16I8TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16I8toF16_2D") -#define VX_KERNEL_NAME_POW_F16I16TOF16 CVIVANTE_NAMESPACE("evis.pow_F16I16toF16") -#define VX_KERNEL_NAME_POW_F16I16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16I16toF16_2D") -#define VX_KERNEL_NAME_POW_F16U8TOU8 CVIVANTE_NAMESPACE("evis.pow_F16U8toU8") -#define VX_KERNEL_NAME_POW_F16U8TOU8_2D CVIVANTE_NAMESPACE("evis.pow_F16U8toU8_2D") -#define VX_KERNEL_NAME_POW_F16I8TOI8 CVIVANTE_NAMESPACE("evis.pow_F16I8toI8") -#define VX_KERNEL_NAME_POW_F16I8TOI8_2D CVIVANTE_NAMESPACE("evis.pow_F16I8toI8_2D") -#define VX_KERNEL_NAME_POW_F16I16TOI16 CVIVANTE_NAMESPACE("evis.pow_F16I16toI16") -#define VX_KERNEL_NAME_POW_F16I16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_F16I16toI16_2D") -#define VX_KERNEL_NAME_POW_U8F16TOF16 CVIVANTE_NAMESPACE("evis.pow_U8F16toF16") -#define VX_KERNEL_NAME_POW_U8F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_U8F16toF16_2D") -#define VX_KERNEL_NAME_POW_I8F16TOF16 CVIVANTE_NAMESPACE("evis.pow_I8F16toF16") -#define VX_KERNEL_NAME_POW_I8F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_I8F16toF16_2D") -#define VX_KERNEL_NAME_POW_I16F16TOF16 CVIVANTE_NAMESPACE("evis.pow_I16F16toF16") -#define VX_KERNEL_NAME_POW_I16F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_I16F16toF16_2D") -#define VX_KERNEL_NAME_POW_U8F16TOU8 CVIVANTE_NAMESPACE("evis.pow_U8F16toU8") -#define VX_KERNEL_NAME_POW_U8F16TOU8_2D CVIVANTE_NAMESPACE("evis.pow_U8F16toU8_2D") -#define VX_KERNEL_NAME_POW_I8F16TOI8 CVIVANTE_NAMESPACE("evis.pow_I8F16toI8") -#define VX_KERNEL_NAME_POW_I8F16TOI8_2D CVIVANTE_NAMESPACE("evis.pow_I8F16toI8_2D") -#define VX_KERNEL_NAME_POW_I16F16TOI16 CVIVANTE_NAMESPACE("evis.pow_I16F16toI16") -#define VX_KERNEL_NAME_POW_I16F16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_I16F16toI16_2D") -#define VX_KERNEL_NAME_POW_U8U8TOU8 CVIVANTE_NAMESPACE("evis.pow_U8U8toU8") -#define VX_KERNEL_NAME_POW_U8U8TOU8_2D CVIVANTE_NAMESPACE("evis.pow_U8U8toU8_2D") -#define VX_KERNEL_NAME_POW_I8I8TOI8 CVIVANTE_NAMESPACE("evis.pow_I8I8toI8") -#define VX_KERNEL_NAME_POW_I8I8TOI8_2D 
CVIVANTE_NAMESPACE("evis.pow_I8I8toI8_2D") -#define VX_KERNEL_NAME_POW_I16I16TOI16 CVIVANTE_NAMESPACE("evis.pow_I16I16toI16") -#define VX_KERNEL_NAME_POW_I16I16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_I16I16toI16_2D") -#define VX_KERNEL_NAME_POW_BF16BF16TOBF16 CVIVANTE_NAMESPACE("evis.pow_BF16BF16toBF16") -#define VX_KERNEL_NAME_POW_BF16BF16TOBF16_2D CVIVANTE_NAMESPACE("evis.pow_BF16BF16toBF16_2D") -#define VX_KERNEL_NAME_POW_U8U8TOF16 CVIVANTE_NAMESPACE("evis.pow_U8U8toF16") -#define VX_KERNEL_NAME_POW_U8U8TOF16_2D CVIVANTE_NAMESPACE("evis.pow_U8U8toF16_2D") - -#define KERNEL_SOURCE_1 "pow_fp16", -#define KERNEL_SOURCE_2 "pow_fp16_i8", -#define KERNEL_SOURCE_3 "pow_fp16_i16", -#define KERNEL_SOURCE_4 "pow_u8", -#define KERNEL_SOURCE_5 "pow_i8", -#define KERNEL_SOURCE_6 "pow_i16" - +#define KERNEL_SOURCE "pow", #define HASH_POW_KEY(_input0_type, _input1_type, _output_type, _image_2d) \ ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d)) -#define TENSOR_POW_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ +#define TENSOR_POW_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ { HASH_POW_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ - VX_KERNEL_NAME_POW_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE, \ - SOURCE }, + CVIVANTE_NAMESPACE("evis.pow_"#IN0_TYPE"_"#IN1_TYPE"to"#OUT_TYPE), \ + KERNEL_SOURCE }, -#define TENSOR_POW_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ +#define TENSOR_POW_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ { HASH_POW_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ - VX_KERNEL_NAME_POW_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE##_2D, \ - SOURCE }, + CVIVANTE_NAMESPACE("evis.pow_"#IN0_TYPE"_"#IN1_TYPE"to"#OUT_TYPE"_2D"), \ + KERNEL_SOURCE }, static const struct { uint32_t key; @@ -108,59 +59,59 @@ static const struct { const char* source_name; } pow_map[] = { - TENSOR_POW_KERNELS(F16, F16, F16, KERNEL_SOURCE_1) - TENSOR_POW_KERNELS(F16, F16, U8, KERNEL_SOURCE_1) - TENSOR_POW_KERNELS(F16, U8, F16, KERNEL_SOURCE_1) - TENSOR_POW_KERNELS(F16, U8, U8, KERNEL_SOURCE_1) + TENSOR_POW_KERNELS(F16, F16, F16) + TENSOR_POW_KERNELS(F16, F16, U8) + TENSOR_POW_KERNELS(F16, U8, F16) + TENSOR_POW_KERNELS(F16, U8, U8) - TENSOR_POW_KERNELS(F16, F16, I8, KERNEL_SOURCE_2) - TENSOR_POW_KERNELS(F16, I8, F16, KERNEL_SOURCE_2) - TENSOR_POW_KERNELS(F16, I8, I8, KERNEL_SOURCE_2) + TENSOR_POW_KERNELS(F16, F16, I8) + TENSOR_POW_KERNELS(F16, I8, F16) + TENSOR_POW_KERNELS(F16, I8, I8) - TENSOR_POW_KERNELS(F16, F16, I16, KERNEL_SOURCE_3) - TENSOR_POW_KERNELS(F16, I16, F16, KERNEL_SOURCE_3) - TENSOR_POW_KERNELS(F16, I16, I16, KERNEL_SOURCE_3) + TENSOR_POW_KERNELS(F16, F16, I16) + TENSOR_POW_KERNELS(F16, I16, F16) + TENSOR_POW_KERNELS(F16, I16, I16) - TENSOR_POW_KERNELS(U8, F16, F16, KERNEL_SOURCE_4) - TENSOR_POW_KERNELS(U8, F16, U8, KERNEL_SOURCE_4) - TENSOR_POW_KERNELS(U8, U8, U8, KERNEL_SOURCE_4) - TENSOR_POW_KERNELS(U8, U8, F16, KERNEL_SOURCE_4) + TENSOR_POW_KERNELS(U8, F16, F16) + TENSOR_POW_KERNELS(U8, F16, U8) + TENSOR_POW_KERNELS(U8, U8, U8) + TENSOR_POW_KERNELS(U8, U8, F16) - TENSOR_POW_KERNELS(I8, F16, F16, KERNEL_SOURCE_5) - TENSOR_POW_KERNELS(I8, F16, I8, KERNEL_SOURCE_5) - TENSOR_POW_KERNELS(I8, I8, I8, KERNEL_SOURCE_5) + TENSOR_POW_KERNELS(I8, F16, F16) + TENSOR_POW_KERNELS(I8, F16, I8) + TENSOR_POW_KERNELS(I8, I8, I8) - TENSOR_POW_KERNELS(I16, F16, F16, KERNEL_SOURCE_6) - TENSOR_POW_KERNELS(I16, F16, I16, KERNEL_SOURCE_6) - TENSOR_POW_KERNELS(I16, I16, I16, KERNEL_SOURCE_6) - TENSOR_POW_KERNELS(BF16, BF16, BF16, KERNEL_SOURCE_3) + TENSOR_POW_KERNELS(I16, F16, F16) + TENSOR_POW_KERNELS(I16, F16, I16) + 
TENSOR_POW_KERNELS(I16, I16, I16) + TENSOR_POW_KERNELS(BF16, BF16, BF16) - TENSOR_POW_KERNELS_2D(F16, F16, F16, KERNEL_SOURCE_1) - TENSOR_POW_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_1) - TENSOR_POW_KERNELS_2D(F16, U8, F16, KERNEL_SOURCE_1) - TENSOR_POW_KERNELS_2D(F16, U8, U8, KERNEL_SOURCE_1) + TENSOR_POW_KERNELS_2D(F16, F16, F16) + TENSOR_POW_KERNELS_2D(F16, F16, U8) + TENSOR_POW_KERNELS_2D(F16, U8, F16) + TENSOR_POW_KERNELS_2D(F16, U8, U8) - TENSOR_POW_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_2) - TENSOR_POW_KERNELS_2D(F16, I8, F16, KERNEL_SOURCE_2) - TENSOR_POW_KERNELS_2D(F16, I8, I8, KERNEL_SOURCE_2) + TENSOR_POW_KERNELS_2D(F16, F16, I8) + TENSOR_POW_KERNELS_2D(F16, I8, F16) + TENSOR_POW_KERNELS_2D(F16, I8, I8) - TENSOR_POW_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_3) - TENSOR_POW_KERNELS_2D(F16, I16, F16, KERNEL_SOURCE_3) - TENSOR_POW_KERNELS_2D(F16, I16, I16, KERNEL_SOURCE_3) + TENSOR_POW_KERNELS_2D(F16, F16, I16) + TENSOR_POW_KERNELS_2D(F16, I16, F16) + TENSOR_POW_KERNELS_2D(F16, I16, I16) - TENSOR_POW_KERNELS_2D(U8, F16, F16, KERNEL_SOURCE_4) - TENSOR_POW_KERNELS_2D(U8, F16, U8, KERNEL_SOURCE_4) - TENSOR_POW_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_4) - TENSOR_POW_KERNELS_2D(U8, U8, F16, KERNEL_SOURCE_4) + TENSOR_POW_KERNELS_2D(U8, F16, F16) + TENSOR_POW_KERNELS_2D(U8, F16, U8) + TENSOR_POW_KERNELS_2D(U8, U8, U8) + TENSOR_POW_KERNELS_2D(U8, U8, F16) - TENSOR_POW_KERNELS_2D(I8, F16, F16, KERNEL_SOURCE_5) - TENSOR_POW_KERNELS_2D(I8, F16, I8, KERNEL_SOURCE_5) - TENSOR_POW_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_5) + TENSOR_POW_KERNELS_2D(I8, F16, F16) + TENSOR_POW_KERNELS_2D(I8, F16, I8) + TENSOR_POW_KERNELS_2D(I8, I8, I8) - TENSOR_POW_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_6) - TENSOR_POW_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_6) - TENSOR_POW_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_6) - TENSOR_POW_KERNELS_2D(BF16, BF16, BF16, KERNEL_SOURCE_3) + TENSOR_POW_KERNELS_2D(I16, F16, F16) + TENSOR_POW_KERNELS_2D(I16, F16, I16) + TENSOR_POW_KERNELS_2D(I16, I16, I16) + TENSOR_POW_KERNELS_2D(BF16, BF16, BF16) }; static vx_param_description_t vxPowKernel_param_def[] = @@ -186,24 +137,13 @@ DEF_KERNEL_INITIALIZER(_pow_initializer) {0, 0, 0}, // localWorkSize: local group size in thread {0, 0, 0}}; // globalWorkSize: image size in thread - int8_t in0_fl = 0; - int32_t src0ZP = 0; - float src0Scale = 1.0f; - int8_t in1_fl = 0; - int32_t src1ZP = 0; - float src1Scale = 1.0f; - int8_t out_fl = 0; - float dstZP = 0; - float dstScale = 1.0f; + float input0_scale = 1.0f; + float input1_scale = 1.0f; + float input0_tail = 0; + float input1_tail = 0; + float output_scale = 1.0f; + float output_zp = 0; - int32_t postshift0 = 0; - int32_t postshift1 = 0; - float outScale_fl = 1; - - uint16_t M0 = 0; - uint16_t M1 = 0; - - vsi_size_t zAx = 1; uint32_t pack_key = 0; // dim number ??? 
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; @@ -220,58 +160,59 @@ DEF_KERNEL_INITIALIZER(_pow_initializer) if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) { - in0_fl = (int8_t)attr[0]->dfp.fl; - postshift0 = in0_fl - 0; + int32_t fl = attr[0]->dfp.fl; + if (fl > 0) + { + input0_scale = 1.0f / (float) ((int64_t)1 << fl); + } + else + { + input0_scale = (float)((int64_t)1 << -fl); + } } else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM ) { - src0ZP = attr[0]->asymm.zero_point; - src0Scale = attr[0]->asymm.scale; - - gpu_quantize_multiplier_16bit(src0Scale / 1.0f, &M0, &postshift0); + input0_scale = attr[0]->asymm.scale; + input0_tail = 0 - (float)attr[0]->asymm.zero_point * input0_scale; } if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) { - in1_fl = (int8_t)attr[1]->dfp.fl; - postshift1 = in1_fl - 0; + int32_t fl = attr[1]->dfp.fl; + if (fl > 0) + { + input1_scale = 1.0f / (float) ((int64_t)1 << fl); + } + else + { + input1_scale = (float)((int64_t)1 << -fl); + } } else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM) { - src1ZP = attr[1]->asymm.zero_point; - src1Scale = attr[1]->asymm.scale; - - gpu_quantize_multiplier_16bit(src1Scale / 1.0f, &M1, &postshift1); + input1_scale = attr[1]->asymm.scale; + input1_tail = 0 - (float)attr[1]->asymm.zero_point * input1_scale; } if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) { - out_fl = (int8_t)attr[2]->dfp.fl; - if (out_fl > 0) + int32_t fl = attr[2]->dfp.fl; + if (fl > 0) { - outScale_fl = (vx_float32)((int64_t)1 << out_fl); + output_scale = (float) ((int64_t)1 << fl); } else { - outScale_fl = (1.0f / (vx_float32)((int64_t)1 << -out_fl)); + output_scale = 1.0f / (float)((int64_t)1 << -fl); } } else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM || attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM ) { - dstZP = (float)attr[2]->asymm.zero_point; - dstScale = 1.0f / attr[2]->asymm.scale; - } - - if ( out_shape->size < 3 ) - { - zAx = 1; - } - else - { - zAx = out_shape->data[2]; + output_zp = (float)attr[2]->asymm.zero_point; + output_scale = 1.0f / attr[2]->asymm.scale; } #define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ @@ -287,269 +228,122 @@ DEF_KERNEL_INITIALIZER(_pow_initializer) / shaderParam.global_scale[0], 4); shaderParam.global_size[1] = gpu_align_p2((out_shape->data[1] + shaderParam.global_scale[1] - 1) / shaderParam.global_scale[1], 2); - shaderParam.global_size[2] = gpu_align_p2((zAx + shaderParam.global_scale[2] - 1) - / shaderParam.global_scale[2], 1); + shaderParam.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; status = vsi_nn_kernel_gpu_config( node, &shaderParam ); CHECK_STATUS_FAIL_GOTO(status, OnError); + switch( pack_key ) { - gpu_dp_inst_t uniConvertFstDataToFp32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertSecDataToFp32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00050004, 0x00070006, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertFstDataToFp32_4x4_2 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertSecDataToFp32_4x4_2 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00050004, 0x00070006, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniConvertUint8SubZpToFp32_4x4 = {{ - 0x09090909, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00000000, 0x00010001, 0x00000000, - 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertSecUint8SubZpToFp32_4x4 = {{ - 0x09090909, // TCfg - 0x04040404, // ASelt - 0x00050004, 0x00070006, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00000000, 0x00010001, 0x00000000, - 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniConvertUint8SubZpToFp32_4x4_2 = {{ - 0x09090909, // TCfg - 0x04040404, // ASelt - 0x00010000, 0x00030002, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00000000, 0x00010001, 0x00000000, - 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniConvertSecUint8SubZpToFp32_4x4_2 = {{ - 0x09090909, // TCfg - 0x04040404, // ASelt - 0x00050004, 0x00070006, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00000000, 0x00010001, 0x00000000, - 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant - }, GPU_DP_TYPE_16 }; - - 
gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{ - 0x11111111, // TCfg - 0x11110000, // ASelt - 0x06040200, 0x06040200, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000100, // AccumType, ConstantType, and PostShift - 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant - }, GPU_DP_TYPE_16 }; - - gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ - 0x11111111, // TCfg - 0x01010101, // ASelt - 0x01050004, 0x03070206, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ - 0x11111111, // TCfg - 0x01010101, // ASelt - 0x05050404, 0x07070606, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniExtractOddData_2x8 = {{ - 0x11111111, // TCfg - 0x11110000, // ASelt - 0x07050301, 0x07050301, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ - 0x11111111, // TCfg - 0x11110000, // ASelt - 0x06040200, 0x06040200, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - uint32_t multiplierA = (M0 << 16) | M0; - uint32_t multiplierB = (M1 << 16) | M1; - int32_t i = 8; - - uniConvertUint8SubZpToFp32_4x4.data[7] |= (postshift0 & 0x1F); - uniConvertSecUint8SubZpToFp32_4x4.data[7] |= (postshift0 & 0x1F); - uniConvertUint8SubZpToFp32_4x4_2.data[7] |= (postshift1 & 0x1F); - uniConvertSecUint8SubZpToFp32_4x4_2.data[7] |= (postshift1 & 0x1F); - for ( i = 8; i < 16; i += 2 ) + case _PACK_SELECT_KEY( BF16, BF16, BF16 ): { - uniConvertUint8SubZpToFp32_4x4.data[i] = multiplierA; - uniConvertSecUint8SubZpToFp32_4x4.data[i] = multiplierA; - uniConvertUint8SubZpToFp32_4x4_2.data[i] = multiplierB; - uniConvertSecUint8SubZpToFp32_4x4_2.data[i] = multiplierB; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 
0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", + &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8", + &uniConvBF16toF32_Part1_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8", + &uniExtractOddData_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); } - - if ( attr[0]->dtype == I8 || attr[0]->dtype == I16 ) + break; + default: { - gpu_dp_inst_update_postshfit( &uniConvertFstDataToFp32_4x4, postshift0 ); - gpu_dp_inst_update_postshfit( &uniConvertSecDataToFp32_4x4, postshift0 ); - } + gpu_dp_inst_t uniConvertFstDataToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertSecDataToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtact8Bit_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtactHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; - if ( attr[1]->dtype == I8 || attr[1]->dtype == I16 ) - { - gpu_dp_inst_update_postshfit( &uniConvertFstDataToFp32_4x4_2, postshift1 ); - gpu_dp_inst_update_postshfit( &uniConvertSecDataToFp32_4x4_2, postshift1 ); + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4", + &uniConvertFstDataToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4", + &uniConvertSecDataToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "input0_scale", &input0_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "input1_scale", &input1_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "input0_tail", &input0_tail); + status |= vsi_nn_kernel_gpu_add_param( node, "input1_tail", &input1_tail); + status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp); + if (attr[2]->dtype == F16) + { + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtact8Bit_2x8", + &uniExtactHalf8_2x8); + } + else + { + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtact8Bit_2x8", + &uniExtact8Bit_2x8); + } + CHECK_STATUS_FAIL_GOTO(status, OnError ); } - - switch( pack_key ) - { - case _PACK_SELECT_KEY( F16, F16, I8 ): - case _PACK_SELECT_KEY( F16, I8, F16 ): - case _PACK_SELECT_KEY( F16, I8, I8 ): - case _PACK_SELECT_KEY( F16, F16, I16 ): - 
case _PACK_SELECT_KEY( F16, I16, F16 ): - case _PACK_SELECT_KEY( F16, I16, I16 ): - case _PACK_SELECT_KEY( I8, F16, F16 ): - case _PACK_SELECT_KEY( I8, F16, I8 ): - case _PACK_SELECT_KEY( I8, I8, I8 ): - case _PACK_SELECT_KEY( I16, F16, F16 ): - case _PACK_SELECT_KEY( I16, F16, I16 ): - case _PACK_SELECT_KEY( I16, I16, I16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4", - &uniConvertFstDataToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4", - &uniConvertSecDataToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4_2", - &uniConvertFstDataToFp32_4x4_2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4_2", - &uniConvertSecDataToFp32_4x4_2); - status |= vsi_nn_kernel_gpu_add_param(node, "outScale_fl", &outScale_fl); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( U8, F16, F16 ): - case _PACK_SELECT_KEY( U8, F16, U8 ): - case _PACK_SELECT_KEY( U8, U8, U8 ): - case _PACK_SELECT_KEY( U8, U8, F16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertUint8SubZpToFp32_4x4", - &uniConvertUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecUint8SubZpToFp32_4x4", - &uniConvertSecUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4_2", - &uniConvertFstDataToFp32_4x4_2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4_2", - &uniConvertSecDataToFp32_4x4_2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUint8SubZpToFp32_4x4_2", - &uniConvertUint8SubZpToFp32_4x4_2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecUint8SubZpToFp32_4x4_2", - &uniConvertSecUint8SubZpToFp32_4x4_2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", - &uniConvertHalftoFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP0", &src0ZP); - status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP1", &src1ZP); - status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &dstZP); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( F16, F16, F16 ): - case _PACK_SELECT_KEY( F16, F16, U8 ): - case _PACK_SELECT_KEY( F16, U8, F16 ): - case _PACK_SELECT_KEY( F16, U8, U8 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4", - &uniConvertFstDataToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4", - &uniConvertSecDataToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUint8SubZpToFp32_4x4_2", - &uniConvertUint8SubZpToFp32_4x4_2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecUint8SubZpToFp32_4x4_2", - &uniConvertSecUint8SubZpToFp32_4x4_2); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", - &uniConvertHalfToFp16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP1", &src1ZP); - status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &dstZP); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case _PACK_SELECT_KEY( BF16, BF16, BF16 ): - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", - &uniConvBF16toF32_Part0_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8", - &uniConvBF16toF32_Part1_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, 
"uniExtractOddData_2x8", - &uniExtractOddData_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - default: - break; - } -#undef _PACK_SELECT_KEY - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); - CHECK_STATUS_FAIL_GOTO(status, OnError ); + break; } +#undef _PACK_SELECT_KEY OnError: if ( attr[0] ) @@ -646,7 +440,6 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_POW_PARAM_NUM, inputs, 2, outputs, 1 ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_POW_PARAM_NUM ); - } } return node; @@ -655,4 +448,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( pow, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c index c543f96..498ee45 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c @@ -126,8 +126,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; - dstZP = attr[0]->asymm.zero_point; - outputScale = attr[0]->asymm.scale; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); @@ -152,7 +150,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) } else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - outputScale = 1.0f/outputScale; + outputScale = 1.0f / attr[0]->asymm.scale; + dstZP = attr[0]->asymm.zero_point; } else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) { diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c index 2201205..797c925 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c @@ -128,8 +128,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer) CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); out_shape = attr[0]->shape; - dstZP = (float)attr[0]->asymm.zero_point; - outputScale = attr[0]->asymm.scale; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); @@ -147,7 +145,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer) } else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - outputScale = 1.0f/outputScale; + outputScale = 1.0f / attr[0]->asymm.scale; + dstZP = (float)attr[0]->asymm.zero_point; } else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) { diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c index 23ae619..e92b248 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c @@ -148,8 +148,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; - dstZP = attr[0]->asymm.zero_point; - dstScale = attr[0]->asymm.scale; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); @@ -161,7 +159,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - dstScale = 1.0f / dstScale; + dstScale = 1.0f / attr[0]->asymm.scale; + dstZP = attr[0]->asymm.zero_point; } else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) { diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c 
b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c index 4089e0c..ddfc9b5 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c @@ -35,13 +35,15 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS -#define KERNEL_SOURCE_0 "pre_process_rgb888_planar_0", -#define KERNEL_SOURCE_1 "pre_process_rgb888_planar_1", -#define KERNEL_SOURCE_2 "pre_process_rgb888_planar_2", +#define RGB888_SEP_SOURCE_0 "pre_process_rgb888_planar_sep_0", +#define RGB888_SEP_SOURCE_1 "pre_process_rgb888_planar_sep_1", +#define RGB888_SEP_SOURCE_2 "pre_process_rgb888_planar_sep_2", +#define RGB888_SOURCE_0 "pre_process_rgb888_planar_0", +#define RGB888_SOURCE_1 "pre_process_rgb888_planar_1", +#define RGB888_SOURCE_2 "pre_process_rgb888_planar_2", #define STR(a) #a @@ -53,28 +55,48 @@ typedef enum HALF } _internal_scale_e; // Add kernel hashtable here -#define PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, SCALE_FLAG ) \ - (( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8 ) | (SCALE_FLAG)) +#define PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, SEP, SCALE_FLAG ) \ + (( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8 ) | ( SEP << 4 ) | (SCALE_FLAG)) #define PACK_KERNEL_SCALE_MAP( IN_DTYPE, OUT_DTYPE ) \ - { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, SCALE ), \ - CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ - KERNEL_SOURCE_0 } + { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, SCALE ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + RGB888_SOURCE_0 } + +#define PACK_KERNEL_SEP_SCALE_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, SCALE ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + RGB888_SEP_SOURCE_0 } #define PACK_KERNEL_COPY_MAP( IN_DTYPE, OUT_DTYPE ) \ - { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, COPY ), \ - CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ - KERNEL_SOURCE_1 } + { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, COPY ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + RGB888_SOURCE_1 } + +#define PACK_KERNEL_SEP_COPY_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, COPY ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + RGB888_SEP_SOURCE_1 } #define PACK_KERNEL_4_OVER_3_MAP( IN_DTYPE, OUT_DTYPE ) \ - { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, FOUR_OVER_THREE ), \ - CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_4over3_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ - KERNEL_SOURCE_2 } + { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, FOUR_OVER_THREE ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_4over3_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + RGB888_SOURCE_2 } + +#define PACK_KERNEL_SEP_4_OVER_3_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, FOUR_OVER_THREE ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_4over3_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + RGB888_SEP_SOURCE_2 } #define PACK_KERNEL_HALF_MAP( IN_DTYPE, OUT_DTYPE ) \ - { 
PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, HALF ), \ - CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ - KERNEL_SOURCE_2 } + { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, HALF ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + RGB888_SOURCE_2 } + +#define PACK_KERNEL_SEP_HALF_MAP( IN_DTYPE, OUT_DTYPE ) \ + { PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, HALF ), \ + CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + RGB888_SEP_SOURCE_2 } typedef struct { @@ -98,6 +120,19 @@ static const _kernel_map_type pre_process_rgb888_planar_kernel_map[] = PACK_KERNEL_4_OVER_3_MAP( U8, U8 ), PACK_KERNEL_HALF_MAP( U8, U8 ), + + PACK_KERNEL_SEP_SCALE_MAP( U8, F16 ), + PACK_KERNEL_SEP_SCALE_MAP( U8, I16 ), + PACK_KERNEL_SEP_SCALE_MAP( U8, I8 ), + PACK_KERNEL_SEP_SCALE_MAP( U8, U8 ), + + PACK_KERNEL_SEP_COPY_MAP( U8, F16 ), + PACK_KERNEL_SEP_COPY_MAP( U8, I16 ), + PACK_KERNEL_SEP_COPY_MAP( U8, I8 ), + PACK_KERNEL_SEP_COPY_MAP( U8, U8 ), + + PACK_KERNEL_SEP_4_OVER_3_MAP( U8, U8 ), + PACK_KERNEL_SEP_HALF_MAP( U8, U8 ), }; @@ -105,6 +140,23 @@ static const _kernel_map_type pre_process_rgb888_planar_kernel_map[] = * Kernel params */ static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def ) + +static vx_param_description_t _pre_process_rgb888_planar_sep_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -121,7 +173,7 @@ static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; -#define _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def ) +#define _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ) /* * Kernel initializer @@ -149,9 +201,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; - attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def )) + { + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + } + else + { + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + } CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - status = 
vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &output_scale); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[param_size - 1], &output_scale); CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; @@ -310,9 +369,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; - attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def )) + { + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + } + else + { + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + } CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &output_scale); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[param_size - 1], &output_scale); CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; @@ -406,7 +472,14 @@ DEF_KERNEL_INITIALIZER(_resize_rgb888_planar_initializer) attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def )) + { + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + } + else + { + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + } CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); out_shape = attr[1]->shape; @@ -540,6 +613,7 @@ static vsi_status _query_kernel vsi_bool is_4_over_3 = FALSE; vsi_bool is_half_scale = FALSE; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); + vsi_bool is_rgb888_sep = (vsi_bool)(inputs[1] != NULL); is_4_over_3 = (width * 3 == (int32_t)outputs[0]->attr.size[0] * 4) && (height * 3 == (int32_t)outputs[0]->attr.size[1] * 4); @@ -568,7 +642,7 @@ static vsi_status _query_kernel } } - key = PRE_PROCESS_RGB888_PLANAR_HASH_KEY( input0_dtype, output_dtype, scale_type); + key = PRE_PROCESS_RGB888_PLANAR_HASH_KEY( input0_dtype, output_dtype, is_rgb888_sep, scale_type); for ( i = 0; i < _cnt_of_array(pre_process_rgb888_planar_kernel_map); i ++ ) { @@ -581,8 +655,17 @@ static vsi_status _query_kernel { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_rgb888_planar_kernel_map[i].function_name ); - kernel->info.parameters = _pre_process_rgb888_planar_kernel_param_def; - kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def ); + + if (is_rgb888_sep) + { + kernel->info.parameters = _pre_process_rgb888_planar_sep_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ); + } + else + { + kernel->info.parameters = _pre_process_rgb888_planar_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def ); + } if (enable_copy) { @@ -620,8 +703,9 @@ static vsi_nn_kernel_node_t _setup ) { vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_PRE_PROCESS_RGB888_PLANAR_PARAM_NUM]; + vsi_nn_kernel_node_param_t* node_params = NULL; vsi_nn_kernel_node_t node = NULL; + int32_t param_count = 
_PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM; int32_t width = vsi_nn_kernel_param_get_int32( params, "width" ); int32_t height = vsi_nn_kernel_param_get_int32( params, "height" ); float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); @@ -630,7 +714,10 @@ static vsi_nn_kernel_node_t _setup float scale = vsi_nn_kernel_param_get_float32( params, "scale" ); vsi_bool is_no_range_change = FALSE; - if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + input_num = inputs[1] == NULL ? 1 : input_num; + param_count = inputs[1] == NULL ? _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM : param_count; + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; @@ -648,17 +735,19 @@ static vsi_nn_kernel_node_t _setup status = _query_kernel( inputs, outputs, kernel, params, is_no_range_change, width, height ); if ( VSI_SUCCESS == status) { + node_params = (vsi_nn_kernel_node_param_t *)malloc(sizeof(vsi_nn_kernel_node_param_t) * param_count); node = vsi_nn_kernel_create_node( graph, kernel ); if ( node ) { - uint32_t index = 6; + uint32_t index = inputs[1] == NULL ? 4 : 6; + uint32_t scalar_index = index; int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM, + vsi_nn_kernel_node_pack_io( node_params, param_count, inputs, input_num, outputs, output_num ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); @@ -670,17 +759,21 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale ); /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM ); - vsi_nn_kernel_scalar_release( &node_params[6] ); - vsi_nn_kernel_scalar_release( &node_params[7] ); - vsi_nn_kernel_scalar_release( &node_params[8] ); - vsi_nn_kernel_scalar_release( &node_params[9] ); - vsi_nn_kernel_scalar_release( &node_params[10] ); - vsi_nn_kernel_scalar_release( &node_params[11] ); - vsi_nn_kernel_scalar_release( &node_params[12] ); - vsi_nn_kernel_scalar_release( &node_params[13] ); + status = vsi_nn_kernel_node_pass_param( node, node_params, param_count ); + index = scalar_index; + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); + vsi_nn_kernel_scalar_release( &node_params[index++] ); } } + + vsi_nn_safe_free(node_params); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c index 4181414..5fda281 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c @@ -150,8 +150,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; - outputZP = (float)attr[0]->asymm.zero_point; - outputScale = attr[0]->asymm.scale; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); @@ -176,7 +174,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) } else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - outputScale = 1.0f / outputScale; + outputScale = 1.0f / attr[0]->asymm.scale; + outputZP = (float)attr[0]->asymm.zero_point; } else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) { diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c index 7a5c50c..a51eab1 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c @@ -135,8 +135,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; - dstZP = attr[0]->asymm.zero_point; - dstScale = attr[0]->asymm.scale; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); @@ -151,9 +149,22 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) width = width / 3; } - if (attr[0]->dtype == U8) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - dstScale = 1.0f / dstScale; + dstScale = 1.0f / attr[0]->asymm.scale; + dstZP = attr[0]->asymm.zero_point; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + dstScale = (float)((int64_t)1 << attr[0]->dfp.fl); + } + else + { + dstScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); + } + dstZP = 0; } shaderParam.global_scale[0] = 16; diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c index d96e81d..7c7efc7 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c @@ 
-130,8 +130,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; - dstZP = attr[0]->asymm.zero_point; - dstScale = attr[0]->asymm.scale; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); @@ -141,9 +139,22 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) order1 = 0; } - if (attr[0]->dtype == U8) + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - dstScale = 1.0f / dstScale; + dstScale = 1.0f / attr[0]->asymm.scale; + dstZP = attr[0]->asymm.zero_point; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + dstScale = (float)((int64_t)1 << attr[0]->dfp.fl); + } + else + { + dstScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); + } + dstZP = 0; } shaderParam.global_scale[0] = 16; diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c index 394461f..6896307 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -51,6 +51,7 @@ typedef enum UP_3X_HALF, UP_4X_HALF, UP_8X_HALF, + UP_8X_ALIGN, } _internal_scale_e; #define _RESIZE_BILINEAR_KERNEL_SOURCE(_input_type) "resize_bilinear_"#_input_type @@ -102,6 +103,12 @@ typedef enum "_SAME_3x_upsample_half_pixel_centers"), \ _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) } +#define PACK_KERNEL_MAP_UP_8X_ALIGN( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_ALIGN ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_SAME_8x_upsample_align_corners"), \ + "resize_bilinear_align_corners" } + typedef struct { uint32_t key; @@ -128,6 +135,7 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] = PACK_KERNEL_MAP_UP_3X_HALF(U8, U8), PACK_KERNEL_MAP_UP_4X_HALF(U8, U8), PACK_KERNEL_MAP_UP_8X_HALF(U8, U8), + PACK_KERNEL_MAP_UP_8X_ALIGN(U8, U8), }; @@ -228,11 +236,6 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) uint32_t out_height; float half_pixel_value = 0.0f; vsi_bool is_use_scale_kernel = (vsi_bool)(_RESIZE_BILINEAR_PARAM_NUM == param_size); - vsi_bool is_half_pixel_centers = FALSE; - vsi_bool is_2x_up_kernel = FALSE; - vsi_bool is_3x_up_kernel = FALSE; - vsi_bool is_4x_up_kernel = FALSE; - vsi_bool is_8x_up_kernel = FALSE; input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); @@ -257,20 +260,20 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) if (align_corners && out_width > 1) { - scale_factor[0] = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1); + scale_factor[0] = ((float)(in_width - 1) * 1.0f) / (float)(out_width - 1); } else { - scale_factor[0] = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width; + scale_factor[0] = ((float)in_width * 1.0f) / (float)out_width; } if (align_corners && out_height > 1) { - scale_factor[1] = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1); + scale_factor[1] = ((float)(in_height - 1) * 1.0f) / (float)(out_height - 1); } else { - scale_factor[1] = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height; + scale_factor[1] = ((float)in_height * 1.0f) / (float)out_height; } if (half_pixel_centers) @@ -282,16 +285,6 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) half_pixel_value = 0.0f; } - is_half_pixel_centers = (!align_corners) && 
(half_pixel_centers); - - if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr)) && is_half_pixel_centers) - { - is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height); - is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height); - is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height); - is_8x_up_kernel = (8 * in_width == out_width) && (8 * in_height == out_height); - } - if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) { input_scale = input_attr->asymm.scale; @@ -302,11 +295,11 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) srcFixPointPos = input_attr->dfp.fl; if (srcFixPointPos >= 0) { - input_scale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); + input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos); } else if (srcFixPointPos < 0) { - input_scale = (vx_float32)((int64_t)1 << -srcFixPointPos); + input_scale = (float)((int64_t)1 << -srcFixPointPos); } inputZP = 0; } @@ -326,11 +319,11 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) dstFixPointPos = output_attr->dfp.fl; if (dstFixPointPos >= 0) { - output_scale = (vx_float32) ((int64_t)1 << dstFixPointPos); + output_scale = (float) ((int64_t)1 << dstFixPointPos); } else if (dstFixPointPos < 0) { - output_scale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos); + output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos); } outputZP = 0; } @@ -340,226 +333,11 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) outputZP = 0; } - if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel) - { - gpu_param.global_scale[0] = 16; - gpu_param.global_scale[1] = 1; - } - else if (is_3x_up_kernel) - { - gpu_param.global_scale[0] = 15; - gpu_param.global_scale[1] = 6; - gpu_param.global_scale[2] = 1; - } - else - { - gpu_param.global_scale[0] = 4; - gpu_param.global_scale[1] = 1; - gpu_param.global_scale[2] = 1; - } + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; - if (is_2x_up_kernel) - { - gpu_dp_inst_t uniResize2xUp_0_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect - 0x00000704, // AccumType, ConstantType, and PostShift - 0x09030301, 0x03090103, 0x09030301, 0x03090103, - 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize2xUp_1_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect - 0x00000704, // AccumType, ConstantType, and PostShift - 0x09030301, 0x03090103, 0x09030301, 0x03090103, - 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant - }, GPU_DP_TYPE_16}; - - status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if (is_3x_up_kernel) - { - gpu_dp_inst_t uniResize3xUp_l00_2x8 = {{ - 0x15515515, // TCfg - 0x00000000, // ASelt - 0x21210110, 0x03323202, // ABin - 0x2aa2aa2a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000610, // AccumType, ConstantType, and PostShift - 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555, - 0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l01_2x8 = {{ - 0x05155155, // TCfg - 0x00000000, // ASelt - 0x54044343, 
0x00650554, // ABin - 0x0a2aa2aa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000610, // AccumType, ConstantType, and PostShift - 0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa, - 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l10_4x4 = {{ - 0x55551155, // TCfg - 0x50501050, // ASelt - 0x01011010, 0x21212121, // ABin - 0xaaaa22aa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x0000060f, // AccumType, ConstantType, and PostShift - 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, - 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l11_4x4 = {{ - 0x11555511, // TCfg - 0x10505010, // ASelt - 0x32320202, 0x03033232, // ABin - 0x22aaaa22, // BSelt - 0x00000000, 0x00000000, // BBin - 0x0000060f, // AccumType, ConstantType, and PostShift - 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72, - 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l12_4x4 = {{ - 0x55115555, // TCfg - 0x50105050, // ASelt - 0x43434343, 0x54540404, // ABin - 0xaa22aaaa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x0000060f, // AccumType, ConstantType, and PostShift - 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39, - 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize3xUp_l13_4x4 = {{ - 0x00551155, // TCfg - 0x00501050, // ASelt - 0x05055454, 0x00006565, // ABin - 0x00aa22aa, // BSelt - 0x00000000, 0x00000000, // BBin - 0x0000060f, // AccumType, ConstantType, and PostShift - 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, - 0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - - status = vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if (is_4x_up_kernel) - { - gpu_dp_inst_t uniResize4xUp_l00_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect - 0x00000406, // AccumType, ConstantType, and PostShift - 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, - 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize4xUp_l01_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect - 0x00000406, // AccumType, ConstantType, and PostShift - 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, - 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize4xUp_l10_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect - 0x00000406, // AccumType, ConstantType, and PostShift - 0x23150503, 0x31070701, 0x07310107, 0x15230305, - 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize4xUp_l11_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 
0xad0a4ad0, // BinSelect - 0x00000406, // AccumType, ConstantType, and PostShift - 0x23150503, 0x31070701, 0x07310107, 0x15230305, - 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant - }, GPU_DP_TYPE_16}; - - status = vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if (is_8x_up_kernel) - { - gpu_dp_inst_t uniResize8xUp_l00_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907, - 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l01_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907, - 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l10_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, - 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l11_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, - 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l20_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, - 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l21_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, - 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l30_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01, - 0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize8xUp_l31_4x8 = {{ - 0x55555555, 0x55555555, // TCfg - 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect - 0x00000708, // AccumType, ConstantType, and PostShift - 0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01, - 0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant - }, GPU_DP_TYPE_16}; - - status = vsi_nn_kernel_gpu_add_param( node, 
"uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) + if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) { float dfpScale = input_scale * output_scale; gpu_dp_inst_t uniConvertDFP2FP32_4x4 = {{ @@ -840,7 +618,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) else if (F16 == input_dtype && (U8 == output_dtype || F16 == output_dtype)) { float uint8Scale = 1.0f / output_scale; - float uint8ZP_out = (vx_float32)outputZP; + float uint8ZP_out = (float)outputZP; gpu_dp_inst_t uniExtact8Bit_2x8 = {{ 0x33333333, // TCfg 0x11110000, // ASelt @@ -1045,11 +823,299 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) goto final; } - if (!is_2x_up_kernel && !is_3x_up_kernel && !is_4x_up_kernel&& !is_8x_up_kernel) + status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value); + CHECK_STATUS_FAIL_GOTO(status, final ); + + gpu_param.global_size[0] = gpu_align_p2((out_width + \ + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]; + gpu_param.global_size[2] = depth / gpu_param.global_scale[2]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + return status; +} /* _resize_bilinear_initializer() */ + +DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_size_array_t * out_shape = NULL; + vsi_size_array_t * in_shape = NULL; + vsi_nn_kernel_dtype_e input_dtype = F16; + uint32_t depth = 0; + uint32_t in_width = 0; + uint32_t in_height = 0; + uint32_t out_width = 0; + uint32_t out_height = 0; + vsi_bool is_2x_up_kernel = FALSE; + vsi_bool is_3x_up_kernel = FALSE; + vsi_bool is_4x_up_kernel = FALSE; + vsi_bool is_8x_up_kernel = FALSE; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + in_shape = input_attr->shape; + input_dtype = input_attr->dtype; + + in_width = (uint32_t)(in_shape->data[0]); + in_height = 
(uint32_t)(in_shape->data[1]); + depth = (uint32_t)(in_shape->data[2]); + out_width = (uint32_t)(out_shape->data[0]); + out_height = (uint32_t)(out_shape->data[1]); + + if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr))) { - status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value); + is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height); + is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height); + is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height); + is_8x_up_kernel = (8 * in_width == out_width) && (8 * in_height == out_height); + } + + if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel) + { + gpu_param.global_scale[0] = 16; + gpu_param.global_scale[1] = 1; + } + else if (is_3x_up_kernel) + { + gpu_param.global_scale[0] = 15; + gpu_param.global_scale[1] = 6; + gpu_param.global_scale[2] = 1; + } + else + { + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + if (is_2x_up_kernel) + { + gpu_dp_inst_t uniResize2xUp_0_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect + 0x00000704, // AccumType, ConstantType, and PostShift + 0x09030301, 0x03090103, 0x09030301, 0x03090103, + 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize2xUp_1_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect + 0x00000704, // AccumType, ConstantType, and PostShift + 0x09030301, 0x03090103, 0x09030301, 0x03090103, + 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); CHECK_STATUS_FAIL_GOTO(status, final ); } + else if (is_3x_up_kernel) + { + gpu_dp_inst_t uniResize3xUp_l00_2x8 = {{ + 0x15515515, // TCfg + 0x00000000, // ASelt + 0x21210110, 0x03323202, // ABin + 0x2aa2aa2a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555, + 0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l01_2x8 = {{ + 0x05155155, // TCfg + 0x00000000, // ASelt + 0x54044343, 0x00650554, // ABin + 0x0a2aa2aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa, + 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l10_4x4 = {{ + 0x55551155, // TCfg + 0x50501050, // ASelt + 0x01011010, 0x21212121, // ABin + 0xaaaa22aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, + 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l11_4x4 = {{ + 0x11555511, // TCfg + 0x10505010, // ASelt + 0x32320202, 0x03033232, // ABin + 0x22aaaa22, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72, + 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant + }, 
GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l12_4x4 = {{ + 0x55115555, // TCfg + 0x50105050, // ASelt + 0x43434343, 0x54540404, // ABin + 0xaa22aaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39, + 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l13_4x4 = {{ + 0x00551155, // TCfg + 0x00501050, // ASelt + 0x05055454, 0x00006565, // ABin + 0x00aa22aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, + 0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (is_4x_up_kernel) + { + gpu_dp_inst_t uniResize4xUp_l00_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize4xUp_l01_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize4xUp_l10_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x23150503, 0x31070701, 0x07310107, 0x15230305, + 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize4xUp_l11_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x23150503, 0x31070701, 0x07310107, 0x15230305, + 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (is_8x_up_kernel) + { + gpu_dp_inst_t uniResize8xUp_l00_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x513f3f31, 0x632d4d23, 
0x751b5b15, 0x87096907, + 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l01_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907, + 0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l10_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, + 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l11_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05, + 0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l20_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, + 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l21_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03, + 0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l30_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01, + 0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize8xUp_l31_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect + 0x00000708, // AccumType, ConstantType, and PostShift + 0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01, + 0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + VSILOGE("input or output's format is not support"); + status = VSI_FAILURE; + goto final; + } if (is_2x_up_kernel || is_4x_up_kernel || 
is_8x_up_kernel) { @@ -1071,7 +1137,168 @@ final: if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); return status; -} /* _resize_bilinear_initializer() */ +} /* _bilinear_half_pixel_centers_opt_initializer() */ + +DEF_KERNEL_INITIALIZER(_bilinear_align_corners_opt_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_size_array_t * out_shape = NULL; + vsi_size_array_t * in_shape = NULL; + vsi_nn_kernel_dtype_e input_dtype = F16; + uint32_t depth = 0; + float scale_factor[2] = {0}; + uint32_t in_width = 0; + uint32_t in_height = 0; + uint32_t out_width = 0; + uint32_t out_height = 0; + vsi_bool is_8x_align_corners = FALSE; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + in_shape = input_attr->shape; + input_dtype = input_attr->dtype; + + in_width = (uint32_t)(in_shape->data[0]); + in_height = (uint32_t)(in_shape->data[1]); + depth = (uint32_t)(in_shape->data[2]); + out_width = (uint32_t)(out_shape->data[0]); + out_height = (uint32_t)(out_shape->data[1]); + + if (out_width > 1) + { + scale_factor[0] = ((float)(in_width - 1) * 1.0f) / (float)(out_width - 1); + } + else + { + scale_factor[0] = ((float)in_width * 1.0f) / (float)out_width; + } + + if (out_height > 1) + { + scale_factor[1] = ((float)(in_height - 1) * 1.0f) / (float)(out_height - 1); + } + else + { + scale_factor[1] = ((float)in_height * 1.0f) / (float)out_height; + } + + if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr))) + { + is_8x_align_corners = (scale_factor[0] == scale_factor[1]) && (scale_factor[0] == 0.125f); + } + + if (is_8x_align_corners) + { + gpu_param.global_scale[0] = 2; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + if (is_8x_align_corners) + { + gpu_dp_inst_t uniBilinear_8x_l10_4x8 = {{ + 0x55555505, 0x55555555, // TCfg + 0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x00000838, 0x01070731, 0x02060e2a, 0x03051523, + 0x04041c1c, 0x05032315, 0x06022a0e, 0x07013107 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniBilinear_8x_l11_4x8 = {{ + 0x55555505, 0x55555555, // TCfg + 0x44100221, 0x19444194, 0x44419444, 0x41944419, 0x94441944, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x00000838, 0x01070731, 0x02060e2a, 0x03051523, + 0x04041c1c, 0x05032315, 0x06022a0e, 0x07013107 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniBilinear_8x_l20_4x8 = {{ + 0x55555505, 0x55555555, // TCfg + 0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x00001030, 0x020e062a, 0x040c0c24, 0x060a121e, + 0x08081818, 0x0a061e12, 0x0c04240c, 0x0e022a06 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniBilinear_8x_l21_4x8 = {{ + 0x55555505, 0x55555555, // TCfg + 0x44100221, 0x19444194, 0x44419444, 0x41944419, 
0x94441944, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x00001030, 0x020e062a, 0x040c0c24, 0x060a121e, + 0x08081818, 0x0a061e12, 0x0c04240c, 0x0e022a06 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniBilinear_8x_l30_4x8 = {{ + 0x55555505, 0x55555555, // TCfg + 0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x00001828, 0x03150523, 0x06120a1e, 0x090f0f19, + 0x0c0c1414, 0x0f09190f, 0x12061e0a, 0x15032305 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniBilinear_8x_l31_4x8 = {{ + 0x55555505, 0x55555555, // TCfg + 0x44100221, 0x19444194, 0x44419444, 0x41944419, 0x94441944, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x00001828, 0x03150523, 0x06120a1e, 0x090f0f19, + 0x0c0c1414, 0x0f09190f, 0x12061e0a, 0x15032305 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniBilinear_8x_l40_4x8 = {{ + 0x55555505, 0x55555555, // TCfg + 0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x00002020, 0x041c041c, 0x08180818, 0x0c140c14, + 0x10101010, 0x140c140c, 0x18081808, 0x1c041c04 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniBilinear_8x_l41_4x8 = {{ + 0x55555505, 0x55555555, // TCfg + 0x44100221, 0x19444194, 0x44419444, 0x41944419, 0x94441944, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x00002020, 0x041c041c, 0x08180818, 0x0c140c14, + 0x10101010, 0x140c140c, 0x18081808, 0x1c041c04 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l10_4x8", &uniBilinear_8x_l10_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l11_4x8", &uniBilinear_8x_l11_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l20_4x8", &uniBilinear_8x_l20_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l21_4x8", &uniBilinear_8x_l21_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l30_4x8", &uniBilinear_8x_l30_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l31_4x8", &uniBilinear_8x_l31_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l40_4x8", &uniBilinear_8x_l40_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l41_4x8", &uniBilinear_8x_l41_4x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + VSILOGE("input or output's format is not support"); + status = VSI_FAILURE; + goto final; + } + + gpu_param.global_size[0] = gpu_align_p2((in_width + \ + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (in_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]; + gpu_param.global_size[2] = depth / gpu_param.global_scale[2]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + return status; +} /* _bilinear_align_corners_opt_initializer() */ /* * Query kernel @@ -1098,19 +1325,46 @@ static vsi_status _query_kernel vx_kernel_initialize_f initializer = _resize_bilinear_initializer; uint32_t key; uint32_t i; - vsi_bool is_2x_upsample =(2 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ - && (2 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); - vsi_bool is_3x_upsample =(3 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ - && (3 * inputs[0]->attr.size[1] == 
outputs[0]->attr.size[1]); - vsi_bool is_4x_upsample =(4 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ - && (4 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); - vsi_bool is_8x_upsample =(8 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ - && (8 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); + float width_scale = 0; + float height_scale = 0; + vsi_size_t input_width = inputs[0]->attr.size[0]; + vsi_size_t input_height = inputs[0]->attr.size[1]; + vsi_size_t output_width = outputs[0]->attr.size[0]; + vsi_size_t output_height = outputs[0]->attr.size[1]; + vsi_bool is_2x_upsample =(2 * input_width == output_width) \ + && (2 * input_height == output_height); + vsi_bool is_3x_upsample =(3 * input_width == output_width) \ + && (3 * input_height == output_height); + vsi_bool is_4x_upsample =(4 * input_width == output_width) \ + && (4 * input_height == output_height); + vsi_bool is_8x_upsample =(8 * input_width == output_width) \ + && (8 * input_height == output_height); + vsi_bool is_8x_align_corners = FALSE; _internal_scale_e scale_flag = UP; + if (align_corners && outputs[0]->attr.size[0] > 1) + { + width_scale = ((float)(input_width - 1) * 1.0f) / (float)(output_width - 1); + } + else + { + width_scale = ((float)input_width * 1.0f) / (float)output_width; + } + + if (align_corners && output_height > 1) + { + height_scale = ((float)(input_height - 1) * 1.0f) / (float)(output_height - 1); + } + else + { + height_scale = ((float)input_height * 1.0f) / (float)output_height; + } + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + is_8x_align_corners = (vsi_bool)( width_scale == 0.125f && height_scale == 0.125f && in_dtype == U8 ); + is_2x_upsample &= (in_dtype == U8); is_3x_upsample &= (in_dtype == U8); is_4x_upsample &= (in_dtype == U8); @@ -1121,18 +1375,27 @@ static vsi_status _query_kernel if (is_same_type && (!align_corners) && (half_pixel_centers) && is_2x_upsample) { scale_flag = UP_2X_HALF; + initializer = _bilinear_half_pixel_centers_opt_initializer; } else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_3x_upsample) { scale_flag = UP_3X_HALF; + initializer = _bilinear_half_pixel_centers_opt_initializer; } else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_4x_upsample) { scale_flag = UP_4X_HALF; + initializer = _bilinear_half_pixel_centers_opt_initializer; } else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_8x_upsample) { scale_flag = UP_8X_HALF; + initializer = _bilinear_half_pixel_centers_opt_initializer; + } + else if (is_same_type && (align_corners) && (!half_pixel_centers) && is_8x_align_corners) + { + scale_flag = UP_8X_ALIGN; + initializer = _bilinear_align_corners_opt_initializer; } else if (is_same_type && is_evis2) { @@ -1240,20 +1503,20 @@ static vsi_nn_tensor_t* _create_scale_tensor if (align_corners && width > 1) { - width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(width - 1); + width_scale = ((float)(input_width - 1) * 1.0f) / (float)(width - 1); } else { - width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)width; + width_scale = ((float)input_width * 1.0f) / (float)width; } if (align_corners && height > 1) { - height_scale = ((vx_float32)(input_height - 1) * 1.0f) / (vx_float32)(height - 1); + height_scale = ((float)(input_height - 1) * 1.0f) / (float)(height - 1); } else { - height_scale = ((vx_float32)input_height * 1.0f) / (vx_float32)height; + height_scale 
= ((float)input_height * 1.0f) / (float)height; } @@ -1273,7 +1536,7 @@ static vsi_nn_tensor_t* _create_scale_tensor int32_t h0 = 0; if (half_pixel_centers) { - input_h = ((vx_float32)y + 0.5f) * height_scale - 0.5f; + input_h = ((float)y + 0.5f) * height_scale - 0.5f; } else { @@ -1291,7 +1554,7 @@ static vsi_nn_tensor_t* _create_scale_tensor float br = 0.0f; if (half_pixel_centers) { - input_w = ((vx_float32)x + 0.5f) * width_scale - 0.5f; + input_w = ((float)x + 0.5f) * width_scale - 0.5f; } else { diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c index 6e2e6bd..b8e634e 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c @@ -51,6 +51,15 @@ __BEGIN_DECLS "_"STR(UP_SCALE)"x_upsample_half_pixel_centers"), \ "resize_bilinear_nhwc" } +#define BILINEAR_NHWC_BOUND_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_SCALE ) \ + (( IN_DTYPE ) | ( OUT_DTYPE << 8) | (UP_SCALE << 16)) + +#define BILINEAR_NHWC_BOUND_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, UP_SCALE ) \ + { BILINEAR_NHWC_BOUND_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_SCALE ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_nhwc_bound_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_"STR(UP_SCALE)"x"), \ + "resize_bilinear_nhwc_bound" } + typedef struct { uint32_t key; @@ -65,6 +74,12 @@ static const _kernel_map_type _resize_bilinear_nhwc_kernel_map[] = BILINEAR_NHWC_PACK_KERNEL_MAP_UP_SCALE(U8, U8, 1, 0, 4), }; +static const _kernel_map_type _bilinear_nhwc_bound_kernel_map[] = +{ + BILINEAR_NHWC_BOUND_KERNEL_MAP(U8, U8, 2), + BILINEAR_NHWC_BOUND_KERNEL_MAP(U8, U8, 3), + BILINEAR_NHWC_BOUND_KERNEL_MAP(U8, U8, 4), +}; /* * Kernel params @@ -81,6 +96,14 @@ static vx_param_description_t _resize_bilinear_nhwc_kernel_param_def[] = #define SCALAR_ALIGN_CORNERS (2) #define SCALAR_HALF_PIXEL (3) +static vx_param_description_t _bilinear_nhwc_bound_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _BILINEAR_NHWC_BOUND_PARAM_NUM _cnt_of_array( _bilinear_nhwc_bound_kernel_param_def ) + /* * Kernel initializer */ @@ -382,50 +405,193 @@ final: return status; } /* _resize_bilinear_initializer() */ +DEF_KERNEL_INITIALIZER(_bilinear_nhwc_bound_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_size_array_t * in_shape = NULL; + vsi_size_array_t * out_shape = NULL; + uint32_t x_coord[2] = {0}; + uint32_t in_width; + uint32_t in_height; + uint32_t out_width; + uint32_t out_height; + vsi_bool is_2x_up_kernel = FALSE; + vsi_bool is_3x_up_kernel = FALSE; + vsi_bool is_4x_up_kernel = FALSE; + + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + in_shape = input_attr->shape; + out_shape = output_attr->shape; + + in_width = (uint32_t)(in_shape->data[0]); + in_height = 
(uint32_t)(in_shape->data[1]); + out_width = (uint32_t)(out_shape->data[0]); + out_height = (uint32_t)(out_shape->data[1]); + + is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height); + is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height); + is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height); + + + if (is_2x_up_kernel) + { + gpu_dp_inst_t uniResize_x2_nhwc2_0_4x8 = {{ + 0x55555511, 0x55555555, // TCfg + 0x46104000, 0x3a48829c, 0x4882acca, 0xc4acca3a, 0xbd4e5b50, // BinSelect + 0x00000704, // AccumType, ConstantType, and PostShift + 0x000c0004, 0x09030301, 0x03090103, 0x03090103, + 0x09030301, 0x09030301, 0x03090103, 0x03090103 // Constant + }, GPU_DP_TYPE_16}; + + gpu_param.global_scale[0] = 2; + gpu_param.global_scale[1] = 1; + x_coord[1] = (uint32_t)(out_shape->data[0]) - 2; + x_coord[0] = (x_coord[1] * 2 - 1) >> 2; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x2_nhwc2_0_4x8", &uniResize_x2_nhwc2_0_4x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (is_3x_up_kernel) + { + gpu_dp_inst_t uniResize_x3_nhwc2_l10_4x4 = {{ + 0x05055511, // TCfg + 0x04045010, // ASelt + 0x31310000, 0x00330022, // ABin + 0x0a0aaa22, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x00005556, 0x00002aab, 0x38e41c72, 0x1c720e39, + 0x2aab5556, 0x00000000, 0x2aab5556, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_param.global_scale[0] = 3; + gpu_param.global_scale[1] = 1; + x_coord[1] = (uint32_t)(out_shape->data[0]) - 2; + x_coord[0] = (x_coord[1] - 1) / 6 * 2; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l10_4x4", &uniResize_x3_nhwc2_l10_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (is_4x_up_kernel) + { + gpu_dp_inst_t uniResize_x4_nhwc2_l00_4x8 = {{ + 0x55555511, 0x55555555, // TCfg + 0x46104000, 0x1940409c, 0x48829c46, 0x82acca3a, 0xacca3a48, // BinSelect + 0x00000706, // AccumType, ConstantType, and PostShift + 0x00280018, 0x190f0f09, 0x23051503, 0x23051503, + 0x05230315, 0x05230315, 0x0f19090f, 0x0f19090f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x4_nhwc2_l10_4x8 = {{ + 0x55555511, 0x55555555, // TCfg + 0x46104000, 0x1940409c, 0x48829c46, 0x82acca3a, 0xacca3a48, // BinSelect + 0x00000706, // AccumType, ConstantType, and PostShift + 0x00380008, 0x23150503, 0x31070701, 0x31070701, + 0x07310107, 0x07310107, 0x15230305, 0x15230305 // Constant + }, GPU_DP_TYPE_16}; + + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + x_coord[1] = (uint32_t)(out_shape->data[0]) - 2; + x_coord[0] = ((x_coord[1] - 3) >> 3) * 2; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x4_nhwc2_l00_4x8", &uniResize_x4_nhwc2_l00_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x4_nhwc2_l10_4x8", &uniResize_x4_nhwc2_l10_4x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + VSILOGE("input or output's format is not support"); + status = VSI_FAILURE; + goto final; + } + + gpu_param.global_size[0] = gpu_align_p2((out_height + \ + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = 1; + gpu_param.dim = 2; + + status |= vsi_nn_kernel_gpu_add_param( node, "x_coord", &x_coord); + CHECK_STATUS_FAIL_GOTO(status, final ); + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + + return status; +} 
/* _bilinear_nhwc_bound_initializer() */ + /* * Query kernel */ static vsi_status _query_kernel ( vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs, - int32_t align_corners, - int32_t half_pixel_centers, - uint32_t up_scale + const uint32_t hashkey, + uint32_t kernel_id ) { + vx_kernel_initialize_f initializer = NULL; + vx_param_description_t * param_def; vsi_status status = VSI_FAILURE; - vsi_nn_kernel_dtype_e in_dtype; - vsi_nn_kernel_dtype_e out_dtype; - const _kernel_map_type * kernel_map = _resize_bilinear_nhwc_kernel_map; - size_t kernel_map_size = _cnt_of_array( _resize_bilinear_nhwc_kernel_map ); - vx_param_description_t * param_def = _resize_bilinear_nhwc_kernel_param_def; - size_t param_def_size = _cnt_of_array( _resize_bilinear_nhwc_kernel_param_def ); - vx_kernel_initialize_f initializer = _resize_bilinear_nhwc_initializer; - uint32_t key; - uint32_t i; + const _kernel_map_type* kernel_map; + size_t kernel_map_size; + size_t param_size; + uint32_t i = 0; - in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); - out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - - in_dtype = in_dtype == I8 ? U8 : in_dtype; - out_dtype = out_dtype == I8 ? U8 : out_dtype; - - key = RESIZE_BILINEAR_NHWC_HASH_KEY( in_dtype, out_dtype, half_pixel_centers, align_corners, up_scale ); - for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + switch( kernel_id ) { - if ( kernel_map[i].key == key ) + case 0: + initializer = _resize_bilinear_nhwc_initializer; + kernel_map = _resize_bilinear_nhwc_kernel_map; + kernel_map_size = _cnt_of_array( _resize_bilinear_nhwc_kernel_map ); + param_def = _resize_bilinear_nhwc_kernel_param_def; + param_size = _RESIZE_BILINEAR_NHWC_PARAM_NUM; + break; + case 1: + initializer = _bilinear_nhwc_bound_initializer; + kernel_map = _bilinear_nhwc_bound_kernel_map; + kernel_map_size = _cnt_of_array( _bilinear_nhwc_bound_kernel_map ); + param_def = _bilinear_nhwc_bound_kernel_param_def; + param_size = _BILINEAR_NHWC_BOUND_PARAM_NUM; + break; + default: + VSI_ASSERT( FALSE ); + return VSI_FAILURE; + } + + for( i = 0; i < kernel_map_size; i ++ ) + { + if( kernel_map[i].key == hashkey ) { break; } } - - if ( i < kernel_map_size ) + if( i < kernel_map_size ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); kernel->info.parameters = param_def; - kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.numParams = (uint32_t)param_size; kernel->info.initialize = initializer; // Register code source vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, @@ -453,7 +619,8 @@ static vsi_nn_kernel_node_t _setup ) { vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_RESIZE_BILINEAR_NHWC_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_param_t node0_params[_RESIZE_BILINEAR_NHWC_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_param_t node1_params[_BILINEAR_NHWC_BOUND_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); @@ -463,8 +630,14 @@ static vsi_nn_kernel_node_t _setup float scale_y = (float)outputs[0]->attr.size[2] / (float)inputs[0]->attr.size[2]; float up_scale = scale_x == scale_y ? 
scale_x : 0; uint32_t rank = inputs[0]->attr.dim_num; - vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_nn_kernel_t * ikernels[2] = { NULL }; + uint32_t hashkeys[2] = {0}; + uint32_t i = 0; + vsi_nn_tensor_attr_t attr; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; if (!is_same_type || depth != 2 || rank < 3 || (up_scale != 2.0f && up_scale != 3.0f && up_scale != 4.0f)) @@ -472,8 +645,24 @@ static vsi_nn_kernel_node_t _setup return NULL; } - status = _query_kernel( kernel, inputs, outputs, - align_corners, half_pixel_centers, (uint32_t)up_scale); + ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + // Assign unique_id + ikernels[0]->unique_id = kernel->unique_id; + ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + // Assign unique_id + ikernels[1]->unique_id = kernel->unique_id; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + hashkeys[0] = RESIZE_BILINEAR_NHWC_HASH_KEY( in_dtype, out_dtype, half_pixel_centers, + align_corners, (vsi_size_t)up_scale ); + hashkeys[1] = BILINEAR_NHWC_BOUND_HASH_KEY( in_dtype, out_dtype, (vsi_size_t)up_scale ); + + status = _query_kernel( ikernels[0], hashkeys[0], 0); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = _query_kernel( kernel, hashkeys[1], 1); + CHECK_STATUS_FAIL_GOTO(status, final ); shapes[0][0] = depth * inputs[0]->attr.size[1]; shapes[0][1] = inputs[0]->attr.size[2]; @@ -491,26 +680,41 @@ static vsi_nn_kernel_node_t _setup reshape_tensors[1] = vsi_nn_reshape_tensor( graph, outputs[0], shapes[1], rank ); - if ( VSI_SUCCESS == status) - { - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM, - reshape_tensors, input_num, &reshape_tensors[1], output_num ); - node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); - node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); + // resize bilinear + node = vsi_nn_kernel_create_node( graph, ikernels[0] ); + VSI_ASSERT( node != NULL ); + vsi_nn_kernel_node_pack_io( node0_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[1], output_num ); + node0_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); + node0_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); + status = vsi_nn_kernel_node_pass_param( node, node0_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node0_params[SCALAR_ALIGN_CORNERS] ); + vsi_nn_kernel_scalar_release( &node0_params[SCALAR_HALF_PIXEL] ); + vsi_nn_kernel_node_release( &node ); - /* Pass parameters to node. 
*/ - status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + // update bound for output tensor + memcpy( &attr, &(reshape_tensors[1]->attr), sizeof(vsi_nn_tensor_attr_t) ); + attr.size[0] = 1; + attr.size[1] = 1; + attr.dim_num = 2; + reshape_tensors[2] = vsi_nn_CreateTensor( graph, &attr ); + node = vsi_nn_kernel_create_node( graph, kernel ); + VSI_ASSERT( node != NULL ); + vsi_nn_kernel_node_pack_io( node1_params, _BILINEAR_NHWC_BOUND_PARAM_NUM, + reshape_tensors, 2, &reshape_tensors[2], 1 ); + status = vsi_nn_kernel_node_pass_param( node, node1_params, _BILINEAR_NHWC_BOUND_PARAM_NUM ); + +final: + for( i = 0; i < 2; i ++ ) + { + if( ikernels[i] ) + { + vsi_nn_kernel_release( &ikernels[i] ); } } - vsi_safe_release_tensor(reshape_tensors[0]); vsi_safe_release_tensor(reshape_tensors[1]); + vsi_safe_release_tensor(reshape_tensors[2]); return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c index 2e0cac5..2ccc607 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c @@ -118,7 +118,7 @@ static vsi_status get_scatter_nd_tensor_reshape_size return status; } -#define VSI_NN_MAX_IMAGE_WIDTH (65536) +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; for(i = 0; i < dims_num; ++i) diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c index c277ba5..957a666 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c @@ -207,7 +207,7 @@ static vsi_status get_scatter_nd_update_tensor_reshape_size return status; } -#define VSI_NN_MAX_IMAGE_WIDTH (65536) +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH newDim[0] = 0; for(i = 0; i < dims_num; ++i) diff --git a/src/tim/vx/internal/src/kernel/evis/select_evis.c b/src/tim/vx/internal/src/kernel/evis/select_evis.c index f5571ab..897f106 100644 --- a/src/tim/vx/internal/src/kernel/evis/select_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/select_evis.c @@ -75,10 +75,24 @@ static const _kernel_map_type _select_kernel_map[] = PACK_KERNEL_MAP(I8, U8, U8, U8), PACK_KERNEL_MAP(I8, I16, I16, I16), PACK_KERNEL_MAP(I8, F16, F16, F16), + PACK_KERNEL_MAP(I8, F16, U8, F16), + PACK_KERNEL_MAP(I8, U8, F16, F16), + PACK_KERNEL_MAP(I8, F16, I8, F16), + PACK_KERNEL_MAP(I8, I8, F16, F16), + PACK_KERNEL_MAP(I8, F16, I16, F16), + PACK_KERNEL_MAP(I8, I16, F16, F16), + PACK_KERNEL_MAP(I8, F16, F16, U8), PACK_KERNEL_MAP_2D(I8, I8, I8, I8), PACK_KERNEL_MAP_2D(I8, U8, U8, U8), PACK_KERNEL_MAP_2D(I8, I16, I16, I16), PACK_KERNEL_MAP_2D(I8, F16, F16, F16), + PACK_KERNEL_MAP_2D(I8, U8, F16, F16), + PACK_KERNEL_MAP_2D(I8, F16, U8, F16), + PACK_KERNEL_MAP_2D(I8, F16, I8, F16), + PACK_KERNEL_MAP_2D(I8, I8, F16, F16), + PACK_KERNEL_MAP_2D(I8, F16, I16, F16), + PACK_KERNEL_MAP_2D(I8, I16, F16, F16), + PACK_KERNEL_MAP_2D(I8, F16, F16, U8), }; /* @@ -142,7 +156,7 @@ DEF_KERNEL_INITIALIZER(_select_initializer) output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); - if( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + if ( input0_attr->quant == 
VSI_NN_KERNEL_QUANT_DFP ) { input0_fl = input0_attr->dfp.fl; if (input0_fl > 0) @@ -154,13 +168,13 @@ DEF_KERNEL_INITIALIZER(_select_initializer) input0Scale = (float)((int64_t)1 << -input0_fl); } } - else if( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + else if ( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) { input0Scale = input0_attr->asymm.scale; input0Zp = input0_attr->asymm.zero_point; } - if( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) { input1_fl = input1_attr->dfp.fl; if (input1_fl > 0) @@ -172,13 +186,13 @@ DEF_KERNEL_INITIALIZER(_select_initializer) input1Scale = (float)((int64_t)1 << -input1_fl); } } - else if( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + else if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) { input1Scale = input1_attr->asymm.scale; input1Zp = input1_attr->asymm.zero_point; } - if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + if ( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) { output_fl = output_attr->dfp.fl; if (output_fl > 0) @@ -190,7 +204,7 @@ DEF_KERNEL_INITIALIZER(_select_initializer) outputScale = (float)((int64_t)1 << -output_fl); } } - else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + else if ( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) { outputScale = output_attr->asymm.scale; outputZP = output_attr->asymm.zero_point; @@ -203,13 +217,10 @@ DEF_KERNEL_INITIALIZER(_select_initializer) output_shape = output_attr->shape; gpu_param.dim = output_shape->size < 3 ? 2 : 3; - gpu_param.global_offset[0] = 0; - gpu_param.global_offset[1] = 0; - gpu_param.global_offset[2] = 0; + gpu_param.global_scale[0] = 8; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; - gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1) @@ -218,83 +229,8 @@ DEF_KERNEL_INITIALIZER(_select_initializer) (output_shape->data[2] + gpu_param.global_scale[2] - 1) / gpu_param.global_scale[2] : 1; - switch( pack_key ) { - case _PACK_SELECT_KEY( I8, I8, I8 ): - case _PACK_SELECT_KEY( I16, I16, I16 ): - { - gpu_dp_inst_t uniConvConditiontoDst_2x8 = {{ - 0x11111111, // TCfg - 0x00000000, // ASelt - 0x03020100, 0x07060504, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvIntIn0toDst_2x8 = {{ - 0x11111111, // TCfg - 0x00000000, // ASelt - 0x03020100, 0x07060504, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniConvIntIn1toDst_2x8 = {{ - 0x11111111, // TCfg - 0x00000000, // ASelt - 0x03020100, 0x07060504, // ABin - 0x22222222, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000001, 0x00000001, 0x00000001, - 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant - }, GPU_DP_TYPE_16 }; - - if (input0_fl >= output_fl) - { - uint8_t postshift = (uint8_t)gpu_min(input0_fl - output_fl, MAX_POST_SHIFT_BITS); - uniConvIntIn0toDst_2x8.data[7] = uniConvIntIn0toDst_2x8.data[7] | (postshift & 0x1F); - } - else - { - uint32_t 
idx = 0; - uint32_t multiplier = gpu_min((int64_t)1 << (output_fl - input0_fl), MAX_MULTIPLIER_NUM); - for (idx = 8; idx < 16; idx ++) - { - uniConvIntIn0toDst_2x8.data[idx] = (uint32_t)(multiplier << 16) | (multiplier & 0xffff); - } - } - - - if (input1_fl >= output_fl) - { - uint8_t postshift = (uint8_t)gpu_min(input1_fl - output_fl, MAX_POST_SHIFT_BITS); - uniConvIntIn1toDst_2x8.data[7] = uniConvIntIn1toDst_2x8.data[7] | (postshift & 0x1F); - } - else - { - uint32_t idx = 0; - uint32_t multiplier = gpu_min((int64_t)1 << (output_fl - input1_fl), MAX_MULTIPLIER_NUM); - for (idx = 8; idx < 16; idx ++) - { - uniConvIntIn1toDst_2x8.data[idx] = (uint32_t)(multiplier << 16) | (multiplier & 0xffff); - } - } - - status = vsi_nn_kernel_gpu_add_param( node, - "uniConvIntIn0toDst_2x8", &uniConvIntIn0toDst_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniConvIntIn1toDst_2x8", &uniConvIntIn1toDst_2x8 ); - status |= vsi_nn_kernel_gpu_add_param( node, - "uniConvConditiontoDst_2x8", &uniConvConditiontoDst_2x8 ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - break; case _PACK_SELECT_KEY( F16, F16, F16 ): { gpu_dp_inst_t uniConvConditiontoDst_2x8 = {{ @@ -312,61 +248,66 @@ DEF_KERNEL_INITIALIZER(_select_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); } break; + case _PACK_SELECT_KEY( I8, I8, I8 ): + case _PACK_SELECT_KEY( I16, I16, I16 ): case _PACK_SELECT_KEY( U8, U8, U8 ): + case _PACK_SELECT_KEY( I8, F16, F16 ): + case _PACK_SELECT_KEY( U8, F16, F16 ): + case _PACK_SELECT_KEY( I16, F16, F16 ): + case _PACK_SELECT_KEY( F16, U8, F16 ): + case _PACK_SELECT_KEY( F16, I8, F16 ): + case _PACK_SELECT_KEY( F16, I16, F16 ): + case _PACK_SELECT_KEY( F16, F16, U8 ): { - uint32_t idx = 0; - gpu_dp_inst_t uniU8SubZP_MulM_PStoF16In0_2x8 = {{ - 0x99999999, // TCfg - 0x44444444, // ASelt + uint32_t multAndoutZP0[2] = {0}; + uint32_t multAndoutZP1[2] = {0}; + gpu_dp_inst_t uniConvConditiontoDst_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt 0x03020100, 0x07060504, // ABin - 0xaaaaaaaa, // BSelt + 0x22222222, // BSelt 0x00000000, 0x00000000, // BBin 0x00000600, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, - 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniU8SubZP_MulM_PStoF16In1_2x8 = {{ - 0x99999999, // TCfg + gpu_dp_inst_t uniU8MulAndPostShift0_Lo_2x8 = {{ + 0xdddddddd, // TCfg 0x44444444, // ASelt - 0x03020100, 0x07060504, // ABin - 0xaaaaaaaa, // BSelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000600, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, - 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - gpu_dp_inst_t uniU8AddZP_2x8 = {{ - 0x55555555, // TCfg + gpu_dp_inst_t uniU8MulAndPostShift1_Lo_2x8 = {{ + 0xdddddddd, // TCfg 0x44444444, // ASelt - 0x03020100, 0x07060504, // ABin - 0xaaaaaaaa, // BSelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, - 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + 0x00002600, // AccumType, ConstantType, and 
PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; - uniU8SubZP_MulM_PStoF16In0_2x8.data[7] |= (in0_postShift & 0x1F); - uniU8SubZP_MulM_PStoF16In1_2x8.data[7] |= (in1_postShift & 0x1F); + multAndoutZP0[0] = (uint32_t)(in0_M0); + multAndoutZP0[1] = (uint32_t)((outputZP << in0_postShift) - input0Zp * in0_M0); + multAndoutZP1[0] = (uint32_t)(in1_M0); + multAndoutZP1[1] = (uint32_t)((outputZP << in1_postShift) - input1Zp * in1_M0); - for (idx = 8; idx < 16; idx ++) - { - uniU8SubZP_MulM_PStoF16In0_2x8.data[idx] = (vx_uint32)(in0_M0 << 16) | in0_M0; - uniU8SubZP_MulM_PStoF16In1_2x8.data[idx] = (vx_uint32)(in1_M0 << 16) | in1_M0; - } + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift0_Lo_2x8, in0_postShift ); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift1_Lo_2x8, in1_postShift ); - status = vsi_nn_kernel_gpu_add_param( node, - "uniU8SubZP_MulM_PStoF16In0_2x8", &uniU8SubZP_MulM_PStoF16In0_2x8 ); + status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 ); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8SubZP_MulM_PStoF16In1_2x8", &uniU8SubZP_MulM_PStoF16In1_2x8 ); + "uniConvConditiontoDst_2x8", &uniConvConditiontoDst_2x8 ); status |= vsi_nn_kernel_gpu_add_param( node, - "uniU8AddZP_2x8", &uniU8AddZP_2x8 ); + "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift0_Lo_2x8 ); status |= vsi_nn_kernel_gpu_add_param( node, - "input0Zp", &input0Zp ); - status |= vsi_nn_kernel_gpu_add_param( node, - "input1Zp", &input1Zp ); - status |= vsi_nn_kernel_gpu_add_param( node, - "outputZP", &outputZP ); + "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift1_Lo_2x8 ); CHECK_STATUS_FAIL_GOTO(status, final ); } break; @@ -501,4 +442,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( select, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/slice_evis.c b/src/tim/vx/internal/src/kernel/evis/slice_evis.c index b9f570b..8839470 100644 --- a/src/tim/vx/internal/src/kernel/evis/slice_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/slice_evis.c @@ -39,7 +39,6 @@ __BEGIN_DECLS -#define _SLICE_KERNEL_SOURCE "slice" #define _SLICE_KERNEL_NAME CVIVANTE_NAMESPACE("evis.slice") // Add kernel hashtable here @@ -50,30 +49,30 @@ __BEGIN_DECLS #define SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE , _IMAGE_2D, _SAMEFL) \ (( IN1_DTYPE << 18 ) | ( IN0_DTYPE << 10 ) | ( OUT_DTYPE << 2 ) | (_IMAGE_2D << 1) | (_SAMEFL)) -#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +#define PACK_KERNEL_MAP_3D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ { SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 0 ), \ - SLICE_SH_KERNEL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + SLICE_SH_KERNEL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" } #define SLICE_SH_KERNEL_2D_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_2D") -#define PACK_KERNEL_MAP_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +#define PACK_KERNEL_MAP_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ { SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 0 ), \ - SLICE_SH_KERNEL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + SLICE_SH_KERNEL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" } #define SLICE_SH_KERNEL_SAMEFL_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_SAMEFL") -#define 
PACK_KERNEL_MAP_SAMEFL( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +#define PACK_KERNEL_MAP_SAMEFL( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ { SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 1 ), \ - SLICE_SH_KERNEL_SAMEFL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + SLICE_SH_KERNEL_SAMEFL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" } #define SLICE_SH_KERNEL_SAMEFL_2D_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_SAMEFL_2D") -#define PACK_KERNEL_MAP_SAMEFL_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +#define PACK_KERNEL_MAP_SAMEFL_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ { SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 1 ), \ - SLICE_SH_KERNEL_SAMEFL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + SLICE_SH_KERNEL_SAMEFL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" } typedef struct { @@ -85,21 +84,33 @@ __BEGIN_DECLS static const _kernel_map_type _slice_kernel_map[] = { // Register kernel here - PACK_KERNEL_MAP( F16, I32, F16, _SLICE_KERNEL_SOURCE ), - PACK_KERNEL_MAP( I16, I32, I16, _SLICE_KERNEL_SOURCE ), - PACK_KERNEL_MAP( U8, I32, U8, _SLICE_KERNEL_SOURCE ), - PACK_KERNEL_MAP( I8, I32, I8, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_3D( F16, I32, F16 ), + PACK_KERNEL_MAP_3D( F16, I32, I8 ), + PACK_KERNEL_MAP_3D( F16, I32, U8 ), + PACK_KERNEL_MAP_3D( F16, I32, I16 ), + PACK_KERNEL_MAP_3D( I8, I32, F16 ), + PACK_KERNEL_MAP_3D( U8, I32, F16 ), + PACK_KERNEL_MAP_3D( I16, I32, F16 ), + PACK_KERNEL_MAP_3D( I16, I32, I16 ), + PACK_KERNEL_MAP_3D( U8, I32, U8 ), + PACK_KERNEL_MAP_3D( I8, I32, I8 ), - PACK_KERNEL_MAP_2D( F16, I32, F16, _SLICE_KERNEL_SOURCE ), - PACK_KERNEL_MAP_2D( I16, I32, I16, _SLICE_KERNEL_SOURCE ), - PACK_KERNEL_MAP_2D( U8, I32, U8, _SLICE_KERNEL_SOURCE ), - PACK_KERNEL_MAP_2D( I8, I32, I8, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( F16, I32, F16 ), + PACK_KERNEL_MAP_2D( I16, I32, I16 ), + PACK_KERNEL_MAP_2D( F16, I32, I8 ), + PACK_KERNEL_MAP_2D( F16, I32, U8 ), + PACK_KERNEL_MAP_2D( F16, I32, I16 ), + PACK_KERNEL_MAP_2D( I8, I32, F16 ), + PACK_KERNEL_MAP_2D( U8, I32, F16 ), + PACK_KERNEL_MAP_2D( I16, I32, F16 ), + PACK_KERNEL_MAP_2D( U8, I32, U8 ), + PACK_KERNEL_MAP_2D( I8, I32, I8 ), - PACK_KERNEL_MAP_SAMEFL( I16, I32, I16, _SLICE_KERNEL_SOURCE ), - PACK_KERNEL_MAP_SAMEFL( U8, I32, U8, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_SAMEFL( I16, I32, I16 ), + PACK_KERNEL_MAP_SAMEFL( U8, I32, U8 ), - PACK_KERNEL_MAP_SAMEFL_2D( I16, I32, I16, _SLICE_KERNEL_SOURCE ), - PACK_KERNEL_MAP_SAMEFL_2D( U8, I32, U8, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_SAMEFL_2D( I16, I32, I16 ), + PACK_KERNEL_MAP_SAMEFL_2D( U8, I32, U8 ), }; #define _INPUT_NUM (2) @@ -201,18 +212,16 @@ DEF_KERNEL_INITIALIZER(_slice_initializer) scaleOut = output_attr->asymm.scale; } - if ((F16 == input_dtype) - || (I16 == input_dtype) - || (BF16 == input_dtype) - ) + if ((I8 == input_dtype && input_dtype == output_dtype ) || + (U8 == input_dtype && input_dtype == output_dtype ) ) { - gpu_param.global_scale[0] = 8; + gpu_param.global_scale[0] = 16; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; } else { - gpu_param.global_scale[0] = 16; + gpu_param.global_scale[0] = 8; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; } diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index fea09ff..e9a9272 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -1416,31 +1416,42 @@ 
vsi_nn_kernel_tensor_attr_t * vsi_nn_kernel_tensor_attr_create switch( attr->quant ) { case VSI_NN_KERNEL_QUANT_DFP: - { + { int8_t fl = 0; status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_FIXED_POINT_POS, &fl, sizeof(int8_t)); CHECK_STATUS( status ); attr->dfp.fl = (int32_t)fl; + if (fl >= 0) { + attr->scale = 1.0f / ((float)((int64_t)1 << fl)); + } else { + attr->scale = (float)((int64_t)1 << -fl); } - break; + } break; case VSI_NN_KERNEL_QUANT_ASYMM: - { - status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_ZERO_POINT, - &(attr->asymm.zero_point), sizeof(int32_t)); - CHECK_STATUS( status ); - status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_SCALE, - &(attr->asymm.scale), sizeof(float)); - CHECK_STATUS( status ); + { + status = vxQueryTensor((vx_tensor)tensor, + VX_TENSOR_ZERO_POINT, + &(attr->asymm.zero_point), + sizeof(int32_t)); + CHECK_STATUS(status); + status = vxQueryTensor((vx_tensor)tensor, + VX_TENSOR_SCALE, + &(attr->asymm.scale), + sizeof(float)); + CHECK_STATUS(status); // Reset scale to 1e-8 - if( (attr->asymm.scale - 0.f) < 1e-8 ) - { + if ((attr->asymm.scale - 0.f) < 1e-8) + { attr->asymm.scale = (float)1e-8; attr->asymm.zero_point = 0; - } } - break; + attr->scale = attr->asymm.scale; + attr->zero_point = attr->asymm.zero_point; + } + break; default: + attr->scale = 1.0f; break; } return attr; diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c index 80d56d0..dd32c01 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c @@ -189,6 +189,16 @@ static float celu_eval(float x, vsi_nn_kernel_lut_params *lut_param) return positive + negative; } +static float rcp_eval(float x) +{ + return 1.0f / x; +} + +static float softsign_eval(float x) +{ + return x / (1 + vsi_abs(x)); +} + static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *lut_param) { float result = 0; @@ -245,6 +255,12 @@ static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params * case VSI_NN_KERNEL_LUT_CELU: result = celu_eval(data, lut_param); break; + case VSI_NN_KERNEL_LUT_RCP: + result = rcp_eval(data); + break; + case VSI_NN_KERNEL_LUT_SOFTSIGN: + result = softsign_eval(data); + break; default: VSILOGE( "unsupported activation function:%d", lut_param->act_type ); break; diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c index bdb2240..d27a5f6 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c @@ -133,5 +133,9 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(gelu) REGISTER_VX_FIRST_KERNEL_SELECTOR(hard_gelu) REGISTER_VX_FIRST_KERNEL_SELECTOR(matrixmul) REGISTER_VX_FIRST_KERNEL_SELECTOR(celu) +REGISTER_VX_FIRST_KERNEL_SELECTOR(rcp) +REGISTER_VX_FIRST_KERNEL_SELECTOR(softsign) +REGISTER_VX_FIRST_KERNEL_SELECTOR(resize_bilinear) +REGISTER_VX_FIRST_KERNEL_SELECTOR(resize_nearest) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c index 0ab544b..c6edaaa 100644 --- a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c @@ -146,6 +146,8 @@ REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( erf, VSI_NN_KERNEL_LUT_ERF ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( relu_keras, VSI_NN_KERNEL_LUT_RELU_KERAS ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( clip, 
VSI_NN_KERNEL_LUT_CLIP ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( celu, VSI_NN_KERNEL_LUT_CELU ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( rcp, VSI_NN_KERNEL_LUT_RCP ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( softsign, VSI_NN_KERNEL_LUT_SOFTSIGN ) #undef REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/resize_vx.c b/src/tim/vx/internal/src/kernel/vx/resize_vx.c new file mode 100644 index 0000000..3b2b167 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/resize_vx.c @@ -0,0 +1,152 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +#define REGISTER_SOFTMAX_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vx_node node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + int32_t type = vsi_nn_kernel_param_get_int32( params, "type" ); + +#ifdef VX_SCALE_EXTRA_PARAMETER_SUPPORT + vx_nn_scale_params_ext_t param; + param.align_corners = align_corners; + param.half_pixel_centers = half_pixel_centers; + switch (type) + { + case VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR: + param.base.type = VX_INTERPOLATION_NEAREST_NEIGHBOR; + break; + case 
VSI_NN_INTERPOLATION_BILINEAR: + param.base.type = VX_INTERPOLATION_BILINEAR; + break; + case VSI_NN_INTERPOLATION_AREA: + param.base.type = VX_INTERPOLATION_AREA; + break; + default: + param.base.type = VX_INTERPOLATION_NEAREST_NEIGHBOR; + } + node = vxTensorScaleNode( graph->g, + inputs[0]->t, + (vx_nn_scale_params)(¶m), + sizeof(vx_nn_scale_params_ext_t), + outputs[0]->t ); +#else + vx_nn_scale_params_t param; + if (align_corners || half_pixel_centers) + { + return NULL; + } + switch (type) + { + case VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR: + param.type = VX_INTERPOLATION_NEAREST_NEIGHBOR; + break; + case VSI_NN_INTERPOLATION_BILINEAR: + param.type = VX_INTERPOLATION_BILINEAR; + break; + case VSI_NN_INTERPOLATION_AREA: + param.type = VX_INTERPOLATION_AREA; + break; + default: + param.type = VX_INTERPOLATION_NEAREST_NEIGHBOR; + break; + } + + node = vxTensorScaleNode( graph->g, + inputs[0]->t, + ¶m, + sizeof(param), + outputs[0]->t ); +#endif + if ( NULL == node ) + { + VSILOGI("Call vxTensorScaleNode fail.(resize)"); + } + + return (vsi_nn_kernel_node_t)node; +} /* _setup() */ + +#define REGISTER_RESIZE_OPENVX_KERNEL(KERNEL_NAME) \ + static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num, \ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) \ + { \ + return _setup(graph, inputs, input_num, outputs, output_num, \ + params, kernel); \ + } \ + REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) + +REGISTER_RESIZE_OPENVX_KERNEL( resize_nearest ) +REGISTER_RESIZE_OPENVX_KERNEL( resize_bilinear ) + +#undef REGISTER_RESIZE_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/square_vx.c b/src/tim/vx/internal/src/kernel/vx/square_vx.c index 572737c..5ae1499 100644 --- a/src/tim/vx/internal/src/kernel/vx/square_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/square_vx.c @@ -32,7 +32,6 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "kernel/vsi_nn_kernel_lut.h" static vsi_nn_kernel_node_t _setup ( @@ -46,57 +45,7 @@ static vsi_nn_kernel_node_t _setup ) { vx_node node = NULL; -#ifdef VX_USER_LOOKUP_TABLE_SUPPORT - vx_lut lut1 = NULL; - vx_lut lut2 = NULL; - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_lut_params lut_param; - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) - { - return NULL; - } - - lut_param.act_type = VSI_NN_KERNEL_LUT_SQUARE; - - lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_KERNEL_LUT_MAX_SIZE); - lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_KERNEL_LUT_MAX_SIZE); - if( NULL == lut1 || NULL == lut2 ) - { - VSILOGE("create lut object fail."); - goto final; - } - - status = vsi_nn_kernel_lut(lut1, lut2, &lut_param); - CHECK_STATUS_FAIL_GOTO(status, final); - - node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); - if ( NULL == node ) - { - node = vxActivationLayer( - graph->g, - inputs[0]->t, - VX_NN_ACTIVATION_SQUARE, - 0, - 0, - outputs[0]->t - ); - } - -final: - if (lut1) - { - vxReleaseLUT(&lut1); - lut1 = NULL; - } - if (lut2) - { - vxReleaseLUT(&lut2); - lut2 = NULL; - } - return (vsi_nn_kernel_node_t)node; -#else node = vxActivationLayer( graph->g, inputs[0]->t, @@ -107,7 +56,6 @@ final: ); return (vsi_nn_kernel_node_t)node; -#endif } /* _setup() */ #define REGISTER_SQUARE_OPENVX_KERNEL(KERNEL_NAME) \ diff --git 
a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl new file mode 100644 index 0000000..0372981 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum.cl @@ -0,0 +1,478 @@ +__kernel void cumsum_F32toF32_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int channel, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + float4 sum = (float4)(0); + + if(exclusive && rev) + { + coord_out.z = channel - 1; + write_imagef(output, coord_out, sum); + + for(coord.z = channel - 1; coord.z > 0; coord.z--) + { + float4 data = read_imagef(input, coord); + coord_out.z--; + sum += data; + + write_imagef(output, coord_out, sum); + } + } + else if(exclusive) + { + coord_out.z = 0; + write_imagef(output, coord_out, sum); + for(coord.z = 0; coord.z < channel - 1; coord.z++) + { + float4 data = read_imagef(input, coord); + coord_out.z++; + sum += data; + + write_imagef(output, coord_out, sum); + } + } + else if(rev) + { + for(coord.z = channel - 1; coord.z >= 0; coord.z--) + { + float4 data = read_imagef(input, coord); + sum += data; + + write_imagef(output, coord, sum); + } + } + else + { + for(coord.z = 0; coord.z < channel; coord.z++) + { + float4 data = read_imagef(input, coord); + sum += data; + + write_imagef(output, coord, sum); + } + } +} + +__kernel void cumsum_U8toU8_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int channel, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + uint4 sum = (uint4)(0); + uint4 dst = (uint4)(0); + + float cnt = 0.0f; + + if(exclusive && rev) + { + coord_out.z = channel - 1; + write_imageui(output, coord_out, dst); + for(coord.z = channel - 1; coord.z > 0; coord.z--) + { + uint4 data = read_imageui(input, coord); + coord_out.z--; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord_out, dst); + } + } + else if(exclusive) + { + coord_out.z = 0; + write_imageui(output, coord_out, dst); + for(coord.z = 0; coord.z < channel - 1; coord.z++) + { + uint4 data = read_imageui(input, coord); + coord_out.z++; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord_out, dst); + } + } + else if(rev) + { + for(coord.z = channel - 1; coord.z >= 0; coord.z--) + { + uint4 data = read_imageui(input, coord); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord, dst); + } + } + else + { + for(coord.z = 0; coord.z < channel; coord.z++) + { + uint4 data = read_imageui(input, coord); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, 
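/*
 * Rescaling sketch for the U8 cumsum kernels, assuming the host passes
 * in_out_scale = s_in / s_out, in_out_zp_scale = -zp_in * s_in / s_out and
 * output_zp = zp_out (these relations are not defined in this file, so treat the
 * derivation as an assumption about the launcher):
 *
 *     real sum of n elements = s_in * (sum_q - n * zp_in)
 *     out_q = real / s_out + zp_out
 *           = sum_q * in_out_scale + n * in_out_zp_scale + output_zp
 *
 * which is exactly tmpSum = sum.x * in_out_scale + cnt * in_out_zp_scale + output_zp
 * as computed above, with cnt tracking n.
 */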
coord, dst); + } + } +} + +__kernel void cumsum_F32toF32_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int channel, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + float4 sum = (float4)(0); + + if(exclusive && rev) + { + coord_out.y = height - 1; + write_imagef(output, coord_out, sum); + for(coord.y = height - 1; coord.y > 0; coord.y--) + { + float4 data = read_imagef(input, coord); + coord_out.y--; + sum += data; + + write_imagef(output, coord_out, sum); + } + } + else if(exclusive) + { + coord_out.y = 0; + write_imagef(output, coord_out, sum); + for(coord.y = 0; coord.y < height - 1; coord.y++) + { + float4 data = read_imagef(input, coord); + coord_out.y++; + sum += data; + + write_imagef(output, coord_out, sum); + } + } + else if(rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + float4 data = read_imagef(input, coord); + sum += data; + + write_imagef(output, coord, sum); + } + } + else + { + for(coord.y = 0; coord.y < height; coord.y++) + { + float4 data = read_imagef(input, coord); + sum += data; + + write_imagef(output, coord, sum); + } + } +} + +__kernel void cumsum_U8toU8_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int channel, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + uint4 sum = (uint4)(0); + uint4 dst = (uint4)(0); + + float cnt = 0; + + if(exclusive && rev) + { + coord_out.y = height - 1; + write_imageui(output, coord_out, dst); + + for(coord.y = height - 1; coord.y > 0; coord.y--) + { + uint4 data = read_imageui(input, coord); + cnt += 1.0f; + coord_out.y--; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord_out, dst); + } + } + else if(exclusive) + { + coord_out.y = 0; + write_imageui(output, coord_out, dst); + for(coord.y = 0; coord.y < height - 1; coord.y++) + { + uint4 data = read_imageui(input, coord); + cnt += 1.0f; + coord_out.y++; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord_out, dst); + } + } + else if(rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + uint4 data = read_imageui(input, coord); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord, dst); + } + } + else + { + for(coord.y = 0; coord.y < height; coord.y++) + { + uint4 data = read_imageui(input, coord); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord, dst); + } + } +} + +__kernel void cumsum_F32toF32_axis0( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int channel, + int 
input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + float4 sum = (float4)(0); + + if(exclusive && rev) + { + coord_out.x = width - 1; + write_imagef(output, coord_out, sum); + for(coord.x = width - 1; coord.x > 0; coord.x--) + { + float4 data = read_imagef(input, coord); + coord_out.x--; + sum += data; + + write_imagef(output, coord_out, sum); + } + } + else if(exclusive) + { + coord_out.x = 0; + write_imagef(output, coord_out, sum); + for(coord.x = 0; coord.x < width - 1; coord.x++) + { + float4 data = read_imagef(input, coord); + coord_out.x++; + sum += data; + + write_imagef(output, coord_out, sum); + } + } + else if(rev) + { + for(coord.x = width - 1; coord.x >= 0; coord.x--) + { + float4 data = read_imagef(input, coord); + sum += data; + + write_imagef(output, coord, sum); + } + } + else + { + for(coord.x = 0; coord.x < width; coord.x++) + { + float4 data = read_imagef(input, coord); + sum += data; + + write_imagef(output, coord, sum); + } + } +} + +__kernel void cumsum_U8toU8_axis0( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int channel, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + uint4 sum = (uint4)(0); + uint4 dst = (uint4)(0); + + float cnt = 0; + + if(exclusive && rev) + { + coord_out.x = width - 1; + write_imageui(output, coord_out, dst); + for(coord.x = width - 1; coord.x > 0; coord.x--) + { + uint4 data = read_imageui(input, coord); + coord_out.x--; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord_out, dst); + } + } + else if(exclusive) + { + coord_out.x = 0; + write_imageui(output, coord_out, dst); + for(coord.x = 0; coord.x < width - 1; coord.x++) + { + uint4 data = read_imageui(input, coord); + coord_out.x++; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord_out, dst); + } + } + else if(rev) + { + for(coord.x = width - 1; coord.x >= 0; coord.x--) + { + uint4 data = read_imageui(input, coord); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord, dst); + } + } + else + { + for(coord.x = 0; coord.x < width; coord.x++) + { + uint4 data = read_imageui(input, coord); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord, dst); + } + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl new file mode 100644 index 0000000..caced34 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_2d.cl @@ -0,0 +1,314 @@ + +__kernel void cumsum_F32toF32_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + 
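/*
 * Note on the *_2D cumsum variants: they operate on image2d_t instead of
 * image2d_array_t and pack both positions into one int4 -- coord.xy is the read
 * position and coord.zw the write position -- so the exclusive modes can read
 * element i while writing the shifted slot, e.g.:
 *
 *     float4 data = read_imagef(input, coord.xy);   // element i
 *     write_imagef(output, coord.zw, sum);          // shifted output slot
 */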
int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + float4 sum = (float4)(0); + + if(exclusive && rev) + { + coord.w = height - 1; + write_imagef(output, coord.zw, sum); + for(coord.y = height - 1; coord.y > 0; coord.y--) + { + float4 data = read_imagef(input, coord.xy); + coord.w--; + sum += data; + + write_imagef(output, coord.zw, sum); + } + } + else if(exclusive) + { + write_imagef(output, coord.zw, sum); + for(coord.y = 0; coord.y < height - 1; coord.y++) + { + float4 data = read_imagef(input, coord.xy); + coord.w++; + sum += data; + + write_imagef(output, coord.zw, sum); + } + } + else if(rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + float4 data = read_imagef(input, coord.xy); + sum += data; + + write_imagef(output, coord.xy, sum); + } + } + else + { + for(coord.y = 0; coord.y < height; coord.y++) + { + float4 data = read_imagef(input, coord.xy); + sum += data; + + write_imagef(output, coord.xy, sum); + } + } +} + +__kernel void cumsum_U8toU8_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + uint4 sum = (uint4)(0); + uint4 dst = (uint4)(0); + + float cnt = 0; + + if(exclusive && rev) + { + coord.w = height - 1; + write_imageui(output, coord.zw, sum); + for(coord.y = height - 1; coord.y > 0; coord.y--) + { + uint4 data = read_imageui(input, coord.xy); + cnt += 1.0f; + coord.w--; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.zw, dst); + } + } + else if(exclusive) + { + write_imageui(output, coord.zw, sum); + for(coord.y = 0; coord.y < height - 1; coord.y++) + { + uint4 data = read_imageui(input, coord.xy); + cnt += 1.0f; + coord.w++; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.zw, dst); + } + } + else if(rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + uint4 data = read_imageui(input, coord.xy); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.xy, dst); + } + } + else + { + for(coord.y = 0; coord.y < height; coord.y++) + { + uint4 data = read_imageui(input, coord.xy); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.xy, dst); + } + } +} + +__kernel void cumsum_F32toF32_axis0_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + float4 sum = (float4)(0); + + if(exclusive && rev) + { + coord.x = width - 1; + coord.z = coord.x; + write_imagef(output, 
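/*
 * Worked example of the exclusive/rev flags for a row [a, b, c]:
 *
 *     inclusive, forward : [a, a+b, a+b+c]
 *     exclusive, forward : [0, a,   a+b  ]
 *     inclusive, reverse : [a+b+c, b+c, c]
 *     exclusive, reverse : [b+c,   c,   0]
 *
 * The branch started here implements the last case: write 0 at the far end, then
 * accumulate from the end while writing one slot behind the read cursor.
 */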
coord.zw, sum); + for(; coord.x > 0; coord.x--) + { + float4 data = read_imagef(input, coord.xy); + coord.z--; + sum += data; + + write_imagef(output, coord.zw, sum); + } + } + else if(exclusive) + { + coord.z = 0; + write_imagef(output, coord.zw, sum); + for(coord.x = 0; coord.x < width - 1; coord.x++) + { + float4 data = read_imagef(input, coord.xy); + coord.z++; + sum += data; + + write_imagef(output, coord.zw, sum); + } + } + else if(rev) + { + for(coord.x = width - 1; coord.x >= 0; coord.x--) + { + float4 data = read_imagef(input, coord.xy); + sum += data; + + write_imagef(output, coord.xy, sum); + } + } + else + { + for(coord.x = 0; coord.x < width; coord.x++) + { + float4 data = read_imagef(input, coord.xy); + sum += data; + + write_imagef(output, coord.xy, sum); + } + } +} + +__kernel void cumsum_U8toU8_axis0_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + uint4 sum = (uint4)(0); + uint4 dst = (uint4)(0); + + float cnt = 0.0f; + + if(exclusive && rev) + { + coord.x = width - 1; + coord.z = coord.x; + write_imageui(output, coord.zw, sum); + for(; coord.x > 0; coord.x--) + { + uint4 data = read_imageui(input, coord.xy); + coord.z--; + cnt += 1.0; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.zw, dst); + } + } + else if(exclusive) + { + coord.z = 0; + write_imageui(output, coord.zw, sum); + for(coord.x = 0; coord.x < width - 1; coord.x++) + { + uint4 data = read_imageui(input, coord.xy); + cnt += 1.0f; + coord.z++; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.zw, dst); + } + } + else if(rev) + { + for(coord.x = width - 1; coord.x >= 0; coord.x--) + { + uint4 data = read_imageui(input, coord.xy); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.xy, dst); + } + } + else + { + for(coord.x = 0; coord.x < width; coord.x++) + { + uint4 data = read_imageui(input, coord.xy); + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + write_imageui(output, coord.xy, dst); + } + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl index c991ffc..65be20e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl @@ -136,6 +136,21 @@ float eltwise_unary_celu(float val, float alpha, float rcp_alpha) return val < 0 ? 
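/*
 * The three helpers added below (rcp, sign, softsign) keep the common
 * (val, alpha, rcp_alpha) signature expected by the ELTWISE_UNARY_* macros even
 * though they ignore alpha and rcp_alpha:
 *
 *     rcp(x)      = 1 / x            (x == 0 yields +/-inf per IEEE-754)
 *     sign(x)     = sign(x)          (OpenCL built-in: -1, 0 or +1)
 *     softsign(x) = x / (1 + |x|)
 */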
x : val; } +float eltwise_unary_rcp(float val, float alpha, float rcp_alpha) +{ + return 1.0f / val; +} + +float eltwise_unary_sign(float val, float alpha, float rcp_alpha) +{ + return sign(val); +} + +float eltwise_unary_softsign(float val, float alpha, float rcp_alpha) +{ + return val / (1.0f + fabs(val)); +} + #define ELTWISE_UNARY_F32_2D(func_name) \ __kernel void func_name##_F32toF32_2D \ ( \ @@ -170,6 +185,9 @@ ELTWISE_UNARY_F32_2D(gelu) ELTWISE_UNARY_F32_2D(hard_gelu) ELTWISE_UNARY_F32_2D(selu) ELTWISE_UNARY_F32_2D(celu) +ELTWISE_UNARY_F32_2D(rcp) +ELTWISE_UNARY_F32_2D(sign) +ELTWISE_UNARY_F32_2D(softsign) #define ELTWISE_UNARY_U8_2D(func_name) \ __kernel void func_name##_U8toU8_2D \ @@ -206,6 +224,9 @@ ELTWISE_UNARY_U8_2D(gelu) ELTWISE_UNARY_U8_2D(hard_gelu) ELTWISE_UNARY_U8_2D(selu) ELTWISE_UNARY_U8_2D(celu) +ELTWISE_UNARY_U8_2D(rcp) +ELTWISE_UNARY_U8_2D(sign) +ELTWISE_UNARY_U8_2D(softsign) __kernel void neg_I32toI32_2D ( diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl index 20cc454..5a21ad8 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl @@ -136,6 +136,21 @@ float eltwise_unary_celu(float val, float alpha, float rcp_alpha) return val < 0 ? x : val; } +float eltwise_unary_rcp(float val, float alpha, float rcp_alpha) +{ + return 1.0f / val; +} + +float eltwise_unary_sign(float val, float alpha, float rcp_alpha) +{ + return sign(val); +} + +float eltwise_unary_softsign(float val, float alpha, float rcp_alpha) +{ + return val / (1.0f + fabs(val)); +} + #define ELTWISE_UNARY_F32(func_name) \ __kernel void func_name##_F32toF32 \ ( \ @@ -170,6 +185,9 @@ ELTWISE_UNARY_F32(gelu) ELTWISE_UNARY_F32(hard_gelu) ELTWISE_UNARY_F32(selu) ELTWISE_UNARY_F32(celu) +ELTWISE_UNARY_F32(rcp) +ELTWISE_UNARY_F32(sign) +ELTWISE_UNARY_F32(softsign) #define ELTWISE_UNARY_U8(func_name) \ __kernel void func_name##_U8toU8 \ @@ -206,6 +224,9 @@ ELTWISE_UNARY_U8(gelu) ELTWISE_UNARY_U8(hard_gelu) ELTWISE_UNARY_U8(selu) ELTWISE_UNARY_U8(celu) +ELTWISE_UNARY_U8(rcp) +ELTWISE_UNARY_U8(sign) +ELTWISE_UNARY_U8(softsign) __kernel void neg_I32toI32 ( diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f16.cl b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f16.cl deleted file mode 100644 index f05e01d..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f16.cl +++ /dev/null @@ -1,229 +0,0 @@ -__kernel void instance_norm_meanvari_F16( - __read_only image2d_array_t input, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int width, - int height - ) -{ - int gidx = get_global_id(0); - int gidz = get_global_id(1); - int lidx = get_local_id(0); - - int4 coord = (int4)(gidx, 0, gidz, 0); - float4 data; - float sum = 0, sqr = 0; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - data = read_imagef(input, coord); - coord.y++; - sum += data.x; - sqr += data.x * data.x; - } - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += 
dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 dst = (float4)(0); - dst.x = sum; - write_imagef(output, coord_out.xy, dst); - coord_out.x++; - dst.x = sqr; - write_imagef(output, coord_out.xy, dst); - } -} - -__kernel void instance_norm_meanvari_F16_2D( - __read_only image2d_t input, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int width, - int height - ) -{ - int gidx = get_global_id(0); - int gidz = get_global_id(1); - int lidx = get_local_id(0); - int gidy = gidz * height; - - int2 coord = (int2)(gidx, gidy); - float4 data; - float sum = 0, sqr = 0; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int endH = gidy + height; - if(gidx < width) - { - for(; coord.y < endH;) - { - data = read_imagef(input, coord); - coord.y++; - sum += data.x; - sqr += data.x * data.x; - } - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 dst = (float4)(0); - dst.x = sum; - write_imagef(output, coord_out.xy, dst); - coord_out.x++; - dst.x = sqr; - write_imagef(output, coord_out.xy, dst); - } -} - -__kernel void instance_norm_F16toF16( - __read_only image2d_array_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_array_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num - ) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); - int4 coord_para = (int4)(0, gidz, 0, 0); - - float4 gamma = read_imagef(scale, coord_para.yx); - float4 beta = read_imagef(bias, coord_para.yx); - float4 mean_vari = (float4)(0); - float scale_vari, bias_val; - - for(int i = 0; i < group_num; i++) - { - mean_vari.x += read_imagef(meanVari, coord_para.xy).x; - coord_para.x++; - mean_vari.y += read_imagef(meanVari, coord_para.xy).x; - coord_para.x+=3; - } - mean_vari *= dim_ratio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = gamma.s0 * mean_vari.s1; - bias_val = (beta.s0 - scale_vari * mean_vari.s0); - - float4 data, dst; - for(coord.y = 0; coord.y < height;coord.y++) - { - data = read_imagef(input, coord); - - dst.x = data.x * scale_vari + bias_val; - write_imagef(output, coord, dst); - } -} - -__kernel void instance_norm_F16toF16_2D( - __read_only image2d_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num - ) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int2 coord = (int2)(get_global_id(0), gidy); - int2 coord_para = (int2)(0, gidz); - int endH = gidy + height; - - float4 gamma = read_imagef(scale, coord_para.yx); - float4 beta = read_imagef(bias, coord_para.yx); - float4 mean_vari = (float4)(0); - float 
scale_vari, bias_val; - - for(int i = 0; i < group_num; i++) - { - mean_vari.x += read_imagef(meanVari, coord_para.xy).x; - coord_para.x++; - mean_vari.y += read_imagef(meanVari, coord_para.xy).x; - coord_para.x+=3; - } - mean_vari *= dim_ratio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = gamma.s0 * mean_vari.s1; - bias_val = (beta.s0 - scale_vari * mean_vari.s0); - - float4 data, dst; - for(; coord.y < endH; coord.y++) - { - data = read_imagef(input, coord); - - dst.x = data.x * scale_vari + bias_val; - write_imagef(output, coord, dst); - } -} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f32.cl index 5946570..85d7f98 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f32.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_f32.cl @@ -1,13 +1,10 @@ -__kernel void instance_norm_meanvari_F32( - __read_only image2d_array_t input, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int width, - int height +__kernel void instance_norm_sums_F32( + __read_only image2d_array_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int width, + int height ) { int gidx = get_global_id(0); @@ -27,8 +24,8 @@ __kernel void instance_norm_meanvari_F32( { data = read_imagef(input, coord); coord.y++; - sum += data.x; - sqr += data.x * data.x; + sum = sum + data.x; + sqr = sqr + data.x * data.x; } } lcl_sum[lidx] = sum; @@ -58,16 +55,13 @@ __kernel void instance_norm_meanvari_F32( } } -__kernel void instance_norm_meanvari_F32_2D( - __read_only image2d_t input, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int width, - int height +__kernel void instance_norm_sums_F32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int width, + int height ) { int gidx = get_global_id(0); @@ -89,8 +83,8 @@ __kernel void instance_norm_meanvari_F32_2D( { data = read_imagef(input, coord); coord.y++; - sum += data.x; - sqr += data.x * data.x; + sum = sum + data.x; + sqr = sqr + data.x * data.x; } } lcl_sum[lidx] = sum; @@ -121,23 +115,19 @@ __kernel void instance_norm_meanvari_F32_2D( } __kernel void instance_norm_F32toF32( - __read_only image2d_array_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_array_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int gidz = get_global_id(1); @@ -156,7 +146,7 @@ __kernel void instance_norm_F32toF32( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); @@ -174,23 +164,19 @@ __kernel void instance_norm_F32toF32( } __kernel void instance_norm_F32toF32_2D( - __read_only 
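/*
 * Same signature change as the kernels above: the second-pass instance-norm kernels
 * drop input_zp/input_scale/input_fl and output_fl, keep only output_zp/output_scale,
 * and rename dim_ratio to inv_multiplier; the first-pass kernels are renamed
 * meanvari -> sums because they now emit raw sum(x) and sum(x*x).
 */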
image2d_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int gidz = get_global_id(1); @@ -211,12 +197,12 @@ __kernel void instance_norm_F32toF32_2D( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = gamma.s0 * mean_vari.s1; - bias_val = beta.s0 - scale_vari * mean_vari.s0; + bias_val = (beta.s0 - scale_vari * mean_vari.s0); float4 data, dst; for(; coord.y < endH; coord.y++) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_i32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_i32.cl index 3928749..12b6243 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_i32.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_i32.cl @@ -1,13 +1,10 @@ -__kernel void instance_norm_meanvari_I32( - __read_only image2d_array_t input, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int width, - int height +__kernel void instance_norm_sums_I32( + __read_only image2d_array_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int width, + int height ) { int gidx = get_global_id(0); @@ -16,9 +13,8 @@ __kernel void instance_norm_meanvari_I32( int4 coord = (int4)(gidx, 0, gidz, 0); int4 data; - float sum = 0, sqr = 0; - int tmpSum = 0; - float e2InScale = input_fl * input_fl; + float2 sum_x_x2 = 0; + int2 _sum_x_x2 = 0; __local float lcl_sum[16]; __local float lcl_sqr[16]; @@ -29,13 +25,13 @@ __kernel void instance_norm_meanvari_I32( { data = read_imagei(input, coord); coord.y++; - tmpSum += data.x; - sqr += (data.x * data.x * e2InScale); + _sum_x_x2.x = _sum_x_x2.x + data.x; + _sum_x_x2.y = _sum_x_x2.y + data.x * data.x; } - sum = tmpSum * input_fl; + sum_x_x2 = convert_float2(_sum_x_x2); } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; + lcl_sum[lidx] = sum_x_x2.x; + lcl_sqr[lidx] = sum_x_x2.y; barrier(CLK_LOCAL_MEM_FENCE); int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); @@ -45,7 +41,7 @@ __kernel void instance_norm_meanvari_I32( __local float4* tmp_sum = (__local float4*)lcl_sum; __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; + float sum = 0, sqr = 0; for(int i = 0; i < 4; i++) { sum += dot(tmp_sum[i], one); @@ -61,16 +57,13 @@ __kernel void instance_norm_meanvari_I32( } } -__kernel void instance_norm_meanvari_I32_2D( - __read_only image2d_t input, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int width, - int height +__kernel void instance_norm_sums_I32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int width, + int height ) { int gidx = get_global_id(0); @@ -80,9 +73,8 @@ __kernel void instance_norm_meanvari_I32_2D( int2 coord 
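/*
 * The I32 first pass now accumulates raw integer sums in _sum_x_x2 (sum(x) in .x,
 * sum(x*x) in .y) and converts to float once per work-item, instead of pre-scaling
 * every sample by input_fl. This is safe for instance norm: a constant input scale
 * multiplies both (x - mean) and the standard deviation, so it cancels out of the
 * normalized value. Sketch of the accumulation used below:
 *
 *     _sum_x_x2.x += data.x;            // sum(x)
 *     _sum_x_x2.y += data.x * data.x;   // sum(x^2)
 *     sum_x_x2 = convert_float2(_sum_x_x2);
 */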
= (int2)(gidx, gidy); int4 data; - float sum = 0, sqr = 0; - int tmpSum = 0; - float e2InScale = input_fl * input_fl; + float2 sum_x_x2 = 0; + int2 _sum_x_x2 = 0; __local float lcl_sum[16]; __local float lcl_sqr[16]; @@ -94,13 +86,13 @@ __kernel void instance_norm_meanvari_I32_2D( { data = read_imagei(input, coord); coord.y++; - tmpSum += data.x; - sqr += (data.x * data.x * e2InScale); + _sum_x_x2.x = _sum_x_x2.x + data.x; + _sum_x_x2.y = _sum_x_x2.y + data.x * data.x; } - sum = tmpSum * input_fl; + sum_x_x2 = convert_float2(_sum_x_x2); } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; + lcl_sum[lidx] = sum_x_x2.x; + lcl_sqr[lidx] = sum_x_x2.y; barrier(CLK_LOCAL_MEM_FENCE); int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); @@ -110,7 +102,7 @@ __kernel void instance_norm_meanvari_I32_2D( __local float4* tmp_sum = (__local float4*)lcl_sum; __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; + float sum = 0, sqr = 0; for(int i = 0; i < 4; i++) { sum += dot(tmp_sum[i], one); @@ -127,23 +119,19 @@ __kernel void instance_norm_meanvari_I32_2D( } __kernel void instance_norm_I32toI32( - __read_only image2d_array_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_array_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int gidz = get_global_id(1); @@ -162,13 +150,13 @@ __kernel void instance_norm_I32toI32( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = gamma.s0 * mean_vari.s1; - float alpha = input_fl * output_fl * scale_vari; - bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_fl; + float alpha = output_scale * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; int4 data, dst; for(coord.y = 0; coord.y < height;coord.y++) @@ -183,23 +171,19 @@ __kernel void instance_norm_I32toI32( } __kernel void instance_norm_I32toI32_2D( - __read_only image2d_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int gidz = get_global_id(1); @@ -220,13 +204,13 @@ __kernel void instance_norm_I32toI32_2D( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = 
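/*
 * Folding used by the second pass (the same algebra in every variant). With
 * m = inv_multiplier * sum(x), var = inv_multiplier * sum(x^2) - m*m and
 * r = rsqrt(var + eps):
 *
 *     out_q = ((x - m) * gamma * r + beta) * output_scale + output_zp
 *           = x * alpha + bias_val
 *
 * where alpha    = output_scale * gamma * r
 *       bias_val = (beta - gamma * r * m) * output_scale + output_zp
 *
 * so the per-pixel loop reduces to a single multiply-add.
 */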
gamma.s0 * mean_vari.s1; - float alpha = input_fl * output_fl * scale_vari; - bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_fl; + float alpha = output_scale * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; int4 data, dst; for(; coord.y < endH; coord.y++) @@ -241,23 +225,19 @@ __kernel void instance_norm_I32toI32_2D( } __kernel void instance_norm_I32toF32( - __read_only image2d_array_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_array_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int gidz = get_global_id(1); @@ -276,12 +256,12 @@ __kernel void instance_norm_I32toF32( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = gamma.s0 * mean_vari.s1; - float alpha = input_fl * scale_vari; + float alpha = scale_vari; bias_val = (beta.s0 - scale_vari * mean_vari.s0); int4 data; @@ -296,23 +276,19 @@ __kernel void instance_norm_I32toF32( } __kernel void instance_norm_I32toF32_2D( - __read_only image2d_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int gidz = get_global_id(1); @@ -333,12 +309,12 @@ __kernel void instance_norm_I32toF32_2D( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = gamma.s0 * mean_vari.s1; - float alpha = input_fl * scale_vari; + float alpha = scale_vari; bias_val = beta.s0 - scale_vari * mean_vari.s0; int4 data; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_u8.cl b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_u8.cl index 8b82717..1494685 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_u8.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/instance_normalization_u8.cl @@ -1,13 +1,10 @@ -__kernel void instance_norm_meanvari_U8( - __read_only image2d_array_t input, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int width, - int height +__kernel void instance_norm_sums_U8( + __read_only image2d_array_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int width, + int 
height ) { int gidx = get_global_id(0); @@ -16,9 +13,8 @@ __kernel void instance_norm_meanvari_U8( int4 coord = (int4)(gidx, 0, gidz, 0); uint4 data; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0; - float e2InScale = input_scale * input_scale; + float2 sum_x_x2 = 0; + int2 _sum_x_x2 = 0; __local float lcl_sum[16]; __local float lcl_sqr[16]; @@ -29,14 +25,13 @@ __kernel void instance_norm_meanvari_U8( { data = read_imageui(input, coord); coord.y++; - tmpSum += data.x; - tmpSqr += data.x * data.x; + _sum_x_x2.x = _sum_x_x2.x + data.x; + _sum_x_x2.y = _sum_x_x2.y + data.x * data.x; } - sqr = (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale; - sum = (tmpSum - height * input_zp) * input_scale; + sum_x_x2 = convert_float2(_sum_x_x2); } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; + lcl_sum[lidx] = sum_x_x2.x; + lcl_sqr[lidx] = sum_x_x2.y; barrier(CLK_LOCAL_MEM_FENCE); int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); @@ -46,7 +41,7 @@ __kernel void instance_norm_meanvari_U8( __local float4* tmp_sum = (__local float4*)lcl_sum; __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; + float sum = 0, sqr = 0; for(int i = 0; i < 4; i++) { sum += dot(tmp_sum[i], one); @@ -62,16 +57,13 @@ __kernel void instance_norm_meanvari_U8( } } -__kernel void instance_norm_meanvari_U8_2D( - __read_only image2d_t input, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int width, - int height +__kernel void instance_norm_sums_U8_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float eps, + int rsFlg, + int width, + int height ) { int gidx = get_global_id(0); @@ -81,9 +73,8 @@ __kernel void instance_norm_meanvari_U8_2D( int2 coord = (int2)(gidx, gidy); uint4 data; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0; - float e2InScale = input_scale * input_scale; + float2 sum_x_x2 = 0; + int2 _sum_x_x2 = 0; __local float lcl_sum[16]; __local float lcl_sqr[16]; @@ -95,14 +86,13 @@ __kernel void instance_norm_meanvari_U8_2D( { data = read_imageui(input, coord); coord.y++; - tmpSum += data.x; - tmpSqr += data.x * data.x; + _sum_x_x2.x = _sum_x_x2.x + data.x; + _sum_x_x2.y = _sum_x_x2.y + data.x * data.x; } - sqr = (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale; - sum = (tmpSum - height * input_zp) * input_scale; + sum_x_x2 = convert_float2(_sum_x_x2); } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; + lcl_sum[lidx] = sum_x_x2.x; + lcl_sqr[lidx] = sum_x_x2.y; barrier(CLK_LOCAL_MEM_FENCE); int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); @@ -112,7 +102,7 @@ __kernel void instance_norm_meanvari_U8_2D( __local float4* tmp_sum = (__local float4*)lcl_sum; __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; + float sum = 0, sqr = 0; for(int i = 0; i < 4; i++) { sum += dot(tmp_sum[i], one); @@ -129,23 +119,19 @@ __kernel void instance_norm_meanvari_U8_2D( } __kernel void instance_norm_U8toU8( - __read_only image2d_array_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_array_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t 
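/*
 * The U8 second-pass kernels no longer subtract input_zp per pixel or carry
 * input_scale. A sketch of why raw u8 codes give the same result (an observation,
 * not part of the original sources): with x_real = s * (x_q - zp),
 *
 *     mean_real = s * (mean_q - zp)   and   std_real = s * std_q
 *     (x_real - mean_real) / std_real = (x_q - mean_q) / std_q
 *
 * i.e. the zero point is removed by the mean subtraction and the scale cancels
 * against the standard deviation, so only output_scale/output_zp are needed to map
 * the normalized value into the output tensor's domain.
 */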
output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int gidz = get_global_id(1); @@ -156,7 +142,6 @@ __kernel void instance_norm_U8toU8( float4 beta = read_imagef(bias, coord_para.yx); float4 mean_vari = (float4)(0); float scale_vari, bias_val; - float scale_inOut = input_scale * output_scale; for(int i = 0; i < group_num; i++) { @@ -165,19 +150,18 @@ __kernel void instance_norm_U8toU8( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = gamma.s0 * mean_vari.s1; - float alpha = scale_inOut * scale_vari; + float alpha = output_scale * scale_vari; bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; uint4 data, dst; for(coord.y = 0; coord.y < height;coord.y++) { data = read_imageui(input, coord); - data.x -= input_zp; float4 norm; norm.x = data.x * alpha + bias_val; @@ -187,23 +171,19 @@ __kernel void instance_norm_U8toU8( } __kernel void instance_norm_U8toU8_2D( - __read_only image2d_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int gidz = get_global_id(1); @@ -216,7 +196,6 @@ __kernel void instance_norm_U8toU8_2D( float4 beta = read_imagef(bias, coord_para.yx); float4 mean_vari = (float4)(0); float scale_vari, bias_val; - float scale_inOut = input_scale * output_scale; for(int i = 0; i < group_num; i++) { @@ -225,19 +204,18 @@ __kernel void instance_norm_U8toU8_2D( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = gamma.s0 * mean_vari.s1; - float alpha = scale_inOut * scale_vari; + float alpha = output_scale * scale_vari; bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; uint4 data, dst; for(; coord.y < endH; coord.y++) { data = read_imageui(input, coord); - data.x -= input_zp; float4 norm; norm.x = data.x * alpha + bias_val; @@ -247,23 +225,19 @@ __kernel void instance_norm_U8toU8_2D( } __kernel void instance_norm_U8toF16( - __read_only image2d_array_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_array_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int 
gidz = get_global_id(1); @@ -274,7 +248,6 @@ __kernel void instance_norm_U8toF16( float4 beta = read_imagef(bias, coord_para.yx); float4 mean_vari = (float4)(0); float scale_vari, bias_val; - float scale_inOut = input_scale * output_scale; for(int i = 0; i < group_num; i++) { @@ -283,19 +256,18 @@ __kernel void instance_norm_U8toF16( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = gamma.s0 * mean_vari.s1; - float alpha = scale_inOut * scale_vari; + float alpha = output_scale * scale_vari; bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; uint4 data; for(coord.y = 0; coord.y < height;coord.y++) { data = read_imageui(input, coord); - data.x -= input_zp; float4 norm; norm.x = data.x * alpha + bias_val; @@ -304,23 +276,19 @@ __kernel void instance_norm_U8toF16( } __kernel void instance_norm_U8toF16_2D( - __read_only image2d_t input, - __read_only image2d_t bias, - __read_only image2d_t scale, - __read_only image2d_t meanVari, - __write_only image2d_t output, - float eps, - int rsFlg, - int input_zp, - float input_scale, - float input_fl, - int output_zp, - float output_scale, - float output_fl, - int width, - int height, - float dim_ratio, - int group_num + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int rsFlg, + int output_zp, + float output_scale, + int width, + int height, + float inv_multiplier, + int group_num ) { int gidz = get_global_id(1); @@ -333,7 +301,6 @@ __kernel void instance_norm_U8toF16_2D( float4 beta = read_imagef(bias, coord_para.yx); float4 mean_vari = (float4)(0); float scale_vari, bias_val; - float scale_inOut = input_scale * output_scale; for(int i = 0; i < group_num; i++) { @@ -342,19 +309,18 @@ __kernel void instance_norm_U8toF16_2D( mean_vari.y += read_imagef(meanVari, coord_para.xy).x; coord_para.x+=3; } - mean_vari *= dim_ratio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = gamma.s0 * mean_vari.s1; - float alpha = scale_inOut * scale_vari; + float alpha = output_scale * scale_vari; bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; uint4 data; for(; coord.y < endH; coord.y++) { data = read_imageui(input, coord); - data.x -= input_zp; float4 norm; norm.x = data.x * alpha + bias_val; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/maxpoolwithargmax.cl b/src/tim/vx/internal/src/libnnext/ops/cl/maxpoolwithargmax.cl new file mode 100644 index 0000000..0296e39 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/maxpoolwithargmax.cl @@ -0,0 +1,191 @@ +#define FP32_MIN -3.4e38 +#define I32_MIN -2147483647 + +__kernel void maxpoolwithargmax_F32toF32_I32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + __write_only image2d_array_t argmax, + int ksize_x, int ksize_y, int stride_x, int stride_y, + int pad_left, int pad_top, int width, int height, + float scale, float tail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0); + int4 coord_in = coord_out; + + int hstart = gidy * stride_y - pad_top; + int wstart = gidx * stride_x - pad_left; + int hend = min(hstart + ksize_y, height); + int wend = 
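/*
 * Window and index convention (restating what the code below does): the pool window
 * is [wstart, wend) x [hstart, hend), clipped to the input, and the winning position
 * is flattened into the int32 argmax plane in input-tensor element units:
 *
 *     index = x + y * width + z * width * height
 *
 * while the output plane receives the max value itself.
 */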
min(wstart + ksize_x, width); + int h, w; + int4 index_max = (int4)(0); + float value_max = FP32_MIN; + float4 dst = (float4)(0); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + int2 coord_max = (int2)(wstart, hstart); + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + coord_in.xy = (int2)(w, h); + float4 data = read_imagef(input, coord_in); + + if (data.x > value_max) + { + value_max = data.x; + coord_max = coord_in.xy; + } + } + } + + index_max.x = coord_max.x + coord_max.y * width + get_global_id(2) * width * height; + dst.x = value_max; + write_imagef(output, coord_out, dst); + write_imagei(argmax, coord_out, index_max); +} + +__kernel void maxpoolwithargmax_BF16toBF16_I32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + __write_only image2d_array_t argmax, + int ksize_x, int ksize_y, int stride_x, int stride_y, + int pad_left, int pad_top, int width, int height, + float scale, float tail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0); + int4 coord_in = coord_out; + + int hstart = gidy * stride_y - pad_top; + int wstart = gidx * stride_x - pad_left; + int hend = min(hstart + ksize_y, height); + int wend = min(wstart + ksize_x, width); + int h, w; + int4 index_max = (int4)(0); + float value_max = FP32_MIN; + uint4 dst = (uint4)(0); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + int2 coord_max = (int2)(wstart, hstart); + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + coord_in.xy = (int2)(w, h); + uint4 src = read_imageui(input, coord_in); + src = src << 16; + float4 data; + _viv_asm(COPY, data, src, 16); + + if (data.x > value_max) + { + value_max = data.x; + coord_max = coord_in.xy; + } + } + } + + index_max.x = coord_max.x + coord_max.y * width + get_global_id(2) * width * height; + _viv_asm(COPY, dst, value_max, 4); + dst.x = dst.x >> 16; + write_imageui(output, coord_out, dst); + write_imagei(argmax, coord_out, index_max); +} + +__kernel void maxpoolwithargmax_U32toU32_I32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + __write_only image2d_array_t argmax, + int ksize_x, int ksize_y, int stride_x, int stride_y, + int pad_left, int pad_top, int width, int height, + float scale, float tail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0); + int4 coord_in = coord_out; + + int hstart = gidy * stride_y - pad_top; + int wstart = gidx * stride_x - pad_left; + int hend = min(hstart + ksize_y, height); + int wend = min(wstart + ksize_x, width); + int h, w; + int4 index_max = (int4)(0); + uint value_max = 0; + uint4 dst = (uint4)(0); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + + int2 coord_max = (int2)(wstart, hstart); + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + coord_in.xy = (int2)(w, h); + uint4 data = read_imageui(input, coord_in); + + if (data.x > value_max) + { + value_max = data.x; + coord_max = coord_in.xy; + } + } + } + + index_max.x = coord_max.x + coord_max.y * width + get_global_id(2) * width * height; + dst.x = convert_uint(convert_float(value_max) * scale + tail); + write_imageui(output, coord_out, dst); + write_imagei(argmax, coord_out, index_max); +} + +__kernel void maxpoolwithargmax_I32toI32_I32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + __write_only image2d_array_t argmax, + int ksize_x, int ksize_y, 
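/*
 * For the quantized variants (U32/I32 here, and the _2D copies below) only the max
 * value is requantized on the way out:
 *
 *     dst.x = convert_int(convert_float(value_max) * scale + tail);
 *
 * presumably with scale = s_in / s_out and tail folding the zero-point shift (an
 * assumption about the host-side launcher); the argmax index is written untouched,
 * still in raw element units.
 */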
int stride_x, int stride_y, + int pad_left, int pad_top, int width, int height, + float scale, float tail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0); + int4 coord_in = coord_out; + + int hstart = gidy * stride_y - pad_top; + int wstart = gidx * stride_x - pad_left; + int hend = min(hstart + ksize_y, height); + int wend = min(wstart + ksize_x, width); + int h, w; + int4 index_max = (int4)(0); + int value_max = I32_MIN; + int4 dst = (int4)(0); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + int2 coord_max = (int2)(wstart, hstart); + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + coord_in.xy = (int2)(w, h); + int4 data = read_imagei(input, coord_in); + + if (data.x > value_max) + { + value_max = data.x; + coord_max = coord_in.xy; + } + } + } + + index_max.x = coord_max.x + coord_max.y * width + get_global_id(2) * width * height; + dst.x = convert_int(convert_float(value_max) * scale + tail); + write_imagei(output, coord_out, dst); + write_imagei(argmax, coord_out, index_max); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/maxpoolwithargmax_2d.cl b/src/tim/vx/internal/src/libnnext/ops/cl/maxpoolwithargmax_2d.cl new file mode 100644 index 0000000..33bd14f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/maxpoolwithargmax_2d.cl @@ -0,0 +1,190 @@ +#define FP32_MIN -3.4e38 +#define I32_MIN -2147483647 + +__kernel void maxpoolwithargmax_F32toF32_I32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t argmax, + int ksize_x, int ksize_y, int stride_x, int stride_y, + int pad_left, int pad_top, int width, int height, + float scale, float tail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int2 coord_out = (int2)(gidx, gidy); + int2 coord_in = coord_out; + + int hstart = gidy * stride_y - pad_top; + int wstart = gidx * stride_x - pad_left; + int hend = min(hstart + ksize_y, height); + int wend = min(wstart + ksize_x, width); + int h, w; + int4 index_max = (int4)(0); + float value_max = FP32_MIN; + float4 dst = (float4)(0); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + int2 coord_max = (int2)(wstart, hstart); + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + coord_in.xy = (int2)(w, h); + float4 data = read_imagef(input, coord_in); + + if (data.x > value_max) + { + value_max = data.x; + coord_max = coord_in; + } + } + } + + index_max.x = coord_max.x + coord_max.y * width; + dst.x = value_max; + write_imagef(output, coord_out, dst); + write_imagei(argmax, coord_out, index_max); +} + +__kernel void maxpoolwithargmax_BF16toBF16_I32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t argmax, + int ksize_x, int ksize_y, int stride_x, int stride_y, + int pad_left, int pad_top, int width, int height, + float scale, float tail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int2 coord_out = (int2)(gidx, gidy); + int2 coord_in = coord_out; + + int hstart = gidy * stride_y - pad_top; + int wstart = gidx * stride_x - pad_left; + int hend = min(hstart + ksize_y, height); + int wend = min(wstart + ksize_x, width); + int h, w; + int4 index_max = (int4)(0); + float value_max = FP32_MIN; + uint4 dst = (uint4)(0); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + int2 coord_max = (int2)(wstart, hstart); + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + coord_in.xy = (int2)(w, h); + 
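/*
 * BF16 handling in this loop: bfloat16 is the upper 16 bits of an IEEE-754 float32,
 * so the raw 16-bit codes read below are shifted left by 16 and bit-copied into a
 * float4 (the _viv_asm COPY reinterprets bits, it does not convert), letting the
 * comparison run in float; on output the float bits are copied back and shifted
 * right by 16 to recover the bf16 code:
 *
 *     src = src << 16;  _viv_asm(COPY, data, src, 16);    // bf16 -> f32 bits
 *     _viv_asm(COPY, dst, value_max, 4);  dst.x >>= 16;   // f32 bits -> bf16
 */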
uint4 src = read_imageui(input, coord_in); + src = src << 16; + float4 data; + _viv_asm(COPY, data, src, 16); + + if (data.x > value_max) + { + value_max = data.x; + coord_max = coord_in; + } + } + } + + index_max.x = coord_max.x + coord_max.y * width; + _viv_asm(COPY, dst, value_max, 4); + dst.x = dst.x >> 16; + write_imageui(output, coord_out, dst); + write_imagei(argmax, coord_out, index_max); +} + +__kernel void maxpoolwithargmax_U32toU32_I32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t argmax, + int ksize_x, int ksize_y, int stride_x, int stride_y, + int pad_left, int pad_top, int width, int height, + float scale, float tail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int2 coord_out = (int2)(gidx, gidy); + int2 coord_in = coord_out; + + int hstart = gidy * stride_y - pad_top; + int wstart = gidx * stride_x - pad_left; + int hend = min(hstart + ksize_y, height); + int wend = min(wstart + ksize_x, width); + int h, w; + int4 index_max = (int4)(0); + uint value_max = 0; + uint4 dst = (uint4)(0); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + int2 coord_max = (int2)(wstart, hstart); + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + coord_in.xy = (int2)(w, h); + uint4 data = read_imageui(input, coord_in); + + if (data.x > value_max) + { + value_max = data.x; + coord_max = coord_in; + } + } + } + + index_max.x = coord_max.x + coord_max.y * width; + dst.x = convert_uint(convert_float(value_max) * scale + tail); + write_imageui(output, coord_out, dst); + write_imagei(argmax, coord_out, index_max); +} + +__kernel void maxpoolwithargmax_I32toI32_I32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t argmax, + int ksize_x, int ksize_y, int stride_x, int stride_y, + int pad_left, int pad_top, int width, int height, + float scale, float tail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int2 coord_out = (int2)(gidx, gidy); + int2 coord_in = coord_out; + + int hstart = gidy * stride_y - pad_top; + int wstart = gidx * stride_x - pad_left; + int hend = min(hstart + ksize_y, height); + int wend = min(wstart + ksize_x, width); + int h, w; + int4 index_max = (int4)(0); + int value_max = I32_MIN; + int4 dst = (int4)(0); + + hstart = max(hstart, 0); + wstart = max(wstart, 0); + int2 coord_max = (int2)(wstart, hstart); + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + coord_in.xy = (int2)(w, h); + int4 data = read_imagei(input, coord_in); + + if (data.x > value_max) + { + value_max = data.x; + coord_max = coord_in; + } + } + } + + index_max.x = coord_max.x + coord_max.y * width; + dst.x = convert_int(convert_float(value_max) * scale + tail); + write_imagei(output, coord_out, dst); + write_imagei(argmax, coord_out, index_max); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/mod.cl b/src/tim/vx/internal/src/libnnext/ops/cl/mod.cl new file mode 100644 index 0000000..42649c9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/mod.cl @@ -0,0 +1,306 @@ +__kernel void mod_F32F32toF32 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float4 src0; + float4 src1; + READ_IMAGEF_2DARRAY(src0, input, coord); + 
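+    /* Descriptive note (editorial): in this float variant both operands are read as F32 and
+     * the result is the C-style truncated remainder fmod(src0, src1); the isfmod flag and the
+     * scale/tail parameters are only used by the integer/quantized variants below, which
+     * dequantize first and can also select the floor-based (Python-style) modulo
+     * in0 - in1 * floor(in0 / in1). */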
READ_IMAGEF_2DARRAY(src1, input1, coord); + float4 dst = fmod(src0, src1); + write_imagef(output, coord, dst); +} + +__kernel void mod_F32F32toF32_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + float4 src0 = read_imagef(input, coord); + float4 src1 = read_imagef(input1, coord); + float4 dst = fmod(src0, src1); + write_imagef(output, coord, dst); +} + +__kernel void mod_I32I32toI32 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 src0; + int4 src1; + READ_IMAGEI_2DARRAY(src0, input, coord); + READ_IMAGEI_2DARRAY(src1, input1, coord); + float4 in0 = convert_float4(src0) * input0Scale + input0Tail; + float4 in1 = convert_float4(src1) * input1Scale + input1Tail; + float4 out; + if (isfmod) + { + out = fmod(in0, in1) * outputScale + outputTail; + } + else + { + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail; + } + int4 dst = convert_int4(out); + write_imagei(output, coord, dst); +} + +__kernel void mod_I32I32toI32_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 src0 = read_imagei(input, coord); + int4 src1 = read_imagei(input1, coord); + float4 in0 = convert_float4(src0) * input0Scale + input0Tail; + float4 in1 = convert_float4(src1) * input1Scale + input1Tail; + float4 out; + if (isfmod) + { + out = fmod(in0, in1) * outputScale + outputTail; + } + else + { + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail; + } + int4 dst = convert_int4(out); + write_imagei(output, coord, dst); +} + +__kernel void mod_I32I32toU8 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 src0; + int4 src1; + READ_IMAGEI_2DARRAY(src0, input, coord); + READ_IMAGEI_2DARRAY(src1, input1, coord); + float4 in0 = convert_float4(src0) * input0Scale + input0Tail; + float4 in1 = convert_float4(src1) * input1Scale + input1Tail; + float4 out; + if (isfmod) + { + out = fmod(in0, in1) * outputScale + outputTail; + } + else + { + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail; + } + uint4 dst = convert_uint4(out); + write_imageui(output, coord, dst); +} + +__kernel void mod_I32I32toU8_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 src0 = read_imagei(input, coord); + int4 src1 = read_imagei(input1, coord); + float4 in0 = 
convert_float4(src0) * input0Scale + input0Tail; + float4 in1 = convert_float4(src1) * input1Scale + input1Tail; + float4 out; + if (isfmod) + { + out = fmod(in0, in1) * outputScale + outputTail; + } + else + { + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail; + } + uint4 dst = convert_uint4(out); + write_imageui(output, coord, dst); +} + +__kernel void mod_U8U8toU8 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + uint4 src0, src1; + float4 in0, in1, out; + READ_IMAGEUI_2DARRAY(src0, input, coord); + READ_IMAGEUI_2DARRAY(src1, input1, coord); + in0 = convert_float4(src0) * input0Scale + input0Tail; + in1 = convert_float4(src1) * input1Scale + input1Tail; + if (isfmod) + { + out = fmod(in0, in1) * outputScale + outputTail; + } + else + { + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail; + } + uint4 dst = convert_uint4(out); + write_imageui(output, coord, dst); +} + +__kernel void mod_U8U8toU8_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + uint4 src0 = read_imageui(input, coord); + uint4 src1 = read_imageui(input1, coord); + float4 in0, in1, out; + in0 = convert_float4(src0) * input0Scale + input0Tail; + in1 = convert_float4(src1) * input1Scale + input1Tail; + if (isfmod) + { + out = fmod(in0, in1) * outputScale + outputTail; + } + else + { + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail; + } + uint4 dst = convert_uint4(out); + write_imageui(output, coord, dst); +} + +__kernel void mod_U8I32toU8 + ( + __read_only image2d_array_t input, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + uint4 src0; + int4 src1; + float4 in0, in1, out; + READ_IMAGEUI_2DARRAY(src0, input, coord); + READ_IMAGEI_2DARRAY(src1, input1, coord); + in0 = convert_float4(src0) * input0Scale + input0Tail; + in1 = convert_float4(src1); + if (isfmod) + { + out = fmod(in0, in1) * outputScale + outputTail; + } + else + { + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail; + } + uint4 dst = convert_uint4(out); + write_imageui(output, coord, dst); +} + +__kernel void mod_U8I32toU8_2D + ( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + int isfmod, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + uint4 src0 = read_imageui(input, coord); + int4 src1 = read_imagei(input1, coord); + float4 in0, in1, out; + in0 = convert_float4(src0) * input0Scale + input0Tail; + in1 = convert_float4(src1); + if (isfmod) + { + out = fmod(in0, in1) * outputScale + outputTail; + } + else + { + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail; + } + uint4 dst = convert_uint4(out); + 
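+    /* Descriptive note (editorial): dst holds the requantized remainder. in0 was dequantized
+     * with input0Scale/input0Tail, the I32 divisor src1 was used as-is, and the result was
+     * mapped back to the U8 output range through outputScale/outputTail before the
+     * convert_uint4 above. */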
write_imageui(output, coord, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl b/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl index feef55a..b2d6aae 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl @@ -45,21 +45,25 @@ inline float roi_align_1x1 #define EPS_GRID 0.00001f -__kernel void roi_align_F32toF32 +__kernel void roi_align_F32_F32toF32 ( - __read_only image2d_array_t input, - __read_only image2d_t rois, - __read_only image2d_t n_rois, - __write_only image2d_array_t output, - float spatial_x_scale, - float spatial_y_scale, - float in_width, - float in_height, - float rcp_of_out_width, - float rcp_of_out_height, - float sampling_x_ratio, - float sampling_y_ratio, - int depth + __read_only image2d_array_t input, + __read_only image2d_t rois, + __read_only image2d_t n_rois, + __write_only image2d_array_t output, + float input_scale, + float input_tail, + float output_scale, + float output_zp, + float spatial_x_scale, + float spatial_y_scale, + float in_width, + float in_height, + float rcp_of_out_width, + float rcp_of_out_height, + float sampling_x_ratio, + float sampling_y_ratio, + int depth ) { int px = get_global_id(0); @@ -105,4 +109,126 @@ __kernel void roi_align_F32toF32 write_imagef(output, (int4)(px, py, kz1, 0), interp); } +} + +inline float roi_align_1x1_U8toF32 +( + __read_only image2d_array_t input, + float input_scale, + float input_tail, + float2 region_start, + float2 region_end, + float2 bin_size, + int2 grid_size, + float2 rcp_of_grid_size, + int pz +) +{ + float sum = 0; + + for(int iy = 0; iy < grid_size.y; ++iy) + { + for(int ix = 0; ix < grid_size.x; ++ix) + { + float2 ixy = (float2)(ix + 0.5f, iy + 0.5f); + float2 pos = region_start + ixy * bin_size * rcp_of_grid_size; + + int2 xy_low = convert_int2(pos); + int2 xy_high = xy_low + 1; + + float ly = pos.y - xy_low.y; + float lx = pos.x - xy_low.x; + float hy = 1.0f - ly; + float hx = 1.0f - lx; + + float w1 = hy * hx; + float w2 = hy * lx; + float w3 = ly * hx; + float w4 = ly * lx; + + uint4 data; + data.x = read_imageui(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x; + data.y = read_imageui(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x; + data.z = read_imageui(input, (int4)(xy_low.x, xy_high.y, pz, 0)).x; + data.w = read_imageui(input, (int4)(xy_high.x, xy_high.y, pz, 0)).x; + + float4 value = convert_float4(data) * input_scale + input_tail; + + sum = sum + w1 * value.x + w2 * value.y + w3 * value.z + w4 * value.w; + } + } + + return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y); +} + +__kernel void roi_align_U8_U16toU8 +( + __read_only image2d_array_t input, + __read_only image2d_t rois, + __read_only image2d_t n_rois, + __write_only image2d_array_t output, + float input_scale, + float input_tail, + float output_scale, + float output_zp, + float spatial_x_scale, + float spatial_y_scale, + float in_width, + float in_height, + float rcp_of_out_width, + float rcp_of_out_height, + float sampling_x_ratio, + float sampling_y_ratio, + int depth +) +{ + int px = get_global_id(0); + int py = get_global_id(1); + int pw = get_global_id(2); + + int roi_batch = read_imagei(n_rois, (int2)(pw, 0)).x; + float4 roi_x = convert_float4(read_imageui(rois, (int2)(0, pw))); + float4 roi_y = convert_float4(read_imageui(rois, (int2)(1, pw))); + float4 roi_z = convert_float4(read_imageui(rois, (int2)(2, pw))); + float4 roi_w = convert_float4(read_imageui(rois, (int2)(3, pw))); + float4 roi = (float4)(roi_x.x, roi_y.x, roi_z.x, 
roi_w.x); + + float4 roi_anchor = roi * (float4)(spatial_x_scale, spatial_y_scale, spatial_x_scale, spatial_y_scale); + float2 roi_dims = fmax(roi_anchor.zw - roi_anchor.xy, 1.0f); + + float2 spatial_indx = (float2)(px, py); + float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height); + float2 max_spatial_dims = (float2)(in_width, in_height); + + float2 bin_size = roi_dims * pooled_dims; + float2 region_start = spatial_indx * bin_size + roi_anchor.xy; + float2 region_end = region_start + bin_size; + + float2 roi_bin_grid = (float2)(sampling_x_ratio, sampling_y_ratio); + + roi_bin_grid = roi_bin_grid == 0 ? ceil(bin_size - EPS_GRID) : roi_bin_grid; + + int kz = roi_batch * depth; + float2 rcp_of_grid_size = 1.0f / roi_bin_grid; + int2 grid_size_xy = convert_int2(roi_bin_grid); + float4 interp; + int kz1 = pw * depth; + for (int pz = 0; pz < depth; pz ++, kz ++, kz1 ++) + { + interp.x = roi_align_1x1_U8toF32( input, + input_scale, + input_tail, + region_start, + region_end, + bin_size, + grid_size_xy, + rcp_of_grid_size, + kz); + + uint4 dst; + interp.x = interp.x * output_scale + output_zp; + interp.x = interp.x < 255 ? interp.x : 255; + dst.x = convert_uint_rte(interp.x); + write_imageui(output, (int4)(px, py, kz1, 0), dst.xxxx); + } } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort.cl new file mode 100644 index 0000000..0b8f988 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk_odd_even_sort.cl @@ -0,0 +1,327 @@ +#define LOCAL_SIZE_X (32) +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_F32toF32_I32 + ( + __read_only image2d_t input, + image2d_t input_t, + image2d_t indices_t, + __write_only image2d_t output, + __write_only image2d_t indices, + int width + ) + { + uint lid = get_local_id(0); + uint work_group_size = get_local_size(0); + uint offset = 0; + + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X) + { + float4 data = read_imagef(input, coord.xy); + + write_imagef(input_t, coord.xy, data); + write_imagei(indices_t, coord.xy, coord.xxxx); + } + + __local int sorted[1]; + int width_minus_one = width - 1; + int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X; + num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1); + + int x_start = lid * num_pixels_per_thread; + int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one); + + sorted[0] = 0; + + while (1) + { + if (lid == 0) + { + *sorted = 0; + } + int swapped = 0; + barrier(CLK_GLOBAL_MEM_FENCE); + + // odd-even + coord.x = x_start; + coord.z = x_start + 1; + for (; coord.x < x_end; ) + { + float4 left = read_imagef(input_t, coord.xy); + float4 right = read_imagef(input_t, coord.zy); + + if (left.x < right.x) + { + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); + swapped = 1; + + write_imagef(input_t, coord.xy, right); + write_imagef(input_t, coord.zy, left); + + write_imagei(indices_t, coord.xy, r_index); + write_imagei(indices_t, coord.zy, l_index); + } + + coord.xz = coord.xz + 2; + } + + // even-odd + coord.x = x_start + 1; + coord.z = x_start + 2; + for (; coord.x < x_end; ) + { + float4 left = read_imagef(input_t, coord.xy); + float4 right = read_imagef(input_t, coord.zy); + + if (left.x < right.x) + { + int4 l_index 
= read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); + swapped = 1; + + write_imagef(input_t, coord.xy, right); + write_imagef(input_t, coord.zy, left); + + write_imagei(indices_t, coord.xy, r_index); + write_imagei(indices_t, coord.zy, l_index); + } + + coord.xz = coord.xz + 2; + } + + atomic_add(sorted, swapped); + barrier(CLK_GLOBAL_MEM_FENCE); + + if (*sorted == 0) + break; + barrier(CLK_GLOBAL_MEM_FENCE); + } + + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X) + { + float4 data = read_imagef(input_t, coord.xy); + int4 index = read_imagei(indices_t, coord.xy); + + write_imagef(output, coord.xy, data); + write_imagei(indices, coord.xy, index); + } +} + +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_U32toU32_I32 + ( + __read_only image2d_t input, + image2d_t input_t, + image2d_t indices_t, + __write_only image2d_t output, + __write_only image2d_t indices, + int width + ) + { + uint lid = get_local_id(0); + uint work_group_size = get_local_size(0); + uint offset = 0; + + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X) + { + uint4 data = read_imageui(input, coord.xy); + + write_imageui(input_t, coord.xy, data); + write_imagei(indices_t, coord.xy, coord.xxxx); + } + + __local int sorted[1]; + int width_minus_one = width - 1; + int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X; + num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1); + + int x_start = lid * num_pixels_per_thread; + int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one); + + sorted[0] = 0; + + while (1) + { + if (lid == 0) + { + *sorted = 0; + } + int swapped = 0; + barrier(CLK_GLOBAL_MEM_FENCE); + + // odd-even + coord.x = x_start; + coord.z = x_start + 1; + for (; coord.x < x_end; ) + { + uint4 left = read_imageui(input_t, coord.xy); + uint4 right = read_imageui(input_t, coord.zy); + + if (left.x < right.x) + { + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); + swapped = 1; + + write_imageui(input_t, coord.xy, right); + write_imageui(input_t, coord.zy, left); + + write_imagei(indices_t, coord.xy, r_index); + write_imagei(indices_t, coord.zy, l_index); + } + + coord.xz = coord.xz + 2; + } + + // even-odd + coord.x = x_start + 1; + coord.z = x_start + 2; + for (; coord.x < x_end; ) + { + uint4 left = read_imageui(input_t, coord.xy); + uint4 right = read_imageui(input_t, coord.zy); + + if (left.x < right.x) + { + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); + swapped = 1; + + write_imageui(input_t, coord.xy, right); + write_imageui(input_t, coord.zy, left); + + write_imagei(indices_t, coord.xy, r_index); + write_imagei(indices_t, coord.zy, l_index); + } + + coord.xz = coord.xz + 2; + } + + atomic_add(sorted, swapped); + barrier(CLK_GLOBAL_MEM_FENCE); + + if (*sorted == 0) + break; + barrier(CLK_GLOBAL_MEM_FENCE); + } + + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X) + { + uint4 data = read_imageui(input_t, coord.xy); + int4 index = read_imagei(indices_t, coord.xy); + + write_imageui(output, coord.xy, data); + write_imagei(indices, coord.xy, index); + } +} + +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_I32toI32_I32 + ( + __read_only image2d_t input, + image2d_t input_t, + image2d_t 
indices_t, + __write_only image2d_t output, + __write_only image2d_t indices, + int width + ) + { + uint lid = get_local_id(0); + uint work_group_size = get_local_size(0); + uint offset = 0; + + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X) + { + int4 data = read_imagei(input, coord.xy); + + write_imagei(input_t, coord.xy, data); + write_imagei(indices_t, coord.xy, coord.xxxx); + } + + __local int sorted[1]; + int width_minus_one = width - 1; + int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X; + num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1); + + int x_start = lid * num_pixels_per_thread; + int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one); + + sorted[0] = 0; + + while (1) + { + if (lid == 0) + { + *sorted = 0; + } + int swapped = 0; + barrier(CLK_GLOBAL_MEM_FENCE); + + // odd-even + coord.x = x_start; + coord.z = x_start + 1; + for (; coord.x < x_end; ) + { + int4 left = read_imagei(input_t, coord.xy); + int4 right = read_imagei(input_t, coord.zy); + + if (left.x < right.x) + { + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); + swapped = 1; + + write_imagei(input_t, coord.xy, right); + write_imagei(input_t, coord.zy, left); + + write_imagei(indices_t, coord.xy, r_index); + write_imagei(indices_t, coord.zy, l_index); + } + + coord.xz = coord.xz + 2; + } + + // even-odd + coord.x = x_start + 1; + coord.z = x_start + 2; + for (; coord.x < x_end; ) + { + int4 left = read_imagei(input_t, coord.xy); + int4 right = read_imagei(input_t, coord.zy); + + if (left.x < right.x) + { + int4 l_index = read_imagei(indices_t, coord.xy); + int4 r_index = read_imagei(indices_t, coord.zy); + swapped = 1; + + write_imagei(input_t, coord.xy, right); + write_imagei(input_t, coord.zy, left); + + write_imagei(indices_t, coord.xy, r_index); + write_imagei(indices_t, coord.zy, l_index); + } + + coord.xz = coord.xz + 2; + } + + atomic_add(sorted, swapped); + barrier(CLK_GLOBAL_MEM_FENCE); + + if (*sorted == 0) + break; + barrier(CLK_GLOBAL_MEM_FENCE); + } + + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X) + { + int4 data = read_imagei(input_t, coord.xy); + int4 index = read_imagei(indices_t, coord.xy); + + write_imagei(output, coord.xy, data); + write_imagei(indices, coord.xy, index); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum.vx new file mode 100644 index 0000000..fad3ad2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum.vx @@ -0,0 +1,262 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8; +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4; +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4; +_viv_uniform VXC_512Bits 
uniAccSumHorzI16toI32B_4x4; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int channel; +_viv_uniform int input_zp; +_viv_uniform float in_out_scale; +_viv_uniform float in_out_zp_scale; +_viv_uniform float output_zp; + +__kernel void cumsum_F16toF16_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + for(coord.z = 0; coord.z < channel; coord.z++) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +#define CUMSUM_8BITS_AXIS2(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_##in_name##to##out_name##_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \ + \ + for(coord.z = 0; coord.z < channel; coord.z++) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8);\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CUMSUM_8BITS_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_AXIS2(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_I16toI16_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + int4 sum0 = (int4)(0), sum1 = (int4)(0); + + for(coord.z = 0; coord.z < channel; coord.z++) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(sum0, src, sum0, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel void cumsum_F16toF16_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +#define CUMSUM_8BITS_AXIS1(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_##in_name##to##out_name##_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CUMSUM_8BITS_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_AXIS1(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_I16toI16_axis1( + __read_only image2d_array_t input, + __write_only 
image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + int4 sum0 = (int4)(0), sum1 = (int4)(0); + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel void cumsum_F16toF16_axis0( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + vxc_half8 data, tmpsum, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + for(; coord.x < width; coord.x += 8) + { + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data, src, 16); + + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +#define CUMSUM_QINT_AXIS0(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_##in_name##to##out_name##_axis0( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + src_type src; \ + dst_type dst; \ + vxc_short8 rowSum; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0); \ + short zp = (short)input_zp; \ + \ + for(; coord.x < width; coord.x += 8) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32B_4x4); \ + \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + 
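+        /* Descriptive note (editorial): sum0/sum1 carry the running horizontal prefix sums  \
+           (rowSum already has input_zp subtracted); after rescaling with in_out_scale and    \
+           output_zp the two int4 halves are packed back to 8 lanes and written out below. */ \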
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} + +CUMSUM_QINT_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_QINT_AXIS0(I8, I8, vxc_char16, vxc_char16) +CUMSUM_QINT_AXIS0(I16, I16, vxc_short8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_2d.vx new file mode 100644 index 0000000..c54bb35 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_2d.vx @@ -0,0 +1,204 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8; +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4; +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int input_zp; +_viv_uniform float in_out_scale; +_viv_uniform float in_out_zp_scale; +_viv_uniform float output_zp; + +__kernel void cumsum_F16toF16_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + + for(; coord.y < height; coord.y++) + { + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +#define CUMSUM_8BITS_AXIS1_2D(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_##in_name##to##out_name##_axis1_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0); \ + int4 sum1 = (int4)(0); \ + int4 sum2 = (int4)(0); \ + int4 sum3 = (int4)(0); \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32D_4x4); \ + \ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * 
in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ +} + +CUMSUM_8BITS_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_AXIS1_2D(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_I16toI16_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src, dst; + int4 sum0 = (int4)(0), sum1 = (int4)(0); + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniConvertInt32toUint8_2x8); + + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel void cumsum_F16toF16_axis0_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src, dst; + vxc_half8 data, tmpsum, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + for(; coord.x < width; coord.x += 8) + { + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data, src, 16); + + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniAccSumHorzF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +#define CUMSUM_QINT_AXIS0_2D(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_##in_name##to##out_name##_axis0_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + src_type src; \ + dst_type dst; \ + vxc_short8 rowSum; \ + int4 sum0, sum1; \ + sum0 ^= sum0; \ + sum1 ^= sum1; \ + short zp = (short)input_zp; \ + \ + for(; coord.x < width; coord.x += 
8) \ + { \ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniSubZpI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumHorzI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumHorzI16toI32B_4x4); \ + \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} + +CUMSUM_QINT_AXIS0_2D(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_QINT_AXIS0_2D(I8, I8, vxc_char16, vxc_char16) +CUMSUM_QINT_AXIS0_2D(I16, I16, vxc_short8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_bf16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_bf16.vx new file mode 100644 index 0000000..5d45e73 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_bf16.vx @@ -0,0 +1,188 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int channel; + +__kernel void cumsum_BF16toBF16_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float4 sum0 = (float4)(0), sum1 = (float4)(0); + + for(coord.z = 0; coord.z < channel; coord.z++) + { + float4 data0, data1; + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + + sum0 += data0; + sum1 += data1; + _viv_asm(COPY, dst0, sum0, 16); + _viv_asm(COPY, dst1, sum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel void cumsum_BF16toBF16_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float4 sum0 = (float4)(0), sum1 = (float4)(0); + + for(coord.y = 0; coord.y < height; coord.y++) + { + float4 data0, data1; + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(val0, src, zero, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + sum0 += data0; + sum1 += data1; + _viv_asm(COPY, dst0, sum0, 16); + _viv_asm(COPY, dst1, sum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel void cumsum_BF16toBF16_axis0( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float preSum = 0; + float4 one = (float4)(1.0, 1.0, 1.0, 1.0); + float4 q = (float4)(1.0, 1.0, 1.0, 0); + + for(; coord.x < width; coord.x += 8) + { + float4 data0, data1; + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + + float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one)); + float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one)); + tmpSum1 += tmpSum0.w; + + tmpSum0 += preSum; + tmpSum1 += preSum; + + preSum = tmpSum1.w; + + _viv_asm(COPY, dst0, tmpSum0, 16); + _viv_asm(COPY, dst1, tmpSum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel void cumsum_BF16toBF16_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float4 sum0 = (float4)(0), sum1 = (float4)(0); + + for(; coord.y < height; coord.y++) + { + float4 data0, data1; + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + + sum0 += data0; + sum1 += data1; + + _viv_asm(COPY, dst0, sum0, 16); + _viv_asm(COPY, dst1, sum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtractOddData_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel void cumsum_BF16toBF16_axis0_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float preSum = 0; + float4 one = (float4)(1.0, 1.0, 1.0, 1.0); + float4 q = (float4)(1.0, 1.0, 1.0, 0); + + for(; coord.x < width; 
coord.x += 8) + { + float4 data0, data1; + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + + float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one)); + float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one)); + tmpSum1 += tmpSum0.w; + + tmpSum0 += preSum; + tmpSum1 += preSum; + + preSum = tmpSum1.w; + + _viv_asm(COPY, dst0, tmpSum0, 16); + _viv_asm(COPY, dst1, tmpSum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtractOddData_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_f16_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_f16_u8.vx new file mode 100644 index 0000000..b9f4e17 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_f16_u8.vx @@ -0,0 +1,178 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int channel; + +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; + +#define CUMSUM_F16TOQINT_AXIS2(out_name, src_type, dst_type) \ +__kernel void cumsum_F16to##out_name##_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + for(coord.z = 0; coord.z < channel; coord.z++) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CUMSUM_F16TOQINT_AXIS2(I8, vxc_half8, vxc_char16) +CUMSUM_F16TOQINT_AXIS2(I16, vxc_half8, vxc_short8) +CUMSUM_F16TOQINT_AXIS2(U8, vxc_half8, vxc_uchar16) + + +#define CUMSUM_F16TOQINT_AXIS1(out_name, src_type, dst_type) \ +__kernel void cumsum_F16to##out_name##_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + for(coord.y = 0; coord.y < 
height; coord.y++) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CUMSUM_F16TOQINT_AXIS1(I8, vxc_half8, vxc_char16) +CUMSUM_F16TOQINT_AXIS1(I16, vxc_half8, vxc_short8) +CUMSUM_F16TOQINT_AXIS1(U8, vxc_half8, vxc_uchar16) + +#define CUMSUM_F16TOQINT_AXIS0(out_name, src_type, dst_type) \ +__kernel void cumsum_F16to##out_name##_axis0( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, tmpsum, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + for(; coord.x < width; coord.x += 8) \ + { \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); \ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4); \ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8); \ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CUMSUM_F16TOQINT_AXIS0(I8, vxc_half8, vxc_char16) +CUMSUM_F16TOQINT_AXIS0(I16, vxc_half8, vxc_short8) +CUMSUM_F16TOQINT_AXIS0(U8, vxc_half8, vxc_uchar16) + +#define CUMSUM_F16TOQINT_AXIS1_2D(out_name, src_type, dst_type) \ +__kernel void cumsum_F16to##out_name##_axis1_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + for(; coord.y < height; coord.y++) \ + { \ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CUMSUM_F16TOQINT_AXIS1_2D(I8, vxc_half8, vxc_char16) +CUMSUM_F16TOQINT_AXIS1_2D(I16, vxc_half8, vxc_short8) +CUMSUM_F16TOQINT_AXIS1_2D(U8, vxc_half8, vxc_uchar16) + +#define CUMSUM_F16TOQINT_AXIS0_2D(out_name, src_type, dst_type) \ +__kernel void cumsum_F16to##out_name##_axis0_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ 
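+    /* Descriptive note (editorial): F16 -> quantized horizontal cumsum. The running F16     \
+       prefix sum in 'sum' is requantized on every step through multAndoutZP0 (the packed    \
+       multiplier and output zero point declared above) via uniU8MulAndPostShift_0_Lo_2x8. */ \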
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, tmpsum, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + for(; coord.x < width; coord.x += 8) \ + { \ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16A_4x4); \ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16B_4x4); \ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16C_2x8); \ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniAccSumHorzF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CUMSUM_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16) +CUMSUM_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8) +CUMSUM_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx index 696807f..8b7a639 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx @@ -50,6 +50,22 @@ float4 eltwise_unary_celu(float4 val) return val < 0 ? x : val; } +float4 eltwise_unary_rcp(float4 val) +{ + return 1.0f / val; +} + +float4 eltwise_unary_sign(float4 val) +{ + return sign(val); +} + +float4 eltwise_unary_softsign(float4 val) +{ + float4 _rcp = 1.0f / (1.0f + fabs(val)); + return val * _rcp; +} + _viv_uniform float inputScale; _viv_uniform float inputTail; _viv_uniform float outputScale; @@ -94,83 +110,6 @@ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4; _viv_asm(COPY, dst, dst2, 16); \ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } -//EXP -ELTSISE_UNARY_2D(exp, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(exp, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(exp, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(exp, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(exp, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(exp, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(exp, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(exp, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(exp, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(exp, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//SIN -ELTSISE_UNARY_2D(sin, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(sin, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(sin, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(sin, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(sin, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(sin, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, 
vxc_short8) -ELTSISE_UNARY_2D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//COS -ELTSISE_UNARY_2D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//LOG -ELTSISE_UNARY_2D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(log, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(log, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(log, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(log, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(log, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(log, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(log, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(log, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//SELU -ELTSISE_UNARY_2D(selu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(selu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(selu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(selu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(selu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(selu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(selu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(selu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(selu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(selu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//NEG -ELTSISE_UNARY_2D(neg, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(neg, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(neg, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(neg, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(neg, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(neg, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(neg, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) 
-ELTSISE_UNARY_2D(neg, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(neg, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(neg, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//CELU -ELTSISE_UNARY_2D(celu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(celu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(celu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(celu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(celu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_2D(celu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(celu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_2D(celu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_2D(celu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_2D(celu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; @@ -205,17 +144,36 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8; VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } +#define ADD_ELTSISE_UNARY_2D(func_name) \ +ELTSISE_UNARY_2D(func_name, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) \ +ELTSISE_UNARY_2D(func_name, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) \ +ELTSISE_UNARY_2D(func_name, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) \ +ELTSISE_UNARY_2D(func_name, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) \ +ELTSISE_UNARY_2D(func_name, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) \ +ELTSISE_UNARY_2D(func_name, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) \ +ELTSISE_UNARY_2D(func_name, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) \ +ELTSISE_UNARY_2D(func_name, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) \ +ELTSISE_UNARY_2D(func_name, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) \ +ELTSISE_UNARY_2D(func_name, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) \ +ELTSISE_UNARY_BF16_2D(func_name) + //EXP -ELTSISE_UNARY_BF16_2D(exp) +ADD_ELTSISE_UNARY_2D(exp) //SIN -ELTSISE_UNARY_BF16_2D(sin) +ADD_ELTSISE_UNARY_2D(sin) //COS -ELTSISE_UNARY_BF16_2D(cos) +ADD_ELTSISE_UNARY_2D(cos) //LOG -ELTSISE_UNARY_BF16_2D(log) +ADD_ELTSISE_UNARY_2D(log) //SELU -ELTSISE_UNARY_BF16_2D(selu) +ADD_ELTSISE_UNARY_2D(selu) //NEG -ELTSISE_UNARY_BF16_2D(neg) +ADD_ELTSISE_UNARY_2D(neg) //CELU -ELTSISE_UNARY_BF16_2D(celu) +ADD_ELTSISE_UNARY_2D(celu) +//RCP +ADD_ELTSISE_UNARY_2D(rcp) +//SIGN +ADD_ELTSISE_UNARY_2D(sign) +//SOFTSIGN +ADD_ELTSISE_UNARY_2D(softsign) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx index d150e2a..df52777 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx @@ -50,6 +50,22 @@ float4 eltwise_unary_celu(float4 val) return val < 0 ? 
x : val; } +float4 eltwise_unary_rcp(float4 val) +{ + return 1.0f / val; +} + +float4 eltwise_unary_sign(float4 val) +{ + return sign(val); +} + +float4 eltwise_unary_softsign(float4 val) +{ + float4 _rcp = 1.0f / (1.0f + fabs(val)); + return val * _rcp; +} + _viv_uniform float inputScale; _viv_uniform float inputTail; _viv_uniform float outputScale; @@ -94,83 +110,6 @@ __kernel void func_name##_##src_type_name##to##dst_type_name( \ _viv_asm(COPY, dst, dst2, 16); \ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } -//EXP -ELTSISE_UNARY_3D(exp, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(exp, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(exp, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(exp, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(exp, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(exp, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(exp, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(exp, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(exp, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(exp, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//SIN -ELTSISE_UNARY_3D(sin, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(sin, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(sin, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(sin, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(sin, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(sin, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//COS -ELTSISE_UNARY_3D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//LOG -ELTSISE_UNARY_3D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(log, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(log, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(log, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, 
vxc_char8) -ELTSISE_UNARY_3D(log, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(log, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(log, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(log, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(log, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//SELU -ELTSISE_UNARY_3D(selu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(selu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(selu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(selu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(selu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(selu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(selu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(selu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(selu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(selu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//NEG -ELTSISE_UNARY_3D(neg, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(neg, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(neg, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(neg, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(neg, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(neg, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(neg, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(neg, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(neg, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(neg, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) -//CELU -ELTSISE_UNARY_3D(celu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(celu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(celu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(celu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(celu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) -ELTSISE_UNARY_3D(celu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(celu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) -ELTSISE_UNARY_3D(celu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) -ELTSISE_UNARY_3D(celu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) -ELTSISE_UNARY_3D(celu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; @@ -204,17 +143,36 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8; VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ } +#define ADD_ELTSISE_UNARY_3D(func_name) \ +ELTSISE_UNARY_3D(func_name, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) \ 
+ELTSISE_UNARY_3D(func_name, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) \ +ELTSISE_UNARY_3D(func_name, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) \ +ELTSISE_UNARY_3D(func_name, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) \ +ELTSISE_UNARY_3D(func_name, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) \ +ELTSISE_UNARY_3D(func_name, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) \ +ELTSISE_UNARY_3D(func_name, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) \ +ELTSISE_UNARY_3D(func_name, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) \ +ELTSISE_UNARY_3D(func_name, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) \ +ELTSISE_UNARY_3D(func_name, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) \ +ELTSISE_UNARY_BF16(func_name) + //EXP -ELTSISE_UNARY_BF16(exp) +ADD_ELTSISE_UNARY_3D(exp) //SIN -ELTSISE_UNARY_BF16(sin) +ADD_ELTSISE_UNARY_3D(sin) //COS -ELTSISE_UNARY_BF16(cos) +ADD_ELTSISE_UNARY_3D(cos) //LOG -ELTSISE_UNARY_BF16(log) +ADD_ELTSISE_UNARY_3D(log) //SELU -ELTSISE_UNARY_BF16(selu) +ADD_ELTSISE_UNARY_3D(selu) //NEG -ELTSISE_UNARY_BF16(neg) +ADD_ELTSISE_UNARY_3D(neg) //CELU -ELTSISE_UNARY_BF16(selu) +ADD_ELTSISE_UNARY_3D(celu) +//RCP +ADD_ELTSISE_UNARY_3D(rcp) +//SIGN +ADD_ELTSISE_UNARY_3D(sign) +//SOFTSIGN +ADD_ELTSISE_UNARY_3D(softsign) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_0.vx new file mode 100644 index 0000000..c1b970d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_0.vx @@ -0,0 +1,370 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +_viv_uniform float input_scale; +_viv_uniform float input_scale2; +_viv_uniform float input_zp; +_viv_uniform float sum_x_tail; +_viv_uniform float sum_x2_tail0; +_viv_uniform float sum_x2_tail1; + +_viv_uniform VXC_512Bits uniSumX_16x1; +_viv_uniform VXC_512Bits uniSumX2_16x1; +_viv_uniform VXC_512Bits uniResetFp32_4x4; +_viv_uniform int group_stride; + +#define GROUP_NORM_SUMS_8BITS_IMPL(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float eps, int is2D) \ +{ \ + int gidx = get_global_id(0) << 4; \ + int lidx = get_local_id(0); \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(gidx, 0, gidz, 0); \ + src_type src0; \ + float2 sums_f32 = 0; \ + int2 sums = 0, sum_x_x2; \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + if (gidx < width) \ + { \ + for(coord.y = 0; coord.y < height;) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + sums = sums + sum_x_x2; \ + } \ + sums_f32 = convert_float2(sums); \ + sums_f32.y = sums_f32.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sums_f32.x; \ + sums_f32.x = sums_f32.x * input_scale + 
sum_x_tail; \ + } \ + lcl_sum[lidx] = sums_f32.x; \ + lcl_sqr[lidx] = sums_f32.y; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + float sum_x = 0,sum_x2 = 0; \ + for(int i = 0; i < 4; i++) \ + { \ + sum_x += dot(tmp_sum[i], one); \ + sum_x2 += dot(tmp_sqr[i], one); \ + } \ + float4 data = (float4)(sum_x, sum_x2, 0, 0); \ + write_imagef(output, coord_out, data); \ + } \ +} +GROUP_NORM_SUMS_8BITS_IMPL(U8, vxc_uchar16) +GROUP_NORM_SUMS_8BITS_IMPL(I8, vxc_char16) + +#define GROUP_NORM_SUMS_8BITS_IMPL_2D(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name##_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float eps, int is2D) \ +{ \ + int gidx = get_global_id(0) << 4; \ + int lidx = get_local_id(0); \ + \ + int2 coord = (int2)(gidx, get_global_id(1)); \ + src_type src0; \ + float2 sums = 0; \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + if(gidx < width) \ + { \ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP16x1(sums, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sums, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + sums.y = sums.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sums.x; \ + sums.x = sums.x * input_scale + sum_x_tail; \ + } \ + lcl_sum[lidx] = sums.x; \ + lcl_sqr[lidx] = sums.y; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + float sum_x = 0,sum_x2 = 0; \ + for(int i = 0; i < 4; i++) \ + { \ + sum_x += dot(tmp_sum[i], one); \ + sum_x2 += dot(tmp_sqr[i], one); \ + } \ + float4 data = (float4)(sum_x, sum_x2, 0, 0); \ + write_imagef(output, coord_out, data); \ + } \ +} +GROUP_NORM_SUMS_8BITS_IMPL_2D(U8, vxc_uchar16) +GROUP_NORM_SUMS_8BITS_IMPL_2D(I8, vxc_char16) + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_means( + image2d_t input, image2d_t output, float eps, float group_ratio) +{ + int gidx = get_global_id(0); + int lidx = get_local_id(0); + + int2 coord = (int2)(gidx, get_global_id(1)); + vxc_uchar16 src0 = 1; + float2 sum_sqr = (float2)(0); + float4 mean_vari; + VXC_DP4x4(mean_vari, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniResetFp32_4x4); + + __local float2 lcl_data[16]; + __local float2 lcl_sum[4]; + + for(; coord.x < group_stride; coord.x += 64) + { + mean_vari += read_imagef(input, coord); + } + lcl_data[lidx] = mean_vari.xy; + barrier(CLK_LOCAL_MEM_FENCE); + if(lidx < 4) + { + float2 tmpSum = (float2)(0); + for(int i = lidx; i < 16; i+=4) + { + tmpSum += lcl_data[i]; + } + lcl_sum[lidx] = tmpSum; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lidx == 0) + { + for(int i = 0; i < 4; i++) + { + sum_sqr += lcl_sum[i]; + } + mean_vari.xy = sum_sqr * group_ratio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + coord.x = 0; + write_imagef(output, coord, mean_vari); + } +} + +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform 
VXC_512Bits uniDataToFP32_2_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4; +#define CONVERT_INPUT_TO_F32() \ +VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ +VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ +VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ +VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); + +#define GROUP_NORM_8BITS_IMPL(name, src_type, dst_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_array_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + src_type src0; \ + dst_type dst; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f; \ + \ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + bias_f = read_imagef(bias, coord_para.xy); \ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + float alpha = input_scale * output_scale * scale_vari; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + \ + CONVERT_INPUT_TO_F32() \ + norm = tmpData0 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData1 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + norm = tmpData2 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData3 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_8BITS_IMPL(U8_F16toU8, vxc_uchar16, vxc_uchar16) +GROUP_NORM_8BITS_IMPL(I8_F16toI8, vxc_char16, vxc_char16) + +#define GROUP_NORM_8BITS_IMPL_2D(name, src_type, dst_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_array_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidz = get_global_id(1); \ + int2 coord = (int2)(get_global_id(0), gidz); \ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \ + src_type src0; \ + dst_type dst; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f; \ + \ + float4 mean_vari 
= read_imagef(meanVari, coord_para.zy); \ + VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + bias_f = read_imagef(bias, coord_para.xy); \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + float alpha = input_scale * output_scale * scale_vari; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + \ + CONVERT_INPUT_TO_F32() \ + norm = tmpData0 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData1 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + norm = tmpData2 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData3 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_8BITS_IMPL_2D(U8_F16toU8, vxc_uchar16, vxc_uchar16) +GROUP_NORM_8BITS_IMPL_2D(I8_F16toI8, vxc_char16, vxc_char16) + +#define GROUP_NORM_8BITS_F32_IMPL(name, src_type, dst_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + src_type src0; \ + dst_type dst; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f; \ + \ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + scale_f = read_imagef(scale, coord_para.xy); \ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + float alpha = input_scale * output_scale * scale_vari; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + \ + CONVERT_INPUT_TO_F32() \ + norm = tmpData0 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData1 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + norm = tmpData2 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData3 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_8BITS_F32_IMPL(U8_F32toU8, vxc_uchar16, vxc_uchar16) 
+GROUP_NORM_8BITS_F32_IMPL(I8_F32toI8, vxc_char16, vxc_char16) + +#define GROUP_NORM_8BITS_F32_IMPL_2D(name, src_type, dst_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidz = get_global_id(1); \ + int2 coord = (int2)(get_global_id(0), gidz); \ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \ + src_type src0; \ + dst_type dst; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f; \ + \ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + scale_f = read_imagef(scale, coord_para.xy); \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + float alpha = input_scale * output_scale * scale_vari; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + \ + CONVERT_INPUT_TO_F32() \ + norm = tmpData0 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData1 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + norm = tmpData2 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData3 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_8BITS_F32_IMPL_2D(U8_F32toU8, vxc_uchar16, vxc_uchar16) +GROUP_NORM_8BITS_F32_IMPL_2D(I8_F32toI8, vxc_char16, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx new file mode 100644 index 0000000..120e37e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx @@ -0,0 +1,233 @@ +#include "cl_viv_vx_ext.h" + + +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4; +_viv_uniform float input_scale; +_viv_uniform float input_zp; + +#define GROUP_NORM_8BITSTOF16_IMPL(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_array_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + src_type src0; \ + vxc_short8 src1, outval; \ + vxc_half8 scale_h, dst; \ + float scale_vari, bias_val; \ + vxc_float4 bias_f, scale_f; \ + \ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + VXC_ReadImage(src1, scale, 
coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + bias_f = read_imagef(bias, coord_para.xy); \ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + half4 tmpVal0, tmpVal1; \ + float alpha = scale_vari * input_scale; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \ + bias_val = bias_val - input_zp * alpha; \ + \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + norm = alpha * tmpData2 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData3 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_8BITSTOF16_IMPL(U8_F16toF16, vxc_uchar16) +GROUP_NORM_8BITSTOF16_IMPL(I8_F16toF16, vxc_char16) + + +#define GROUP_NORM_8BITSTOF16_IMPL_2D(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_array_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidz = get_global_id(1); \ + int2 coord = (int2)(get_global_id(0), gidz); \ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \ + src_type src0; \ + vxc_short8 src1, outval; \ + vxc_half8 scale_h, dst; \ + float scale_vari, bias_val; \ + vxc_float4 bias_f, scale_f; \ + \ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + bias_f = read_imagef(bias, coord_para.xy); \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + half4 tmpVal0, tmpVal1; \ + float alpha = scale_vari * input_scale; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \ + bias_val = bias_val - input_zp * alpha; \ + \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1,
src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + norm = alpha * tmpData2 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData3 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_8BITSTOF16_IMPL_2D(U8_F16toF16, vxc_uchar16) +GROUP_NORM_8BITSTOF16_IMPL_2D(I8_F16toF16, vxc_char16) + +#define GROUP_NORM_8TOF16_F32_IMPL(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + src_type src0; \ + vxc_short8 outval; \ + vxc_half8 dst; \ + float scale_vari, bias_val; \ + vxc_float4 bias_f, scale_f; \ + \ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + scale_f = read_imagef(scale, coord_para.xy); \ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + half4 tmpVal0, tmpVal1; \ + float alpha = scale_vari * input_scale; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \ + bias_val = bias_val - input_zp * alpha; \ + \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + norm = alpha * tmpData2 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData3 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, 
outval, dst, 16); \ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_8TOF16_F32_IMPL(U8_F32toF16, vxc_uchar16) +GROUP_NORM_8TOF16_F32_IMPL(I8_F32toF16, vxc_char16) + +#define GROUP_NORM_8TOF16_F32_IMPL_2D(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidz = get_global_id(1); \ + int2 coord = (int2)(get_global_id(0), gidz); \ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \ + src_type src0; \ + vxc_short8 outval; \ + vxc_half8 dst; \ + float scale_vari, bias_val; \ + vxc_float4 bias_f, scale_f; \ + \ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + scale_f = read_imagef(scale, coord_para.xy); \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + half4 tmpVal0, tmpVal1; \ + float alpha = scale_vari * input_scale; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \ + bias_val = bias_val - input_zp * alpha; \ + \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + norm = alpha * tmpData2 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData3 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_8TOF16_F32_IMPL_2D(U8_F32toF16, vxc_uchar16) +GROUP_NORM_8TOF16_F32_IMPL_2D(I8_F32toF16, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx new file mode 100644 index 0000000..b62b67f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_2.vx @@ -0,0 +1,347 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniSum_X_X2_8x2; +_viv_uniform float input_scale; +_viv_uniform float input_scale2; +_viv_uniform float input_zp; +_viv_uniform float sum_x_tail; +_viv_uniform float sum_x2_tail0; +_viv_uniform float sum_x2_tail1; + +_viv_uniform float output_scale; 
+_viv_uniform float output_zp; + +#define GROUP_NORM_SUMS_16BITS_IMPL(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float eps, int is2D) \ +{ \ + int gidx = get_global_id(0) << 3; \ + int lidx = get_local_id(0); \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(gidx, 0, gidz, 0); \ + vxc_short8 src0; \ + src_type in_h; \ + float4 sumsqr; \ + float4 tmpSumSqr = (float4)(0); \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + \ + if(gidx < width) \ + { \ + for(coord.y = 0; coord.y < height;) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \ + tmpSumSqr += sumsqr; \ + } \ + tmpSumSqr.y = tmpSumSqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * tmpSumSqr.x; \ + tmpSumSqr.x = tmpSumSqr.x * input_scale + sum_x_tail; \ + } \ + \ + lcl_sum[lidx] = tmpSumSqr.x; \ + lcl_sqr[lidx] = tmpSumSqr.y; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + \ + float sum = 0; \ + float sqr = 0; \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + \ + float4 data = (float4)(sum, sqr, 0, 0); \ + write_imagef(output, coord_out, data); \ + } \ +} +GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_half8) +GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8) + +#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name##_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float eps, int is2D) \ +{ \ + int gidx = get_global_id(0) << 3; \ + int lidx = get_local_id(0); \ + \ + int2 coord = (int2)(gidx, get_global_id(1)); \ + vxc_short8 src0; \ + src_type in_h; \ + float4 sumsqr = (float4)(0); \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + \ + if(gidx < width) \ + { \ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \ + sumsqr.y = sumsqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sumsqr.x; \ + sumsqr.x = sumsqr.x * input_scale + sum_x_tail; \ + } \ + \ + lcl_sum[lidx] = sumsqr.x; \ + lcl_sqr[lidx] = sumsqr.y; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + \ + float sum = 0; \ + float sqr = 0; \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + \ + float4 data = (float4)(sum, sqr, 0, 0); \ + write_imagef(output, coord_out, data); \ + } \ +} +GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8) 
+GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8) + +#define GROUP_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_array_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + vxc_short8 src0; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + src_type in_h; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f; \ + \ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + bias_f = read_imagef(bias, coord_para.xy); \ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + float4 tmpData0, tmpData1; \ + copy_type outval; \ + conv_type tmpVal0, tmpVal1; \ + float alpha = input_scale * output_scale * scale_vari; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + dst_type dst; \ + \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + \ + float4 norm; \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV_RTE, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV_RTE, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_16BITS_IMPL(F16_F16toF16, vxc_half8, vxc_half8, vxc_short8, half4) +GROUP_NORM_16BITS_IMPL(F16_F16toI16, vxc_half8, vxc_short8, vxc_short8, int4) +GROUP_NORM_16BITS_IMPL(F16_F16toI8, vxc_half8, vxc_char8, vxc_char8, int4) +GROUP_NORM_16BITS_IMPL(F16_F16toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +GROUP_NORM_16BITS_IMPL(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, int4) +GROUP_NORM_16BITS_IMPL(I16_F16toF16, vxc_short8, vxc_half8, vxc_short8, half4) + +#define GROUP_NORM_16BITS_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_array_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidz = get_global_id(1); \ + int2 coord = (int2)(get_global_id(0), gidz); \ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \ + vxc_short8 src0; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + src_type in_h; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f; \ + \ + float4 mean_vari = read_imagef(meanVari, 
coord_para.zy); \ + VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + bias_f = read_imagef(bias, coord_para.xy); \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + float4 tmpData0, tmpData1; \ + copy_type outval; \ + conv_type tmpVal0, tmpVal1; \ + float alpha = output_scale * scale_vari; \ + bias_val = input_scale * (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + dst_type dst; \ + \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + float4 norm; \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_16BITS_IMPL_2D(F16_F16toF16, vxc_half8, vxc_half8, vxc_short8, half4) +GROUP_NORM_16BITS_IMPL_2D(F16_F16toI16, vxc_half8, vxc_short8, vxc_short8, int4) +GROUP_NORM_16BITS_IMPL_2D(F16_F16toI8, vxc_half8, vxc_char8, vxc_char8, int4) +GROUP_NORM_16BITS_IMPL_2D(F16_F16toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +GROUP_NORM_16BITS_IMPL_2D(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, int4) +GROUP_NORM_16BITS_IMPL_2D(I16_F16toF16, vxc_short8, vxc_half8, vxc_short8, half4) + +#define GROUP_NORM_16BITS_F32_IMPL(name, src_type, dst_type, copy_type, conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \ + vxc_short8 src0; \ + src_type in_h; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f; \ + \ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + scale_f = read_imagef(scale, coord_para.xy); \ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + float4 tmpData0, tmpData1; \ + copy_type outval; \ + conv_type tmpVal0, tmpVal1; \ + float alpha = input_scale * output_scale * scale_vari; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + dst_type dst; \ + \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + \ + float4 norm; \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + 
norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV_RTE, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_16BITS_F32_IMPL(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4) +GROUP_NORM_16BITS_F32_IMPL(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4) +GROUP_NORM_16BITS_F32_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4) +GROUP_NORM_16BITS_F32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +GROUP_NORM_16BITS_F32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4) +GROUP_NORM_16BITS_F32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4) + +#define GROUP_NORM_16BITS_F32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int is2D, float rSpaceOrg, int pStride) \ +{ \ + int gidz = get_global_id(1); \ + int2 coord = (int2)(get_global_id(0), gidz); \ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \ + vxc_short8 src0; \ + src_type in_h; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f; \ + \ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + scale_f = read_imagef(scale, coord_para.xy); \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + float4 tmpData0, tmpData1; \ + copy_type outval; \ + conv_type tmpVal0, tmpVal1; \ + float alpha = input_scale * output_scale * scale_vari; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + dst_type dst; \ + \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + float4 norm; \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4) +GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4) +GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4) +GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4) +GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16.vx deleted file mode 100644 index 161383d..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16.vx +++ /dev/null @@ -1,306 +0,0 @@ 
-#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; -_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4; - -_viv_uniform float outputScale; -_viv_uniform int output_ZP; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_F16( - image2d_array_t input, image2d_array_t output, float eps, int is2D) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, 0); - vxc_short8 src0; - vxc_half8 in_h; - vxc_float4 sumsqr; - vxc_float4 tmpSumSqr = (vxc_float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - _viv_asm(COPY, in_h, src0, 16); - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - tmpSumSqr += sumsqr; - } - } - - lcl_sum[lidx] = tmpSumSqr.x; - lcl_sqr[lidx] = tmpSumSqr.y; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - float sum = 0; - float sqr = 0; - for(int i = 0; i < 4; i++) - { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_F16_2D( - image2d_array_t input, image2d_array_t output, float eps, int is2D) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - - int2 coord = (int2)(gidx, get_global_id(1)); - vxc_short8 src0; - vxc_half8 in_h; - vxc_float4 sumsqr = (vxc_float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - if(gidx < width) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, in_h, src0, 16); - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - } - - lcl_sum[lidx] = sumsqr.x; - lcl_sqr[lidx] = sumsqr.y; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - float sum = 0; - float sqr = 0; - for(int i = 0; i < 4; i++) - { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toF16( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, - float eps, int is2D, float rSpaceOrg, int pStride) -{ - 
int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - - vxc_float4 norm; - norm = scale_vari * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_vari * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toF16_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, - float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - vxc_float4 norm; - norm = scale_vari * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_vari * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toU8( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, - float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_uchar16 outval; - vxc_int4 tmpVal0, tmpVal1; - float alpha = outputScale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = alpha * tmpData1 + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toU8_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, - float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_uchar16 outval; - vxc_int4 tmpVal0, tmpVal1; - float alpha = outputScale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - vxc_float4 norm; - norm = alpha * 
tmpData0 + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = alpha * tmpData1 + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16_scale.vx deleted file mode 100644 index cb00ac9..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16_scale.vx +++ /dev/null @@ -1,174 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4; - -_viv_uniform float outputScale; -_viv_uniform int output_ZP; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toF16( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, - float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_short8 src0; - vxc_half8 in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - - vxc_float4 norm; - norm = scale_vari * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_vari * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toF16_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, - float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_short8 src0; - vxc_half8 in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - 
vxc_short8 outval; - half4 tmpVal0, tmpVal1; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - vxc_float4 norm; - norm = scale_vari * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_vari * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toU8( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, - float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_short8 src0; - vxc_half8 in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_uchar16 outval; - vxc_int4 tmpVal0, tmpVal1; - float alpha = outputScale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = alpha * tmpData1 + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toU8_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, - float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_short8 src0; - vxc_half8 in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_uchar16 outval; - vxc_int4 tmpVal0, tmpVal1; - float alpha = outputScale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - _viv_asm(COPY, in_h, src0, 
16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = alpha * tmpData1 + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16.vx deleted file mode 100644 index 1282e00..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16.vx +++ /dev/null @@ -1,339 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; - -_viv_uniform float inFlScale_s2; -_viv_uniform float input_fl_scale; -_viv_uniform float inOut_fl_scale; -_viv_uniform float output_fl_scale; - -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; -_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4; -_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I16( - image2d_array_t input, - image2d_array_t output, - float eps, - int is2D) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, 0); - vxc_short8 src0; - float sum = 0, sqr = 0; - vxc_float4 sumsqr = (vxc_float4)(0); - vxc_float4 tmpSumSqr = (vxc_float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - //tmpSumSqr += sumsqr; - tmpSumSqr.x += sumsqr.x; - sqr += (sumsqr.y * inFlScale_s2); - } - sum = tmpSumSqr.x * input_fl_scale; - //sqr = tmpSumSqr.y * inFlScale_s2; - } - - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I16_2D( - image2d_array_t input, - image2d_array_t output, - float eps, - int is2D) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - - int2 coord = (int2)(gidx, gidz); - vxc_short8 src0; - float sum = 0, sqr = 0; - vxc_float4 sumsqr = (vxc_float4)(0); - - 
__local float lcl_sum[16]; - __local float lcl_sqr[16]; - - if(gidx < width) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - sqr = sumsqr.y * inFlScale_s2; - sum = sumsqr.x * input_fl_scale; - //sqr = tmpSumSqr.y * inFlScale_s2; - } - - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toF16( - image2d_array_t input, - image2d_t bias, - image2d_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int is2D, - float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toF16_2D( - image2d_array_t input, - image2d_t bias, - image2d_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int is2D, - float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, 
scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toI16( - image2d_array_t input, - image2d_t bias, - image2d_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int is2D, - float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_short8 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toInt16_2x8); - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toI16_2D( - image2d_array_t input, - image2d_t bias, - image2d_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int is2D, - float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = 
(int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_short8 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toInt16_2x8); - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16_scale.vx deleted file mode 100644 index 397a5f8..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16_scale.vx +++ /dev/null @@ -1,191 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; - -_viv_uniform float input_fl_scale; -_viv_uniform float inOut_fl_scale; -_viv_uniform float output_fl_scale; - -_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4; -_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8; - - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toF16( - image2d_array_t input, - image2d_t bias, - image2d_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int is2D, - float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_short8 src0; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - - vxc_float4 norm; - norm = alpha * tmpData0 + 
bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toF16_2D( - image2d_array_t input, - image2d_t bias, - image2d_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int is2D, - float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_short8 src0; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toI16( - image2d_array_t input, - image2d_t bias, - image2d_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int is2D, - float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_short8 src0, src2; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toInt16_2x8); - 
VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toI16_2D( - image2d_array_t input, - image2d_t bias, - image2d_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int is2D, - float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_short8 src0, src2; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toInt16_2x8); - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8.vx deleted file mode 100644 index 6a407a3..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8.vx +++ /dev/null @@ -1,317 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniSumInt8_16x1; -_viv_uniform VXC_512Bits uniSqrSumInt8_16x1; -_viv_uniform float inFlScale_s2; -_viv_uniform float input_fl_scale; - -_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertTrdInt8Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4; - -_viv_uniform float inOut_fl_scale; -_viv_uniform float output_fl_scale; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I8( - image2d_array_t input, image2d_array_t output, float eps, int is2D) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, 0); - vxc_char16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - 
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1); - tmpSum += (tmpSum1); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1); - tmpSqr += (tmpSqr1); - } - sqr = tmpSqr * inFlScale_s2; - sum = tmpSum * input_fl_scale; - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I8_2D( - image2d_array_t input, image2d_array_t output, float eps, int is2D) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - - int2 coord = (int2)(gidx, gidz); - vxc_char16 src0; - float sum = 0, sqr = 0; - int tmpSum1, tmpSqr1; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - if(gidx < width) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1); - sqr = tmpSqr1 * inFlScale_s2; - sum = tmpSum1 * input_fl_scale; - } - - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toF16( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_char16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 
0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toF16_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_char16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toI8( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_char16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toI8_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_char16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, 
tmpData3, norm; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8_scale.vx deleted file mode 100644 index 350e425..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8_scale.vx +++ /dev/null @@ -1,186 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform float input_fl_scale; - -_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertTrdInt8Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4; - -_viv_uniform float inOut_fl_scale; -_viv_uniform float output_fl_scale; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toF16( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_char16 src0; - vxc_short8 outval; - vxc_half8 dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * 
tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toF16_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_char16 src0; - vxc_short8 outval; - vxc_half8 dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toI8( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_char16 src0, src2; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, 
coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toI8_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_char16 src0, src2; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); -} diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx deleted file mode 100644 index c08a996..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx +++ /dev/null @@ -1,342 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniSumU8_16x1; -_viv_uniform VXC_512Bits uniSqrSum_16x1; -_viv_uniform float input_scale; -_viv_uniform int inputZP; -_viv_uniform int sumInZp; -_viv_uniform int tmpZp1; -_viv_uniform float e2InScale; -_viv_uniform float rowSumScale; -_viv_uniform float scale_inOut; -_viv_uniform float outputScale; -_viv_uniform int output_ZP; - -_viv_uniform VXC_512Bits uniResetFp32_4x4; -_viv_uniform int group_stride; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_U8( - image2d_array_t input, image2d_array_t output, float eps, int is2D) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, 0); - vxc_uchar16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); - } - sqr += (tmpSqr * e2InScale + rowSumScale); - sum = (tmpSum + sumInZp) * input_scale; - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_U8_2D( - image2d_array_t input, image2d_array_t output, float eps, int is2D) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - - int2 coord = (int2)(gidx, get_global_id(1)); - vxc_uchar16 src0; - float sum = 0, sqr = 0; - int tmpSqr, tmpSum1, tmpSqr1; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - if(gidx < width) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - VXC_DP16x1(tmpSqr1, src0, src0, 
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr = tmpSqr1 + tmpZp1 * tmpSum1; - sqr = (tmpSqr * e2InScale + rowSumScale); - sum = (tmpSum1 + sumInZp) * input_scale; - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_meanvari( - image2d_t input, image2d_t output, float eps, float group_ratio) -{ - int gidx = get_global_id(0); - int lidx = get_local_id(0); - - int2 coord = (int2)(gidx, get_global_id(1)); - vxc_uchar16 src0; - float2 sum_sqr = (float2)(0); - vxc_float4 mean_vari; - VXC_DP4x4(mean_vari, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniResetFp32_4x4); - - __local float2 lcl_data[16]; - __local float2 lcl_sum[4]; - - for(; coord.x < group_stride; coord.x += 64) - { - mean_vari += read_imagef(input, coord); - } - lcl_data[lidx] = mean_vari.xy; - barrier(CLK_LOCAL_MEM_FENCE); - if(lidx < 4) - { - float2 tmpSum = (float2)(0); - for(int i = lidx; i < 16; i+=4) - { - tmpSum += lcl_data[i]; - } - lcl_sum[lidx] = tmpSum; - } - barrier(CLK_LOCAL_MEM_FENCE); - if(lidx == 0) - { - for(int i = 0; i < 4; i++) - { - sum_sqr += lcl_sum[i]; - } - mean_vari.xy = sum_sqr * group_ratio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - coord.x = 0; - write_imagef(output, coord, mean_vari); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_uchar16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = scale_inOut * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); 
- VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_uchar16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = scale_inOut * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toU8( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - 
vxc_uchar16 src0, src2; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = scale_inOut * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toU8_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_uchar16 src0, src2; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = scale_inOut * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + 
bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx deleted file mode 100644 index a1f4ce0..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx +++ /dev/null @@ -1,207 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; - -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform float input_scale; -_viv_uniform int inputZP; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_uchar16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - 
VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_uchar16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toF16( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidy = get_global_id(1); - int gidz = get_global_id(2); - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); - vxc_uchar16 src0; - vxc_short8 outval; - vxc_half8 dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, tmpData2, 
tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toF16_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) -{ - int gidz = get_global_id(1); - int2 coord = (int2)(get_global_id(0), gidz); - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); - vxc_uchar16 src0; - vxc_short8 outval; - vxc_half8 dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f; - - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); - bias_f = read_imagef(bias, coord_para.xy); - scale_f = read_imagef(scale, coord_para.xy); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - 
VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx index c1266fc..fce0623 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx @@ -72,7 +72,8 @@ __kernel void grucell_activation_z_h_F16_F16toF16_##act_name( \ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } -GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func) +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func) +GRUCELL_F16_F16TOF16(HSIGMOID, hard_sigmoid) _viv_uniform float hstate_in_scale; _viv_uniform float hstate_in_tail; @@ -121,6 +122,9 @@ __kernel void grucell_activation_z_h_##name0##_F16to##name1##_##act_name( \ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } -GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8) -GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8) -GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8) +GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8) +GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8) +GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8) +GRUCELL_QNT_F16TO_QNT(U8, U8, HSIGMOID, hard_sigmoid, vxc_uchar8, vxc_uchar8) +GRUCELL_QNT_F16TO_QNT(I8, I8, HSIGMOID, hard_sigmoid, vxc_char8, vxc_char8) +GRUCELL_QNT_F16TO_QNT(I16, I16, HSIGMOID, hard_sigmoid, vxc_short8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_h_times_activation_r.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_h_times_activation_r.vx index a9c8d44..1a037de 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_h_times_activation_r.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_h_times_activation_r.vx @@ -54,7 +54,8 @@ __kernel void grucell_h_times_activation_r_F16_F16toF16_##act_name( \ _viv_asm(COPY, dst, dst1, 8); \ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } -GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func) +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func) +GRUCELL_F16_F16TOF16(HSIGMOID, hard_sigmoid) _viv_uniform float hstate_in_scale; _viv_uniform float hstate_in_tail; @@ -91,6 +92,9 @@ __kernel void grucell_h_times_activation_r_##name0##_F16toF16_##act_name( \ _viv_asm(COPY, dst, dst1, 8); \ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } -GRUCELL_QNT_F16TO_F16(U8, SIGMOID, sigmoid_func, vxc_uchar8) -GRUCELL_QNT_F16TO_F16(I8, SIGMOID, sigmoid_func, vxc_char8) -GRUCELL_QNT_F16TO_F16(I16, SIGMOID, sigmoid_func, vxc_short8) +GRUCELL_QNT_F16TO_F16(U8, SIGMOID, sigmoid_func, vxc_uchar8) +GRUCELL_QNT_F16TO_F16(I8, SIGMOID, sigmoid_func, vxc_char8) +GRUCELL_QNT_F16TO_F16(I16, SIGMOID, sigmoid_func, vxc_short8) +GRUCELL_QNT_F16TO_F16(U8, HSIGMOID, hard_sigmoid, vxc_uchar8) +GRUCELL_QNT_F16TO_F16(I8, HSIGMOID, hard_sigmoid, vxc_char8) +GRUCELL_QNT_F16TO_F16(I16, HSIGMOID, hard_sigmoid, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_0.vx new file 
mode 100644 index 0000000..1644ecd --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_0.vx @@ -0,0 +1,268 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform float inv_multiplier; +_viv_uniform int group_num; + +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniSum_X_X2_16x2; +_viv_uniform float input_scale; +_viv_uniform float input_scale2; +_viv_uniform float input_zp; +_viv_uniform float sum_x_tail; +_viv_uniform float sum_x2_tail0; +_viv_uniform float sum_x2_tail1; +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +_viv_uniform VXC_512Bits uniSumX_16x1; +_viv_uniform VXC_512Bits uniSumX2_16x1; + +#define INSTANCE_NORM_SUMS_8BITS_IMPL(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidx = get_global_id(0) << 4; \ + int lidx = get_local_id(0); \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(gidx, 0, gidz, gidz); \ + src_type src0; \ + float2 sums_f32 = 0; \ + int2 sums = 0, sum_x_x2; \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + if(gidx < width) \ + { \ + for(coord.y = 0; coord.y < height;) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + sums = sums + sum_x_x2; \ + } \ + sums_f32 = convert_float2(sums); \ + sums_f32.y = sums_f32.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sums_f32.x; \ + sums_f32.x = sums_f32.x * input_scale + sum_x_tail; \ + } \ + lcl_sum[lidx] = sums_f32.x; \ + lcl_sqr[lidx] = sums_f32.y; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + float sum = 0, sqr = 0; \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + float4 data = (float4)(sum, sqr, 0, 0); \ + write_imagef(output, coord_out, data); \ + } \ +} +INSTANCE_NORM_SUMS_8BITS_IMPL(U8, vxc_uchar16) +INSTANCE_NORM_SUMS_8BITS_IMPL(I8, vxc_char16) + +#define INSTANCE_NORM_SUMS_8BITS_IMPL_2D(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name##_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidx = get_global_id(0) << 4; \ + int lidx = get_local_id(0); \ + int gidz = get_global_id(1); \ + int gidy = gidz * height; \ + \ + int2 coord = (int2)(gidx, gidy); \ + src_type src0; \ + float2 sums_f32 = 0; \ + int2 sums = 0, sum_x_x2; \ + int endH = gidy + height; \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + if (gidx < width) \ + { \ + for(; coord.y < endH;) \ + { \ + VXC_ReadImage(src0, input, coord, 0, \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + VXC_DP16x1(sum_x_x2, src0, 
src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + sums = sums + sum_x_x2; \ + } \ + sums_f32 = convert_float2(sums); \ + sums_f32.y = sums_f32.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sums_f32.x; \ + sums_f32.x = sums_f32.x * input_scale + sum_x_tail; \ + } \ + lcl_sum[lidx] = sums_f32.x; \ + lcl_sqr[lidx] = sums_f32.y; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + float sum = 0, sqr = 0; \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + float4 data = (float4)(sum, sqr, 0, 0); \ + write_imagef(output, coord_out, data); \ + } \ +} +INSTANCE_NORM_SUMS_8BITS_IMPL_2D(U8, vxc_uchar16) +INSTANCE_NORM_SUMS_8BITS_IMPL_2D(I8, vxc_char16) + +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4; +#define INSTANCE_NORM_8BITS_IMPL(name, src_type, dst_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \ + int2 coord_para = (int2)(0, gidz); \ + src_type src0; \ + dst_type dst; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f, mean_vari = (float4)(0); \ + \ + scale_f = read_imagef(scale, coord_para); \ + bias_f = read_imagef(bias, coord_para); \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_para); \ + coord_para.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + float alpha = input_scale * output_scale * scale_vari; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.z, baseAddr_a); \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr); \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord_in, 0, \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.y ++; \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ + norm = tmpData0 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData1 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + norm = tmpData2 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData3 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +INSTANCE_NORM_8BITS_IMPL(U8_F32toU8, vxc_uchar16, vxc_uchar16) +INSTANCE_NORM_8BITS_IMPL(I8_F32toI8, vxc_char16, vxc_char16) + +#define INSTANCE_NORM_8BITS_IMPL_2D(name, src_type, dst_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidz = get_global_id(1); \ + int gidy = gidz * height; \ + int2 coord = (int2)(get_global_id(0), gidy); \ + int2 coord_para = (int2)(0, gidz); \ + int endH = gidy + height; \ + src_type src0; \ + dst_type dst; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f, mean_vari = (float4)(0); \ + \ + scale_f = read_imagef(scale, coord_para); \ + bias_f = read_imagef(bias, coord_para); \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_para); \ + coord_para.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + vxc_int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + float alpha = input_scale * output_scale * scale_vari; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + \ + for(; coord.y < endH; coord.y++) \ + { \ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ + norm = tmpData0 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData1 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + norm = tmpData2 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData3 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +INSTANCE_NORM_8BITS_IMPL_2D(U8_F32toU8, vxc_uchar16, vxc_uchar16) +INSTANCE_NORM_8BITS_IMPL_2D(I8_F32toI8, vxc_char16, vxc_char16) \ No newline at end of file diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_1.vx new file mode 100644 index 0000000..82d1704 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_1.vx @@ -0,0 +1,154 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform float inv_multiplier; +_viv_uniform int group_num; +_viv_uniform float input_scale; +_viv_uniform float input_zp; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4; + +#define INSTANCE_NORM_8_TO_F16_IMPL(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_para = (int4)(0, gidz, 0, 0); \ + src_type src0; \ + vxc_short8 outval; \ + vxc_half8 dst; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f, mean_vari = (float4)(0); \ + \ + scale_f = read_imagef(scale, coord_para.xy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_para.xy); \ + coord_para.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + half4 tmpVal0, tmpVal1; \ + float alpha = scale_vari * input_scale; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \ + bias_val = bias_val - input_zp * alpha; \ + \ + coord_para = coord; \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_para.z, baseAddr); \ + for(coord.y = 0; coord.y < height;) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_para.xy = coord.xy; \ + coord.y++; \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + coord_para.x += 8; \ + norm = alpha * tmpData2 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha 
* tmpData3 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +INSTANCE_NORM_8_TO_F16_IMPL(U8_F32toF16, vxc_uchar16) +INSTANCE_NORM_8_TO_F16_IMPL(I8_F32toF16, vxc_char16) + +#define INSTANCE_NORM_8_TO_F16_IMPL_2D(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidz = get_global_id(1); \ + int gidy = gidz * height; \ + int4 coord = (int4)(get_global_id(0), gidy, 0, 0); \ + int4 coord_para = (int4)(0, gidz, 0, 0); \ + int endH = gidy + height; \ + src_type src0; \ + vxc_short8 outval; \ + vxc_half8 dst; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f, mean_vari = (float4)(0); \ + \ + scale_f = read_imagef(scale, coord_para.xy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_para.xy); \ + coord_para.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + half4 tmpVal0, tmpVal1; \ + float alpha = scale_vari * input_scale; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \ + bias_val = bias_val - input_zp * alpha; \ + for(; coord.y < endH;) \ + { \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_para = coord; \ + coord.y++; \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_para.x += 8; \ + norm = alpha * tmpData2 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData3 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +INSTANCE_NORM_8_TO_F16_IMPL_2D(U8_F32toF16, vxc_uchar16) +INSTANCE_NORM_8_TO_F16_IMPL_2D(I8_F32toF16, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_2.vx new file mode 100644 index 0000000..75221f4 --- /dev/null +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_2.vx @@ -0,0 +1,285 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform float inv_multiplier; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniSum_X_X2_8x2; +_viv_uniform float input_scale; +_viv_uniform float input_scale2; +_viv_uniform float input_zp; +_viv_uniform float sum_x_tail; +_viv_uniform float sum_x2_tail0; +_viv_uniform float sum_x2_tail1; + +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define INSTANCE_NORM_SUMS_16BITS_IMPL(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidx = get_global_id(0) << 3; \ + int lidx = get_local_id(0); \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(gidx, 0, gidz, gidz); \ + vxc_short8 src0; \ + src_type in_h; \ + float4 sumsqr; \ + float4 tmpSumSqr = (float4)(0); \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + \ + if(gidx < width) \ + { \ + for(coord.y = 0; coord.y < height;) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), \ + uniSum_X_X2_8x2); \ + tmpSumSqr += sumsqr; \ + } \ + tmpSumSqr.y = tmpSumSqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * tmpSumSqr.x; \ + tmpSumSqr.x = tmpSumSqr.x * input_scale + sum_x_tail; \ + } \ + \ + lcl_sum[lidx] = tmpSumSqr.x; \ + lcl_sqr[lidx] = tmpSumSqr.y; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + \ + float sum = 0; \ + float sqr = 0; \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + \ + float4 data = (float4)(sum, sqr, 0, 0); \ + write_imagef(output, coord_out, data); \ + } \ +} +INSTANCE_NORM_SUMS_16BITS_IMPL(F16, vxc_half8) +INSTANCE_NORM_SUMS_16BITS_IMPL(I16, vxc_short8) + +#define INSTANCE_NORM_SUMS_16BITS_IMPL_2D(name, src_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name##_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidx = get_global_id(0) << 3; \ + int lidx = get_local_id(0); \ + int gidz = get_global_id(1); \ + int gidy = gidz * height; \ + \ + int2 coord = (int2)(gidx, gidy); \ + vxc_short8 src0; \ + src_type in_h; \ + float4 sumsqr; \ + float4 tmpSumSqr = (float4)(0); \ + \ + __local float lcl_sum[16]; \ + __local float lcl_sqr[16]; \ + \ + int endH = gidy + height; \ + if(gidx < width) \ + { \ + for(; coord.y < endH;) \ + { \ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 
0, VXC_RM_TowardZero, 0), \ + uniSum_X_X2_8x2); \ + tmpSumSqr += sumsqr; \ + } \ + tmpSumSqr.y = tmpSumSqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * tmpSumSqr.x; \ + tmpSumSqr.x = tmpSumSqr.x * input_scale + sum_x_tail; \ + } \ + \ + lcl_sum[lidx] = tmpSumSqr.x; \ + lcl_sqr[lidx] = tmpSumSqr.y; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + \ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \ + if(lidx == 0) \ + { \ + float4 one = (float4)(1, 1, 1, 1); \ + __local float4* tmp_sum = (__local float4*)lcl_sum; \ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \ + \ + float sum = 0; \ + float sqr = 0; \ + for(int i = 0; i < 4; i++) \ + { \ + sum += dot(tmp_sum[i], one); \ + sqr += dot(tmp_sqr[i], one); \ + } \ + \ + float4 data = (float4)(sum, sqr, 0, 0); \ + write_imagef(output, coord_out, data); \ + } \ +} +INSTANCE_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8) +INSTANCE_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8) + +#define INSTANCE_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_para = (int4)(0, gidz, 0, 0); \ + vxc_short8 src0; \ + src_type in_h; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f, mean_vari = (float4)(0); \ + \ + scale_f = read_imagef(scale, coord_para.xy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_para.xy); \ + coord_para.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + float alpha = input_scale * output_scale * scale_vari; \ + float4 tmpData0, tmpData1; \ + copy_type outval; \ + conv_type tmpVal0, tmpVal1; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + dst_type dst; \ + \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr); \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, in_h, src0, 16); \ + \ + coord_in.y ++; \ + \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + \ + float4 norm; \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV_RTE, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV_RTE, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, 
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +INSTANCE_NORM_16BITS_IMPL(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4) +INSTANCE_NORM_16BITS_IMPL(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4) +INSTANCE_NORM_16BITS_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4) +INSTANCE_NORM_16BITS_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +INSTANCE_NORM_16BITS_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4) +INSTANCE_NORM_16BITS_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4) + +#define INSTANCE_NORM_16BITS_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __read_only image2d_t meanVari, \ + __write_only image2d_array_t output, \ + float eps, int rs_flag) \ +{ \ + int gidz = get_global_id(1); \ + int gidy = gidz * height; \ + int4 coord = (int4)(get_global_id(0), gidy, 0, 0); \ + int4 coord_para = (int4)(0, gidz, 0, 0); \ + int endH = gidy + height; \ + vxc_short8 src0; \ + src_type in_h; \ + float scale_vari, bias_val; \ + float4 bias_f, scale_f, mean_vari = (float4)(0); \ + \ + scale_f = read_imagef(scale, coord_para.xy); \ + bias_f = read_imagef(bias, coord_para.xy); \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_para.xy); \ + coord_para.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + scale_vari = scale_f.s0 * mean_vari.s1; \ + float alpha = input_scale * output_scale * scale_vari; \ + float4 tmpData0, tmpData1; \ + copy_type outval; \ + conv_type tmpVal0, tmpVal1; \ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ + bias_val = bias_val - input_zp * alpha; \ + dst_type dst; \ + \ + for(; coord.y < endH; coord.y++) \ + { \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, in_h, src0, 16); \ + \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ + float4 norm; \ + norm = alpha * tmpData0 + bias_val; \ + _viv_asm(CONV, tmpVal0, norm); \ + norm = alpha * tmpData1 + bias_val; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +INSTANCE_NORM_16BITS_IMPL_2D(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4) +INSTANCE_NORM_16BITS_IMPL_2D(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4) +INSTANCE_NORM_16BITS_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4) +INSTANCE_NORM_16BITS_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +INSTANCE_NORM_16BITS_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4) +INSTANCE_NORM_16BITS_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_bf16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_3.vx similarity index 87% rename from src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_bf16.vx 
rename to src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_3.vx index bba8627..19f335b 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_bf16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_3.vx @@ -2,16 +2,13 @@ _viv_uniform int width; _viv_uniform int height; -_viv_uniform float dimRatio; +_viv_uniform float inv_multiplier; _viv_uniform int group_num; _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; _viv_uniform VXC_512Bits uniExtractOddData_2x8; -constant vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); -constant float4 one = (float4)(1.0, 1.0, 1.0, 1.0); - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_BF16( +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_BF16( image2d_array_t input, image2d_array_t output, float eps, int rsFlg) { int gidx = get_global_id(0) << 3; @@ -20,8 +17,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int4 coord = (int4)(gidx, 0, gidz, gidz); vxc_short8 src0, src1, src2; float4 srcA, srcB; - vxc_float sum = 0, sqr = 0; - + float sum = 0, sqr = 0; + float4 one = (float4)(1.0, 1.0, 1.0, 1.0); + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); __local float lcl_sum[16]; __local float lcl_sqr[16]; @@ -71,7 +69,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean } } -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_BF16_2D( +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_BF16_2D( image2d_array_t input, image2d_array_t output, float eps, int rsFlg) { int gidx = get_global_id(0) << 3; @@ -82,7 +80,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int2 coord = (int2)(gidx, gidy); vxc_short8 src0, src1, src2; float4 srcA, srcB; - vxc_float sum = 0, sqr = 0; + float sum = 0, sqr = 0; + float4 one = (float4)(1.0, 1.0, 1.0, 1.0); + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); __local float lcl_sum[16]; __local float lcl_sqr[16]; @@ -129,7 +129,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean } } -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16F32toBF16( +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16_F32toBF16( image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, float eps, int rsFlg) { @@ -138,30 +138,26 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); vxc_short8 src0, src1, src2; float scale_vari, bias_val; - vxc_float4 mean_vari = (vxc_float4)(0); + float4 mean_vari = (float4)(0); - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); Image img3 = create_image_from_image2d(meanVari, 4); - __global float* bias_ptr = (__global float*)img1.ptr; - __global float* scal_ptr = (__global float*)img2.ptr; __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz)); __global float4* vari_ptr = (__global float4*)sumVari_ptr; - float bval = bias_ptr[gidz]; - float sval = scal_ptr[gidz]; + float sval = read_imagef(scale, coord.yz).x; + float bval = read_imagef(bias, coord.yz).x; for(int i = 0; i < group_num; i++) { mean_vari += vari_ptr[i]; } - mean_vari *= 
dimRatio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = sval * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; + float4 tmpData0, tmpData1; bias_val = (bval - scale_vari * mean_vari.s0); int8 input_desc, output_desc; @@ -185,7 +181,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 _viv_asm(COPY, tmpData0, src1, 16); _viv_asm(COPY, tmpData1, src2, 16); - vxc_float4 norm; + float4 norm; norm = scale_vari * tmpData0 + bias_val; _viv_asm(COPY, src0, norm, 16); norm = scale_vari * tmpData1 + bias_val; @@ -207,30 +203,26 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 int endH = gidy + height; vxc_short8 src0, src1, src2; float scale_vari, bias_val; - vxc_float4 mean_vari = (vxc_float4)(0); + float4 mean_vari = (float4)(0); - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); Image img3 = create_image_from_image2d(meanVari, 4); - __global float* bias_ptr = (__global float*)img1.ptr; - __global float* scal_ptr = (__global float*)img2.ptr; __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); __global float4* vari_ptr = (__global float4*)sumVari_ptr; - float bval = bias_ptr[gidz]; - float sval = scal_ptr[gidz]; + float sval = read_imagef(scale, coord_para.yx).x; + float bval = read_imagef(bias, coord_para.yx).x; for(int i = 0; i < group_num; i++) { mean_vari += vari_ptr[i]; } - mean_vari *= dimRatio; + mean_vari *= inv_multiplier; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = sval * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; + float4 tmpData0, tmpData1; bias_val = (bval - scale_vari * mean_vari.s0); for(; coord.y < endH; coord.y++) @@ -244,7 +236,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 _viv_asm(COPY, tmpData0, src1, 16); _viv_asm(COPY, tmpData1, src2, 16); - vxc_float4 norm; + float4 norm; norm = scale_vari * tmpData0 + bias_val; _viv_asm(COPY, src0, norm, 16); norm = scale_vari * tmpData1 + bias_val; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx deleted file mode 100644 index 2fd2d44..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx +++ /dev/null @@ -1,259 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; - -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; -_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16( - image2d_array_t input, image2d_array_t output, float eps, int rsFlg) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, gidz); - vxc_short8 src0; - vxc_half8 in_h; - vxc_float4 sumsqr; - vxc_float4 tmpSumSqr = (vxc_float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - if(gidx < width) - { - 
for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - _viv_asm(COPY, in_h, src0, 16); - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - tmpSumSqr += sumsqr; - } - } - - lcl_sum[lidx] = tmpSumSqr.x; - lcl_sqr[lidx] = tmpSumSqr.y; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - float sum = 0; - float sqr = 0; - for(int i = 0; i < 4; i++) - { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16_2D( - image2d_array_t input, image2d_array_t output, float eps, int rsFlg) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int gidy = gidz * height; - - int2 coord = (int2)(gidx, gidy); - vxc_short8 src0; - vxc_half8 in_h; - vxc_float4 sumsqr; - vxc_float4 tmpSumSqr = (vxc_float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int endH = gidy + height; - if(gidx < width) - { - for(; coord.y < endH;) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - _viv_asm(COPY, in_h, src0, 16); - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - tmpSumSqr += sumsqr; - } - } - - lcl_sum[lidx] = tmpSumSqr.x; - lcl_sqr[lidx] = tmpSumSqr.y; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - float sum = 0; - float sqr = 0; - for(int i = 0; i < 4; i++) - { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_para = (int4)(gidz, 0, 0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - bias_f = read_imagef(bias, coord_para); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 
* mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, in_h, src0, 16); - - coord_in.y ++; - - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - - vxc_float4 norm; - norm = scale_vari * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_vari * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16_2D( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int4 coord = (int4)(get_global_id(0), gidy, 0, 0); - int4 coord_para = (int4)(gidz, 0, 0, 0); - int endH = gidy + height; - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - bias_f = read_imagef(bias, coord_para); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - for(; coord.y < endH; coord.y++) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, in_h, src0, 16); - - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - vxc_float4 norm; - norm = scale_vari * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_vari * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 
0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx deleted file mode 100644 index fa5538c..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx +++ /dev/null @@ -1,416 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; - -_viv_uniform float inFlScale_s2; -_viv_uniform float input_fl_scale; -_viv_uniform float inOut_fl_scale; -_viv_uniform float output_fl_scale; - -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; -_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4; -_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I16( - image2d_array_t input, - image2d_array_t output, - float eps, - int rsFlg) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, gidz); - vxc_short8 src0; - float sum = 0, sqr = 0; - vxc_float4 sumsqr = (vxc_float4)(0); - vxc_float4 tmpSumSqr = (vxc_float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, 0, \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - //tmpSumSqr += sumsqr; - tmpSumSqr.x += sumsqr.x; - sqr += (sumsqr.y * inFlScale_s2); - } - sum = tmpSumSqr.x * input_fl_scale; - //sqr = tmpSumSqr.y * inFlScale_s2; - } - - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I16_2D( - image2d_array_t input, - image2d_array_t output, - float eps, - int rsFlg) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int gidy = gidz * height; - - int2 coord = (int2)(gidx, gidy); - vxc_short8 src0; - float sum = 0, sqr = 0; - vxc_float4 sumsqr = (vxc_float4)(0); - vxc_float4 tmpSumSqr = (vxc_float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int endH = gidy + height; - if(gidx < width) - { - for(; coord.y < endH;) - { - VXC_ReadImage(src0, input, coord, 0, - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - //tmpSumSqr += sumsqr; - tmpSumSqr.x += sumsqr.x; - sqr += (sumsqr.y * inFlScale_s2); 
- } - sum = tmpSumSqr.x * input_fl_scale; - //sqr = tmpSumSqr.y * inFlScale_s2; - } - - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toF16( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_para = (int4)(gidz, 0, 0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, 0,\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, 0, \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toF16_2D( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int4 coord = (int4)(get_global_id(0), gidy, 0, 0); 
- int4 coord_para = (int4)(gidz, 0, 0, 0); - int endH = gidy + height; - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, 0,\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - for(; coord.y < endH; coord.y++) - { - VXC_ReadImage(src0, input, coord.xy, 0,\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toI16( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_para = (int4)(gidz, 0, 0, 0); - vxc_short8 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, 0,\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - for(coord.y = 0; coord.y < height; coord.y++) - { - 
VXC_OP4(img_load_3d, src0, input, coord_in, 0, \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toInt16_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord, src2, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toI16_2D( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int2 coord = (int2)(get_global_id(0), gidy); - int4 coord_para = (int4)(gidz, 0, 0, 0); - int endH = gidy + height; - vxc_short8 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, 0,\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - for(; coord.y < endH; coord.y++) - { - VXC_ReadImage(src0, input, coord, 0,\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toInt16_2x8); - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx deleted file mode 100644 index a6c98ef..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx +++ /dev/null @@ -1,397 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniSumInt8_16x1; -_viv_uniform VXC_512Bits uniSqrSumInt8_16x1; -_viv_uniform float inFlScale_s2; -_viv_uniform 
float input_fl_scale; - -_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertTrdInt8Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4; - -_viv_uniform float inOut_fl_scale; -_viv_uniform float output_fl_scale; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8( - image2d_array_t input, image2d_array_t output, float eps, int rsFlg) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, gidz); - vxc_char16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1); - tmpSum += (tmpSum1); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1); - tmpSqr += (tmpSqr1); - } - sqr = tmpSqr * inFlScale_s2; - sum = tmpSum * input_fl_scale; - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8_2D( - image2d_array_t input, image2d_array_t output, float eps, int rsFlg) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int gidy = gidz * height; - - int2 coord = (int2)(gidx, gidy); - vxc_char16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int endH = gidy + height; - if(gidx < width) - { - for(; coord.y < endH;) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1); - tmpSum += (tmpSum1); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1); - tmpSqr += (tmpSqr1); - } - sqr = tmpSqr * inFlScale_s2; - sum = tmpSum * input_fl_scale; - } - - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel 
__attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_para = (int4)(gidz, 0, 0, 0); - vxc_char16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - bias_f = read_imagef(bias, coord_para); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - coord_para = coord; - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_para.z, baseAddr); - - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_para.xy = coord.xy; - coord.y++; - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_para.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16_2D( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int4 coord = (int4)(get_global_id(0), gidy, 0, 0); - int4 coord_para = (int4)(gidz, 0, 0, 0); - int endH = gidy + height; - 
vxc_char16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - for(; coord.y < endH;) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_para = coord; - coord.y++; - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_para.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_para = (int4)(gidz, 0, 0, 0); - vxc_char16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - 
vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8_2D( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int2 coord = (int2)(get_global_id(0), gidy); - int4 coord_para = (int4)(gidz, 0, 0, 0); - int endH = gidy + height; - vxc_char16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; - - for(; coord.y < endH; coord.y++) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, 
src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - } -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32.vx deleted file mode 100644 index b81a1a1..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32.vx +++ /dev/null @@ -1,289 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int height; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform int inputZP; -_viv_uniform float scale_inOut; -_viv_uniform float outputScale; -_viv_uniform int output_ZP; -_viv_uniform float inOut_fl_scale; -_viv_uniform float output_fl_scale; -_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4; -_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8; - -#define INSTANCENORM_8BITS_F32(src1_type_name, read_type) \ -__kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, \ - image2d_array_t output, float eps, int rsFlg) \ -{ \ - int gidz = get_global_id(1); \ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \ - int2 coord_para = (int2)(gidz, 0); \ - read_type src0, src2; \ - float scale_vari, bias_val; \ - vxc_float4 mean_vari = (vxc_float4)(0); \ - \ - Image img1 = create_image_from_image2d(bias, 4); \ - Image img2 = create_image_from_image2d(scale, 4); \ - Image img3 = create_image_from_image2d(meanVari, 4); \ - __global float* bias_ptr = (__global float*)img1.ptr; \ - __global float* scal_ptr = (__global float*)img2.ptr; \ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); \ - __global float4* vari_ptr = (__global float4*)sumVari_ptr; \ - \ - float bval = bias_ptr[gidz]; \ - float sval = scal_ptr[gidz]; \ - \ - for(int i = 0; i < group_num; i++) \ - { \ - mean_vari += vari_ptr[i]; \ - } \ - mean_vari *= dimRatio; \ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ - mean_vari.s1 = rsqrt(mean_vari.s1); \ - \ - scale_vari = sval * mean_vari.s1; \ - short zp = inputZP; \ - vxc_int4 tmpVal0, tmpVal1; \ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ - float alpha = scale_inOut * scale_vari; \ - bias_val = (bval - scale_vari * mean_vari.s0) * outputScale + output_ZP; \ - \ - int8 input_desc, output_desc; \ - _viv_asm(COPY, input_desc, input, 
sizeof(input_desc)); \ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \ - _viv_asm(MOV, coord_in.z, baseAddr_a); \ - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \ - _viv_asm(MOV, coord.z, baseAddr); \ - \ - for(coord.y = 0; coord.y < height; coord.y++) \ - { \ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - coord_in.y ++; \ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ - uniConvert1stUint8SubZpToFp32_4x4); \ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ - uniConvert2ndUint8SubZpToFp32_4x4); \ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ - uniConvert3rdUint8SubZpToFp32_4x4); \ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ - uniConvert4thUint8SubZpToFp32_4x4); \ - norm = tmpData0 * alpha + bias_val; \ - tmpVal0 = convert_int4_rte(norm); \ - norm = tmpData1 * alpha + bias_val; \ - tmpVal1 = convert_int4_rte(norm); \ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ - norm = tmpData2 * alpha + bias_val; \ - tmpVal0 = convert_int4_rte(norm); \ - norm = tmpData3 * alpha + bias_val; \ - tmpVal1 = convert_int4_rte(norm); \ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ - VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ - } \ -} -INSTANCENORM_8BITS_F32(U8, vxc_uchar16) -INSTANCENORM_8BITS_F32(I8, vxc_char16) - -#define INSTANCENORM_8BITS_F32_2D(src1_type_name, read_type) \ -__kernel void instance_norm_##src1_type_name##F32to##src1_type_name##_2D( \ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, \ - image2d_array_t output, float eps, int rsFlg) \ -{ \ - int gidz = get_global_id(1); \ - int gidy = gidz * height; \ - int2 coord = (int2)(get_global_id(0), gidy); \ - int2 coord_para = (int2)(gidz, 0); \ - int endH = gidy + height; \ - read_type src0, src2; \ - float scale_vari, bias_val; \ - vxc_float4 mean_vari = (vxc_float4)(0); \ - \ - Image img1 = create_image_from_image2d(bias, 4); \ - Image img2 = create_image_from_image2d(scale, 4); \ - Image img3 = create_image_from_image2d(meanVari, 4); \ - __global float* bias_ptr = (__global float*)img1.ptr; \ - __global float* scal_ptr = (__global float*)img2.ptr; \ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); \ - __global float4* vari_ptr = (__global float4*)sumVari_ptr; \ - \ - float bval = bias_ptr[gidz]; \ - float sval = scal_ptr[gidz]; \ - \ - for(int i = 0; i < group_num; i++) \ - { \ - mean_vari += vari_ptr[i]; \ - } \ - \ - mean_vari *= dimRatio; \ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ - mean_vari.s1 = rsqrt(mean_vari.s1); \ - \ - scale_vari = sval * mean_vari.s1; \ - short zp = inputZP; \ - vxc_int4 tmpVal0, tmpVal1; \ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ - float alpha = scale_inOut * scale_vari; \ - bias_val = (bval - scale_vari * mean_vari.s0) * outputScale + output_ZP; \ - \ - for(; coord.y < endH; coord.y++) \ - { \ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ - 
uniConvert1stUint8SubZpToFp32_4x4); \ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ - uniConvert2ndUint8SubZpToFp32_4x4); \ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ - uniConvert3rdUint8SubZpToFp32_4x4); \ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ - uniConvert4thUint8SubZpToFp32_4x4); \ - norm = tmpData0 * alpha + bias_val; \ - tmpVal0 = convert_int4_rte(norm); \ - norm = tmpData1 * alpha + bias_val; \ - tmpVal1 = convert_int4_rte(norm); \ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ - norm = tmpData2 * alpha + bias_val; \ - tmpVal0 = convert_int4_rte(norm); \ - norm = tmpData3 * alpha + bias_val; \ - tmpVal1 = convert_int4_rte(norm); \ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - } \ -} -INSTANCENORM_8BITS_F32_2D(U8, vxc_uchar16) -INSTANCENORM_8BITS_F32_2D(I8, vxc_char16) - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F32toI16( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int2 coord_para = (int2)(gidz, 0); - vxc_short8 src0, src2; - float scale_vari, bias_val; - vxc_float4 mean_vari = (vxc_float4)(0); - - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - Image img3 = create_image_from_image2d(meanVari, 4); - __global float* bias_ptr = (__global float*)img1.ptr; - __global float* scal_ptr = (__global float*)img2.ptr; - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); - __global float4* vari_ptr = (__global float4*)sumVari_ptr; - - float bval = bias_ptr[gidz]; - float sval = scal_ptr[gidz]; - - for(int i = 0; i < group_num; i++) - { - mean_vari += vari_ptr[i]; - } - - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = sval * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bval - scale_vari * mean_vari.s0) * output_fl_scale; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 1),\ - uniConvertInt32toInt16_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord, src2, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F32toI16_2D( - image2d_t input, image2d_t bias, image2d_t scale, - image2d_t meanVari, image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int2 coord = (int2)(get_global_id(0), gidy); - int2 coord_para = (int2)(gidz, 0); - int endH = gidy + height; - vxc_short8 src0, src2; - float scale_vari, bias_val; - vxc_float4 mean_vari = (vxc_float4)(0); - - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - Image img3 = create_image_from_image2d(meanVari, 4); - __global float* bias_ptr = (__global float*)img1.ptr; - __global float* scal_ptr = (__global float*)img2.ptr; - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); - __global float4* vari_ptr = (__global float4*)sumVari_ptr; - - float bval = bias_ptr[gidz]; - float sval = scal_ptr[gidz]; - - for(int i = 0; i < group_num; i++) - { - mean_vari += vari_ptr[i]; - } - - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = sval * mean_vari.s1; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1; - float alpha = inOut_fl_scale * scale_vari; - bias_val = (bval - scale_vari * mean_vari.s0) * output_fl_scale; - - for(; coord.y < endH; coord.y++) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Fst_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertInt16Fp32Secd_4x4); - vxc_float4 norm; - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toInt16_2x8); - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_f16.vx deleted file mode 100644 index d51e38e..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_f16.vx +++ /dev/null @@ -1,146 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int height; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; - -_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F32toF16( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - vxc_short8 src0; - vxc_half8 in_h; - float scale_vari, bias_val; - vxc_float4 mean_vari = (vxc_float4)(0); - - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - Image img3 = 
create_image_from_image2d(meanVari, 4); - __global float* bias_ptr = (__global float*)img1.ptr; - __global float* scal_ptr = (__global float*)img2.ptr; - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz)); - __global float4* vari_ptr = (__global float4*)sumVari_ptr; - - float bval = bias_ptr[gidz]; - float sval = scal_ptr[gidz]; - - for(int i = 0; i < group_num; i++) - { - mean_vari += vari_ptr[i]; - } - - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = sval * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - bias_val = (bval - scale_vari * mean_vari.s0); - vxc_half8 dst; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, in_h, src0, 16); - - coord_in.y ++; - - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - - vxc_float4 norm; - norm = scale_vari * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_vari * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F32toF16_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int2 coord = (int2)(get_global_id(0), gidy); - int2 coord_para = (int2)(gidz, 0); - int endH = gidy + height; - vxc_short8 src0; - vxc_half8 in_h; - float scale_vari, bias_val; - vxc_float4 mean_vari = (vxc_float4)(0); - - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - Image img3 = create_image_from_image2d(meanVari, 4); - __global float* bias_ptr = (__global float*)img1.ptr; - __global float* scal_ptr = (__global float*)img2.ptr; - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); - __global float4* vari_ptr = (__global float4*)sumVari_ptr; - - float bval = bias_ptr[gidz]; - float sval = scal_ptr[gidz]; - - for(int i = 0; i < group_num; i++) - { - mean_vari += vari_ptr[i]; - } - - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = sval * mean_vari.s1; - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - bias_val = (bval - scale_vari * mean_vari.s0); - vxc_half8 dst; - - for(; coord.y < endH; coord.y++) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0)); - _viv_asm(COPY, in_h, src0, 16); - - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt16Fp32_4x4); - vxc_float4 norm; - norm = scale_vari * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_vari * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx deleted file mode 100644 index 5c0f235..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx +++ /dev/null @@ -1,254 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniSumU8_16x1; -_viv_uniform VXC_512Bits uniSqrSum_16x1; -_viv_uniform float input_scale; -_viv_uniform int inputZP; -_viv_uniform int sumInZp; -_viv_uniform int tmpZp1; -_viv_uniform float e2InScale; -_viv_uniform float rowSumScale; -_viv_uniform float scale_inOut; -_viv_uniform float outputScale; -_viv_uniform int output_ZP; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8( - image2d_array_t input, image2d_array_t output, float eps, int rsFlg) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, gidz); - vxc_uchar16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, 0, \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); - } - sqr += (tmpSqr * e2InScale + rowSumScale); - sum = (tmpSum + sumInZp) * input_scale; - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, 
data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8_2D( - image2d_array_t input, image2d_array_t output, float eps, int rsFlg) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int gidy = gidz * height; - - int2 coord = (int2)(gidx, gidy); - vxc_uchar16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; - int endH = gidy + height; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - if(gidx < width) - { - for(; coord.y < endH;) - { - VXC_ReadImage(src0, input, coord, 0, - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); - } - sqr += (tmpSqr * e2InScale + rowSumScale); - sum = (tmpSum + sumInZp) * input_scale; - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_para = (int4)(gidz, 0, 0, 0); - vxc_uchar16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, 0,\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = scale_inOut * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, 0, \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8_2D( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int2 coord = (int2)(get_global_id(0), gidy); - int4 coord_para = (int4)(gidz, 0, 0, 0); - int endH = gidy + height; - vxc_uchar16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, 0,\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - float alpha = scale_inOut * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - - for(; coord.y < endH; coord.y++) - { - VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = tmpData0 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData1 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - norm = tmpData2 * alpha + bias_val; - tmpVal0 = convert_int4_rte(norm); - norm = tmpData3 * alpha + bias_val; - tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); 
- } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8_f16.vx deleted file mode 100644 index b737ffe..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8_f16.vx +++ /dev/null @@ -1,147 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; - -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform float input_scale; -_viv_uniform int inputZP; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_para = (int4)(gidz, 0, 0, 0); - vxc_uchar16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - bias_f = read_imagef(bias, coord_para); - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - - coord_para = coord; - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_para.z, baseAddr); - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_para.xy = coord.xy; - coord.y++; - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, 
outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_para.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16_2D( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int4 coord = (int4)(get_global_id(0), gidy, 0, 0); - int4 coord_para = (int4)(gidz, 0, 0, 0); - int endH = gidy + height; - vxc_uchar16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); - bias_f = read_imagef(bias, coord_para); - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para.yx); - coord_para.y += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; - half4 tmpVal0, tmpVal1; - float alpha = input_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - for(; coord.y < endH;) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_para = coord; - coord.y++; - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_para.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx index bd3a733..95d9c87 100644 --- 
a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx @@ -94,6 +94,7 @@ _viv_uniform float zpSqrt16x; _viv_uniform VXC_512Bits uniSumAll_16x1; _viv_uniform int inputZP; + #define L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \ vxc_float4 rsqrt0;\ Image dst_img = create_image_from_image2d(output, 1); \ @@ -143,31 +144,31 @@ _viv_uniform int inputZP; dst_ptr[0] = dst.s0; \ break; \ case 2: \ - VXC_Vstore2(dst_ptr, 0, dst); \ + VXC_Vstore2(dst_ptr, 0, dst.s01); \ break; \ case 3: \ - VXC_Vstore3(dst_ptr, 0, dst); \ + VXC_Vstore3(dst_ptr, 0, dst.s012); \ break; \ case 4: \ - VXC_Vstore4(dst_ptr, 0, dst); \ + VXC_Vstore4(dst_ptr, 0, dst.s0123); \ break; \ case 5: \ - VXC_Vstore2(dst_ptr, 0, dst); \ + VXC_Vstore2(dst_ptr, 0, dst.s01); \ dst.s012 = dst.s234; \ dst_ptr += 2; \ - VXC_Vstore3(dst_ptr, 0, dst); \ + VXC_Vstore3(dst_ptr, 0, dst.s012); \ break; \ case 6: \ - VXC_Vstore3(dst_ptr, 0, dst); \ + VXC_Vstore3(dst_ptr, 0, dst.s012); \ dst.s012 = dst.s345; \ dst_ptr += 3; \ - VXC_Vstore3(dst_ptr, 0, dst); \ + VXC_Vstore3(dst_ptr, 0, dst.s012); \ break; \ case 7: \ - VXC_Vstore4(dst_ptr, 0, dst); \ + VXC_Vstore4(dst_ptr, 0, dst.s0123); \ dst.s012 = dst.s456; \ dst_ptr += 4; \ - VXC_Vstore3(dst_ptr, 0, dst); \ + VXC_Vstore3(dst_ptr, 0, dst.s012); \ break; \ default: \ VXC_Vstore8(dst_ptr, 0, dst); \ @@ -177,16 +178,13 @@ _viv_uniform int inputZP; } \ -#define L2NORMSCALE_AXIS0_2D(in0_name, in1_name, out_name, read_type, read_type2, src_type, INPUTSCALE, \ +#define L2NORMSCALE_AXIS0(in0_name, in1_name, out_name, read_type, read_type2, src_type, INPUTSCALE, \ dst_type, convert_type, output_type, copy_type) \ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \ - void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \ + void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name \ (\ - __read_only image2d_t input,\ - __read_only image2d_t scale,\ - __write_only image2d_t output,\ - int axis\ - )\ + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output,\ + int axis )\ { \ int lidx = get_local_id(0); \ int offset = get_global_id(0); \ @@ -236,19 +234,15 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \ L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \ } -L2NORMSCALE_AXIS0_2D(F16, F16, F16, ushort, vxc_ushort8, vxc_half8, 1, \ +L2NORMSCALE_AXIS0(F16, F16, F16, ushort, vxc_ushort8, vxc_half8, 1, \ ushort, half4, vxc_half8, vxc_ushort8) -#define L2NORMSCALE_AXIS0_QNT_2D(in0_name, in1_name, out_name,\ +#define L2NORMSCALE_AXIS0_QNT(in0_name, in1_name, out_name,\ src_type, src_scalar_type, dst_type, convert_type, output_type, copy_type) \ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \ -void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \ +void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name \ (\ - __read_only image2d_t input,\ - __read_only image2d_t scale,\ - __write_only image2d_t output,\ - int axis\ - )\ + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output, int axis)\ { \ int lidx = get_local_id(0); \ int offset = get_global_id(0); \ @@ -302,9 +296,9 @@ void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \ L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \ } -L2NORMSCALE_AXIS0_QNT_2D(U8, F16, F16, vxc_uchar8, uchar, ushort, half4, vxc_half8, vxc_ushort8)
-L2NORMSCALE_AXIS0_QNT_2D(U8, F16, U8, vxc_uchar8, uchar, uchar, int4, vxc_uchar8, vxc_uchar8) -L2NORMSCALE_AXIS0_QNT_2D(I8, F16, F16, vxc_char8, char, ushort, half4, vxc_half8, vxc_ushort8) -L2NORMSCALE_AXIS0_QNT_2D(I8, F16, I8, vxc_char8, char, char, int4, vxc_char8, vxc_char8) -L2NORMSCALE_AXIS0_QNT_2D(I16, F16, F16, vxc_short8, short, ushort, half4, vxc_half8, vxc_ushort8) -L2NORMSCALE_AXIS0_QNT_2D(I16, F16, I16, vxc_short8, short, short, int4, vxc_short8, vxc_short8) +L2NORMSCALE_AXIS0_QNT(U8, F16, F16, vxc_uchar8, uchar, ushort, half4, vxc_half8, vxc_ushort8) +L2NORMSCALE_AXIS0_QNT(U8, F16, U8, vxc_uchar8, uchar, uchar, int4, vxc_uchar8, vxc_uchar8) +L2NORMSCALE_AXIS0_QNT(I8, F16, F16, vxc_char8, char, ushort, half4, vxc_half8, vxc_ushort8) +L2NORMSCALE_AXIS0_QNT(I8, F16, I8, vxc_char8, char, char, int4, vxc_char8, vxc_char8) +L2NORMSCALE_AXIS0_QNT(I16, F16, F16, vxc_short8, short, ushort, half4, vxc_half8, vxc_ushort8) +L2NORMSCALE_AXIS0_QNT(I16, F16, I16, vxc_short8, short, short, int4, vxc_short8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0_2d.vx new file mode 100644 index 0000000..f214d53 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0_2d.vx @@ -0,0 +1,207 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int inputWidth; +_viv_uniform float output_ZP; +_viv_uniform float zP2x; +_viv_uniform int inputZP; + +_viv_uniform float inOutScale; +_viv_uniform float e2InScale; +_viv_uniform float zpSqr8x; +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; + +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) + void l2normalizescale_axis0_F16_F16toF16_2D( + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output, int axis) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int lidx = get_local_id(0); + vxc_short8 src0, src1, dst; + vxc_half8 in_h, scale_h, tmpDst; + float sum = 0; + vxc_float4 scale_f0, scale_f1, sumsqr, tmpData0, tmpData1; + __local float lcl_sum[16]; + float4 one = (float4)(1, 1, 1, 1); + for(; coord.x < inputWidth;) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 128; + _viv_asm(COPY, in_h, src0, 16); + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), \ + uniFp16SumSqr_dp8x2); + sum += sumsqr.y; + } + lcl_sum[lidx] = sum; + barrier(CLK_LOCAL_MEM_FENCE); + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; + float4 data0; + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; + sum = dot(data0, one); + float alpha = rsqrt(sum); + + for(coord.x = get_global_id(0); coord.x < inputWidth; coord.x += 128) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, scale, coord.xz, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, in_h, src0, 16); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + 
UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); + + half4 tmpVal0, tmpVal1; + tmpData0 *= scale_f0 * alpha; + tmpData1 *= scale_f1 * alpha; + _viv_asm(CONV, tmpVal0, tmpData0); + _viv_asm(CONV, tmpVal1, tmpData1); + VXC_DP2x8(tmpDst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, dst, tmpDst, 16); + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + + +#define L2NORMSCALE_QINTF16TOQINT_AXIS0_2D(in0_name, in1_name, out_name, read_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) \ + void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D( \ + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output, int axis)\ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + int lidx = get_local_id(0); \ + read_type src0, dst; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float sum = 0; \ + vxc_float4 scale_f0, scale_f1, sumsqr; \ + __local float lcl_sum[16]; \ + float4 one = (float4)(1, 1, 1, 1); \ + for(; coord.x < inputWidth;) \ + { \ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 128; \ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), \ + uniInt16SumSqr_dp8x2); \ + sum += sumsqr.y - zP2x * sumsqr.x + zpSqr8x; \ + } \ + sum *= e2InScale; \ + lcl_sum[lidx] = sum; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \ + float4 data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \ + sum = dot(data0, one); \ + float alpha = rsqrt(sum) * inOutScale; \ + short zp = inputZP; \ + vxc_float4 tmpData0, tmpData1; \ + for(coord.x = get_global_id(0); coord.x < inputWidth; coord.x += 128) \ + { \ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, scale, coord.xz, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); \ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); \ + \ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); \ + \ + int4 tmpVal0 = convert_int4_rte(tmpData0 * scale_f0 * alpha + output_ZP); \ + int4 tmpVal1 = convert_int4_rte(tmpData1 * scale_f1 * alpha + output_ZP); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} + +L2NORMSCALE_QINTF16TOQINT_AXIS0_2D(U8, F16, U8, vxc_uchar8) +L2NORMSCALE_QINTF16TOQINT_AXIS0_2D(I8, F16, I8, vxc_char8) +L2NORMSCALE_QINTF16TOQINT_AXIS0_2D(I16, F16, I16, vxc_short8) + +#define L2NORMSCALE_QINTF16TOF16_AXIS0_2D(in0_name, in1_name, out_name, read_type) \ +__kernel 
__attribute__((reqd_work_group_size(16, 1, 1))) \ + void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D( \ + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output, int axis)\ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + int lidx = get_local_id(0); \ + read_type src0; \ + vxc_short8 src1, dst; \ + vxc_half8 scale_h, tmpDst; \ + float sum = 0; \ + vxc_float4 scale_f0, scale_f1, sumsqr, tmpData0, tmpData1; \ + __local float lcl_sum[16]; \ + float4 one = (float4)(1, 1, 1, 1); \ + for(; coord.x < inputWidth;) \ + { \ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 128; \ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), \ + uniInt16SumSqr_dp8x2); \ + sum += sumsqr.y - zP2x * sumsqr.x + zpSqr8x; \ + } \ + sum *= e2InScale; \ + lcl_sum[lidx] = sum; \ + barrier(CLK_LOCAL_MEM_FENCE); \ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \ + float4 data0; \ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \ + sum = dot(data0, one); \ + float alpha = rsqrt(sum) * inOutScale; \ + short zp = inputZP; \ + for(coord.x = get_global_id(0); coord.x < inputWidth; coord.x += 128) \ + { \ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, scale, coord.xz, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); \ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertSecFp16Fp32_4x4); \ + \ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); \ + \ + half4 tmpVal0, tmpVal1; \ + tmpData0 *= scale_f0 * alpha; \ + tmpData1 *= scale_f1 * alpha; \ + _viv_asm(CONV, tmpVal0, tmpData0); \ + _viv_asm(CONV, tmpVal1, tmpData1); \ + VXC_DP2x8(tmpDst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); \ + _viv_asm(COPY, dst, tmpDst, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} + +L2NORMSCALE_QINTF16TOF16_AXIS0_2D(U8, F16, F16, vxc_uchar8) +L2NORMSCALE_QINTF16TOF16_AXIS0_2D(I8, F16, F16, vxc_char8) +L2NORMSCALE_QINTF16TOF16_AXIS0_2D(I16, F16, F16, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization.vx deleted file mode 100644 index c0a6e19..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization.vx +++ /dev/null @@ -1,279 +0,0 @@ -#include "cl_viv_vx_ext.h" - -/**************************layernorm float16***********************************/ -_viv_uniform int width; -_viv_uniform float dimRatio; -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4; - -__kernel void layer_norm_F16toF16( - image2d_array_t input, image2d_t bias, image2d_t scale, - image2d_array_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - int4 coord_out = coord; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, 
input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - vxc_short8 src0, src1; - vxc_float sum = 0, sqr = 0; - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.z, baseAddr); - - for(coord.x = 8; coord.x < (width+8); coord.x += 8) - { - vxc_half8 val0_h; - _viv_asm(COPY, val0_h, src0, 16); - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - sum += sumsqr.x; - sqr += sumsqr.y; - } - vxc_float mean; - mean = sum * dimRatio; - vxc_float vari; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_float4 bias_f; - for(coord.x = 0; coord.x < width; coord.x += 4) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord.xw); - vxc_half8 in_h, scale_h; - _viv_asm(COPY, in_h, src0, 16); - _viv_asm(COPY, scale_h, src1, 16); - vxc_float4 in_f, scale_f; - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - vxc_float4 sub, norm; - sub = in_f - mean; - norm = scale_f * vari * sub + bias_f; - half4 norm_h; - _viv_asm(CONV, norm_h, norm); - vxc_half8 dst; - VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniExtractHalf4_dp4x4); - vxc_short8 dstval; - _viv_asm(COPY, dstval, dst, 16); - coord_out.x = coord.x; - VXC_OP4_NoDest(img_store_3d, output, coord_out, dstval, \ - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - } -} -/*****************************layernorm uint8 to uint8****************************/ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniSumU8_16x1; -_viv_uniform VXC_512Bits uniSqrSum_16x1; -_viv_uniform float input_scale; -_viv_uniform int inputZP; -_viv_uniform float outputScale; -_viv_uniform float output_zp; -_viv_uniform int sumInZp; -_viv_uniform int tmpZp1; -_viv_uniform int tmpZp2; -_viv_uniform float e2InScale; -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -__kernel void layer_norm_U8toU8( - image2d_array_t input, image2d_t bias, image2d_t scale, - image2d_array_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - int4 coord_out = coord; - - vxc_uchar16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float sum = 0, sqr = 0; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - int tmpSum = 0, tmpSqr = 0; - vxc_int4 tmpSum1; - vxc_int4 tmpSqr1; - short zp = inputZP; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + 
input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.z, baseAddr); - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1.x); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); - } - sum = (tmpSum + sumInZp) * input_scale; - sqr = (tmpSqr + tmpZp2) * e2InScale; - - float mean, vari; - mean = sum * dimRatio; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - int2 coord_bias = (int2)(0, 0); - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_bias.x = coord.x; - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - tmpData0 *= input_scale; - tmpData1 *= input_scale; - tmpData2 *= input_scale; - tmpData3 *= input_scale; - - vxc_float4 norm; - tmpData0 -= mean; - norm = scale_f0 * vari * tmpData0 + bias_f0; - bias_f0 = read_imagef(bias, coord_bias); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - coord_bias.x += 4; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - - tmpData1 -= mean; - norm = scale_f1 * vari * tmpData1 + bias_f1; - bias_f1 = read_imagef(bias, coord_bias); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - - tmpData2 -= mean; - norm = scale_f0 * vari * tmpData2 + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - - tmpData3 -= mean; - norm = scale_f1 * vari * tmpData3 + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - coord_out.x = coord.x; - 
VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \ - VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - } -} -/***************************layernorm float16 to uint8**************************/ -__kernel void layer_norm_F16toU8( - image2d_array_t input, image2d_t bias, image2d_t scale, - image2d_array_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - int4 coord_out = coord; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - vxc_short8 src0, src1; - vxc_float sum = 0, sqr = 0; - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.z, baseAddr); - - for(coord.x = 8; coord.x < (width+8); coord.x += 8) - { - vxc_half8 val0_h; - _viv_asm(COPY, val0_h, src0, 16); - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - sum += sumsqr.x; - sqr += sumsqr.y; - } - vxc_float mean; - mean = sum * dimRatio; - vxc_float vari; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_float4 bias_f; - for(coord.x = 0; coord.x < width; coord.x += 4) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord.xw); - vxc_half8 in_h, scale_h; - _viv_asm(COPY, in_h, src0, 16); - _viv_asm(COPY, scale_h, src1, 16); - vxc_float4 in_f, scale_f; - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - vxc_float4 sub, norm; - sub = in_f - mean; - norm = scale_f * vari * sub + bias_f; - norm = norm * outputScale + output_zp; - int4 output_int4; - output_int4 = convert_int4_rte(norm); - vxc_uchar8 dst; - VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), - uniConvertInt32toUint8_2x8); - coord_out.x = coord.x; - VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, \ - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_0.vx new file mode 100644 index 0000000..5674bc8 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_0.vx @@ -0,0 +1,390 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniSumX_16x1; +_viv_uniform VXC_512Bits uniSumX2_16x1; +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform int width; +_viv_uniform float inv_multiplier; +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define CONV2F32(dst, src, section) \ + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), \ + uniDataToFP32_##section##_4x4); + +#define LAYER_NORM_8BITS_IMPL(name, src_type) \ +__kernel void layer_norm_axis0_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \ + int4 coord_out = coord; \ + \ + src_type src0, dst; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float sum = 0, sqr = 0; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + uint2 _sums = 0, sum_x_x2; \ + \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.z, baseAddr); \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 16; \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = convert_float2(_sums) * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); \ + int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \ + int2 coord_bias = (int2)(0, 0); \ + \ + for(coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_bias.x = coord.x; \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(scale_f0, scale_h, 0); \ + CONV2F32(scale_f1, scale_h, 1); \ + bias_f0 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + \ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + CONV2F32(tmpData2, src0, 2); \ + CONV2F32(tmpData3, src0, 3); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + bias_f0 = read_imagef(bias, coord_bias); \ + CONV2F32(scale_f0, scale_h, 0); \ + coord_bias.x += 4; \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + bias_f1 = read_imagef(bias, coord_bias); \ + CONV2F32(scale_f1, scale_h, 1); \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + \ + tmpData2 = tmpData2 - sums.x; \ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + \ + tmpData3 = tmpData3 - sums.x; \ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + coord_out.x = coord.x; \ + 
VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_8BITS_IMPL(U8_F16toU8, vxc_uchar16) +LAYER_NORM_8BITS_IMPL(I8_F16toI8, vxc_char16) + +#define LAYER_NORM_SUMS_2D() \ + uint2 _sums = 0, sum_x_x2; \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 16; \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = convert_float2(_sums) * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); + +#define LAYER_NORM_8BITS_IMPL_2D(name, src_type) \ +__kernel void layer_norm_axis0_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \ + \ + src_type src0, dst; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float sum = 0, sqr = 0; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + \ + LAYER_NORM_SUMS_2D(); \ + \ + int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \ + int2 coord_bias = (int2)(0, 0); \ + \ + for (coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_bias.x = coord.x; \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(scale_f0, scale_h, 0); \ + CONV2F32(scale_f1, scale_h, 1); \ + bias_f0 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + \ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + CONV2F32(tmpData2, src0, 2); \ + CONV2F32(tmpData3, src0, 3); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + bias_f0 = read_imagef(bias, coord_bias); \ + CONV2F32(scale_f0, scale_h, 0); \ + coord_bias.x += 4; \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + bias_f1 = read_imagef(bias, coord_bias); \ + CONV2F32(scale_f1, scale_h, 1); \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + \ + tmpData2 = tmpData2 - sums.x; \ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + \ + tmpData3 = tmpData3 - sums.x; \ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_8BITS_IMPL_2D(U8_F16toU8, vxc_uchar16) +LAYER_NORM_8BITS_IMPL_2D(I8_F16toI8, vxc_char16) + +#define LAYER_NORM_8TOF16_IMPL(name, src_type) \ +__kernel void layer_norm_axis0_##name( \ + 
__read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \ + int4 coord_out = coord; \ + \ + src_type src0; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float sum = 0, sqr = 0; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + uint2 _sums = 0, sum_x_x2; \ + \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.z, baseAddr); \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 16; \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = convert_float2(_sums) * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); \ + half4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \ + int2 coord_bias = (int2)(0, 0); \ + \ + vxc_short8 dst; \ + vxc_half8 result; \ + for(coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_bias.x = coord.x; \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(scale_f0, scale_h, 0); \ + CONV2F32(scale_f1, scale_h, 1); \ + bias_f0 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + \ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + CONV2F32(tmpData2, src0, 2); \ + CONV2F32(tmpData3, src0, 3); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + bias_f0 = read_imagef(bias, coord_bias); \ + CONV2F32(scale_f0, scale_h, 0); \ + coord_bias.x += 4; \ + _viv_asm(CONV, tmpVal0, norm); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + bias_f1 = read_imagef(bias, coord_bias); \ + CONV2F32(scale_f1, scale_h, 1); \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + coord_out.x = coord.x; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + tmpData2 = tmpData2 - sums.x; \ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \ + _viv_asm(CONV, tmpVal0, norm); \ + \ + tmpData3 = tmpData3 - sums.x; \ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + coord_out.x += 8; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, 
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_8TOF16_IMPL(U8_F16toF16, vxc_uchar16) +LAYER_NORM_8TOF16_IMPL(I8_F16toF16, vxc_char16) + +#define LAYER_NORM_8TOF16_IMPL_2D(name, src_type) \ +__kernel void layer_norm_axis0_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \ + \ + src_type src0; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float sum = 0, sqr = 0; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + \ + LAYER_NORM_SUMS_2D(); \ + \ + half4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \ + int2 coord_bias = (int2)(0, 0); \ + \ + vxc_short8 dst; \ + vxc_half8 result; \ + for (coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_bias.x = coord.x; \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(scale_f0, scale_h, 0); \ + CONV2F32(scale_f1, scale_h, 1); \ + bias_f0 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + \ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + CONV2F32(tmpData2, src0, 2); \ + CONV2F32(tmpData3, src0, 3); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + bias_f0 = read_imagef(bias, coord_bias); \ + CONV2F32(scale_f0, scale_h, 0); \ + coord_bias.x += 4; \ + _viv_asm(CONV, tmpVal0, norm); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + bias_f1 = read_imagef(bias, coord_bias); \ + CONV2F32(scale_f1, scale_h, 1); \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + \ + tmpData2 = tmpData2 - sums.x; \ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \ + _viv_asm(CONV, tmpVal0, norm); \ + \ + tmpData3 = tmpData3 - sums.x; \ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x -= 8; \ + } \ +} +LAYER_NORM_8TOF16_IMPL_2D(U8_F16toF16, vxc_uchar16) +LAYER_NORM_8TOF16_IMPL_2D(I8_F16toF16, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_1.vx new file mode 100644 index 0000000..d2be567 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_1.vx @@ -0,0 +1,343 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniSum_X_X2_8x2; +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform int width; +_viv_uniform float inv_multiplier; +_viv_uniform float output_scale; +_viv_uniform 
float output_zp; + +#define CONV2F32(dst, src, section) \ + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniDataToFP32_##section##_4x4); + +#define LAYER_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \ +__kernel void layer_norm_axis0_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \ + int4 coord_out = coord; \ + \ + vxc_short8 in0; \ + src_type src0; \ + copy_type dst; \ + vxc_short8 src1; \ + dst_type result; \ + vxc_half8 scale_h; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + float2 _sums = 0, sum_x_x2; \ + \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.z, baseAddr); \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_OP4(img_load_3d, in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, in0, 16); \ + coord.x += 8; \ + VXC_DP8x2(sum_x_x2, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = _sums * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); \ + conv_type tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1; \ + int2 coord_bias = (int2)(0, 0); \ + \ + for(coord.x = 0; coord.x < width; coord.x += 8) \ + { \ + VXC_OP4(img_load_3d, in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, in0, 16); \ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_bias.x = coord.x; \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(scale_f0, scale_h, 0); \ + CONV2F32(scale_f1, scale_h, 1); \ + bias_f0 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal0, norm); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + \ + coord_out.x = coord.x; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_16BITS_IMPL(F16_F16toF16, vxc_half8, vxc_half8, vxc_short8, half4) +LAYER_NORM_16BITS_IMPL(F16_F16toI16, vxc_half8, vxc_short8, vxc_short8, int4) +LAYER_NORM_16BITS_IMPL(F16_F16toI8, vxc_half8, vxc_char8, vxc_char8, int4) +LAYER_NORM_16BITS_IMPL(F16_F16toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +LAYER_NORM_16BITS_IMPL(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, int4) +LAYER_NORM_16BITS_IMPL(I16_F16toF16, vxc_short8, vxc_half8, vxc_short8, half4) + +#define LAYER_NORM_16BITS_IMPL_2D(name, src_type, dst_type, copy_type, 
conv_type) \ +__kernel void layer_norm_axis0_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \ + \ + vxc_short8 in0; \ + src_type src0; \ + copy_type dst; \ + dst_type result; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + float2 _sums = 0, sum_x_x2; \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_ReadImage(in0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, in0, 16); \ + coord.x += 8; \ + VXC_DP8x2(sum_x_x2, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = _sums * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); \ + \ + conv_type tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1; \ + int2 coord_bias = (int2)(0, 0); \ + \ + for (coord.x = 0; coord.x < width; coord.x += 8) \ + { \ + VXC_ReadImage(in0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, in0, 16); \ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_bias.x = coord.x; \ + _viv_asm(COPY, scale_h, src1, 16); \ + CONV2F32(scale_f0, scale_h, 0); \ + CONV2F32(scale_f1, scale_h, 1); \ + bias_f0 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + coord_bias.x += 4; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal0, norm); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_16BITS_IMPL_2D(F16_F16toF16, vxc_half8, vxc_half8, vxc_short8, half4) +LAYER_NORM_16BITS_IMPL_2D(F16_F16toI16, vxc_half8, vxc_short8, vxc_short8, int4) +LAYER_NORM_16BITS_IMPL_2D(F16_F16toI8, vxc_half8, vxc_char8, vxc_char8, int4) +LAYER_NORM_16BITS_IMPL_2D(F16_F16toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +LAYER_NORM_16BITS_IMPL_2D(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, int4) +LAYER_NORM_16BITS_IMPL_2D(I16_F16toF16, vxc_short8, vxc_half8, vxc_short8, half4) + +#define LAYER_NORM_16_32_IMPL(name, src_type, dst_type, copy_type, conv_type) \ +__kernel void layer_norm_axis0_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \ + int4 coord_out = coord; \ + \ + vxc_short8 in0; \ + src_type src0; \ + copy_type dst; \ + dst_type result; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + float2 _sums = 0, sum_x_x2; \ + \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, 
baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.z, baseAddr); \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_OP4(img_load_3d, in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, in0, 16); \ + coord.x += 8; \ + VXC_DP8x2(sum_x_x2, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = _sums * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); \ + conv_type tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1; \ + \ + Image img1 = create_image_from_image2d(bias, 4); \ + Image img2 = create_image_from_image2d(scale, 4); \ + __global float* bias_ptr = (__global float*)img1.ptr; \ + __global float* scale_ptr = (__global float*)img2.ptr; \ + for(coord.x = 0; coord.x < width; coord.x += 8) \ + { \ + VXC_OP4(img_load_3d, in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, in0, 16); \ + bias_f0 = vload4(0, bias_ptr); \ + bias_f1 = vload4(1, bias_ptr); \ + scale_f0 = vload4(0, scale_ptr); \ + scale_f1 = vload4(1, scale_ptr); \ + bias_ptr += 8; \ + scale_ptr += 8; \ + \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal0, norm); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + \ + coord_out.x = coord.x; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_16_32_IMPL(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4) +LAYER_NORM_16_32_IMPL(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4) +LAYER_NORM_16_32_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4) +LAYER_NORM_16_32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +LAYER_NORM_16_32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4) +LAYER_NORM_16_32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4) + +#define LAYER_NORM_16_32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \ +__kernel void layer_norm_axis0_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \ + \ + vxc_short8 in0; \ + src_type src0; \ + copy_type dst; \ + dst_type result; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + float2 _sums = 0, sum_x_x2; \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_ReadImage(in0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, in0, 16); \ + coord.x += 8; \ + VXC_DP8x2(sum_x_x2, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = _sums * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); \ + \ + conv_type tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1; \ + \ + Image img1 = 
create_image_from_image2d(bias, 4); \ + Image img2 = create_image_from_image2d(scale, 4); \ + __global float* bias_ptr = (__global float*)img1.ptr; \ + __global float* scale_ptr = (__global float*)img2.ptr; \ + for (coord.x = 0; coord.x < width; coord.x += 8) \ + { \ + VXC_ReadImage(in0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, in0, 16); \ + bias_f0 = vload4(0, bias_ptr); \ + bias_f1 = vload4(1, bias_ptr); \ + scale_f0 = vload4(0, scale_ptr); \ + scale_f1 = vload4(1, scale_ptr); \ + bias_ptr += 8; \ + scale_ptr += 8; \ + \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal0, norm); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_16_32_IMPL_2D(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4) +LAYER_NORM_16_32_IMPL_2D(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4) +LAYER_NORM_16_32_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4) +LAYER_NORM_16_32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4) +LAYER_NORM_16_32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4) +LAYER_NORM_16_32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_2.vx new file mode 100644 index 0000000..a45a0ab --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_2.vx @@ -0,0 +1,385 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniSumX_16x1; +_viv_uniform VXC_512Bits uniSumX2_16x1; +_viv_uniform VXC_512Bits uniSum_X_X2_8x2; +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform int width; +_viv_uniform float inv_multiplier; +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define CONV2F32(dst, src, section) \ + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniDataToFP32_##section##_4x4); + +#define LAYER_NORM_8_32_IMPL(name, src_type) \ +__kernel void layer_norm_axis0_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \ + int4 coord_out = coord; \ + \ + src_type src0, dst; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float sum = 0, sqr = 0; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + uint2 _sums = 0, sum_x_x2; \ + \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = 
(int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.z, baseAddr); \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 16; \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = convert_float2(_sums) * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); \ + int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \ + \ + Image img1 = create_image_from_image2d(bias, 4); \ + Image img2 = create_image_from_image2d(scale, 4); \ + __global float* bias_ptr = (__global float*)img1.ptr; \ + __global float* scale_ptr = (__global float*)img2.ptr; \ + for(coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + bias_f0 = vload4(0, bias_ptr); \ + bias_f1 = vload4(1, bias_ptr); \ + scale_f0 = vload4(0, scale_ptr); \ + scale_f1 = vload4(1, scale_ptr); \ + \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + CONV2F32(tmpData2, src0, 2); \ + CONV2F32(tmpData3, src0, 3); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + bias_f0 = vload4(2, bias_ptr); \ + scale_f0 = vload4(2, scale_ptr); \ + \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + bias_f1 = vload4(3, bias_ptr); \ + scale_f1 = vload4(3, scale_ptr); \ + bias_ptr += 16; \ + scale_ptr += 16; \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + \ + tmpData2 = tmpData2 - sums.x; \ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + \ + tmpData3 = tmpData3 - sums.x; \ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + coord_out.x = coord.x; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_8_32_IMPL(U8_F32toU8, vxc_uchar16) +LAYER_NORM_8_32_IMPL(I8_F32toI8, vxc_char16) + +#define LAYER_NORM_8_32_IMPL_2D(name, src_type) \ +__kernel void layer_norm_axis0_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \ + \ + src_type src0, dst; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float sum = 0, sqr = 0; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + uint2 _sums = 0, sum_x_x2; \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 16; \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums 
= convert_float2(_sums) * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); \ + int4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \ + \ + Image img1 = create_image_from_image2d(bias, 4); \ + Image img2 = create_image_from_image2d(scale, 4); \ + __global float* bias_ptr = (__global float*)img1.ptr; \ + __global float* scale_ptr = (__global float*)img2.ptr; \ + for (coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + bias_f0 = vload4(0, bias_ptr); \ + bias_f1 = vload4(1, bias_ptr); \ + scale_f0 = vload4(0, scale_ptr); \ + scale_f1 = vload4(1, scale_ptr); \ + \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + CONV2F32(tmpData2, src0, 2); \ + CONV2F32(tmpData3, src0, 3); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + bias_f0 = vload4(2, bias_ptr); \ + scale_f0 = vload4(2, scale_ptr); \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + \ + bias_f1 = vload4(3, bias_ptr); \ + scale_f1 = vload4(3, scale_ptr); \ + bias_ptr += 16; \ + scale_ptr += 16; \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + \ + tmpData2 = tmpData2 - sums.x; \ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + \ + tmpData3 = tmpData3 - sums.x; \ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_8_32_IMPL_2D(U8_F32toU8, vxc_uchar16) +LAYER_NORM_8_32_IMPL_2D(I8_F32toI8, vxc_char16) + +#define LAYER_NORM_8_32TOF16_IMPL(name, src_type) \ +__kernel void layer_norm_axis0_##name( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \ + int4 coord_out = coord; \ + \ + src_type src0; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float sum = 0, sqr = 0; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + uint2 _sums = 0, sum_x_x2; \ + \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.z, baseAddr); \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 16; \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = convert_float2(_sums) * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y 
= rsqrt(sums.y); \ + half4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \ + \ + vxc_short8 dst; \ + vxc_half8 result; \ + Image img1 = create_image_from_image2d(bias, 4); \ + Image img2 = create_image_from_image2d(scale, 4); \ + __global float* bias_ptr = (__global float*)img1.ptr; \ + __global float* scale_ptr = (__global float*)img2.ptr; \ + for(coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + bias_f0 = vload4(0, bias_ptr); \ + bias_f1 = vload4(1, bias_ptr); \ + scale_f0 = vload4(0, scale_ptr); \ + scale_f1 = vload4(1, scale_ptr); \ + \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + CONV2F32(tmpData2, src0, 2); \ + CONV2F32(tmpData3, src0, 3); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + bias_f0 = vload4(2, bias_ptr); \ + scale_f0 = vload4(2, scale_ptr); \ + _viv_asm(CONV, tmpVal0, norm); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + \ + bias_f1 = vload4(3, bias_ptr); \ + scale_f1 = vload4(3, scale_ptr); \ + bias_ptr += 16; \ + scale_ptr += 16; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + coord_out.x = coord.x; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + tmpData2 = tmpData2 - sums.x; \ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \ + _viv_asm(CONV, tmpVal0, norm); \ + \ + tmpData3 = tmpData3 - sums.x; \ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + coord_out.x += 8; \ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_8_32TOF16_IMPL(U8_F32toF16, vxc_uchar16) +LAYER_NORM_8_32TOF16_IMPL(I8_F32toF16, vxc_char16) + +#define LAYER_NORM_8_32TOF16_IMPL_2D(name, src_type) \ +__kernel void layer_norm_axis0_##name##_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t bias, \ + __read_only image2d_t scale, \ + __write_only image2d_array_t output, \ + float eps) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \ + \ + src_type src0; \ + vxc_short8 src1; \ + vxc_half8 scale_h; \ + float sum = 0, sqr = 0; \ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + uint2 _sums = 0, sum_x_x2; \ + \ + for (coord.x = 0; coord.x < width; ) \ + { \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 16; \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \ + _sums = _sums + sum_x_x2; \ + } \ + \ + float2 sums = convert_float2(_sums) * inv_multiplier; \ + \ + sums.y = sums.y - sums.x * sums.x + eps; \ + sums.y = rsqrt(sums.y); \ + half4 tmpVal0, tmpVal1; \ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \ + \ + vxc_short8 dst; \ + vxc_half8 result; \ + Image img1 = create_image_from_image2d(bias, 4); \ + Image img2 = create_image_from_image2d(scale, 4); \ + __global float* bias_ptr = (__global float*)img1.ptr; \ + __global float* scale_ptr = (__global float*)img2.ptr; \ + for 
(coord.x = 0; coord.x < width; coord.x += 16) \ + { \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + bias_f0 = vload4(0, bias_ptr); \ + bias_f1 = vload4(1, bias_ptr); \ + scale_f0 = vload4(0, scale_ptr); \ + scale_f1 = vload4(1, scale_ptr); \ + \ + CONV2F32(tmpData0, src0, 0); \ + CONV2F32(tmpData1, src0, 1); \ + CONV2F32(tmpData2, src0, 2); \ + CONV2F32(tmpData3, src0, 3); \ + \ + float4 norm; \ + tmpData0 = tmpData0 - sums.x; \ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \ + bias_f0 = vload4(2, bias_ptr); \ + scale_f0 = vload4(2, scale_ptr); \ + _viv_asm(CONV, tmpVal0, norm); \ + \ + tmpData1 = tmpData1 - sums.x; \ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \ + bias_f1 = vload4(3, bias_ptr); \ + scale_f1 = vload4(3, scale_ptr); \ + bias_ptr += 16; \ + scale_ptr += 16; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + \ + tmpData2 = tmpData2 - sums.x; \ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \ + _viv_asm(CONV, tmpVal0, norm); \ + \ + tmpData3 = tmpData3 - sums.x; \ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \ + _viv_asm(CONV, tmpVal1, norm); \ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, result, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x -= 8; \ + } \ +} +LAYER_NORM_8_32TOF16_IMPL_2D(U8_F32toF16, vxc_uchar16) +LAYER_NORM_8_32TOF16_IMPL_2D(I8_F32toF16, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_2d.vx deleted file mode 100644 index d517d7d..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_2d.vx +++ /dev/null @@ -1,234 +0,0 @@ -#include "cl_viv_vx_ext.h" - -/**************************layernorm float16***********************************/ -_viv_uniform int width; -_viv_uniform float dimRatio; -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4; - -__kernel void layer_norm_F16toF16_2D( - image2d_t input, image2d_t bias, image2d_t scale, - image2d_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), 0, 0); - vxc_short8 src0, src1; - vxc_float sum = 0, sqr = 0; - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - for(coord.x = 8; coord.x < (width+8); coord.x += 8) - { - vxc_half8 val0_h; - _viv_asm(COPY, val0_h, src0, 16); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - sum += sumsqr.x; - sqr += sumsqr.y; - } - vxc_float mean; - mean = sum * dimRatio; - vxc_float vari; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_float4 bias_f; - for(coord.x = 0; coord.x < width; coord.x += 4) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - bias_f = 
read_imagef(bias, coord.xw); - vxc_half8 in_h, scale_h; - _viv_asm(COPY, in_h, src0, 16); - _viv_asm(COPY, scale_h, src1, 16); - vxc_float4 in_f, scale_f; - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - vxc_float4 sub, norm; - sub = in_f - mean; - norm = scale_f * vari * sub + bias_f; - half4 norm_h; - _viv_asm(CONV, norm_h, norm); - vxc_half8 dst; - VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniExtractHalf4_dp4x4); - vxc_short8 dstval; - _viv_asm(COPY, dstval, dst, 16); - VXC_WriteImage(output, coord.xy, dstval, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - } -} -/*****************************layernorm uint8 to uint8****************************/ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniSumU8_16x1; -_viv_uniform VXC_512Bits uniSqrSum_16x1; -_viv_uniform float input_scale; -_viv_uniform int inputZP; -_viv_uniform float outputScale; -_viv_uniform float output_zp; -_viv_uniform int sumInZp; -_viv_uniform int tmpZp1; -_viv_uniform int tmpZp2; -_viv_uniform float e2InScale; -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -__kernel void layer_norm_U8toU8_2D( - image2d_t input, image2d_t bias, image2d_t scale, - image2d_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), 0, 0); - vxc_uchar16 src0, src2; - vxc_short8 src1; - vxc_half8 scale_h; - float sum = 0, sqr = 0; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - int tmpSum = 0, tmpSqr = 0; - vxc_int4 tmpSum1; - vxc_int4 tmpSqr1; - short zp = inputZP; - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1.x); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); - } - sum = (tmpSum + sumInZp) * input_scale; - sqr = (tmpSqr + tmpZp2) * e2InScale; - - float mean, vari; - mean = sum * dimRatio; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - int2 coord_bias = (int2)(0, 0); - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_bias.x = coord.x; - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - 
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - tmpData0 = tmpData0 * input_scale - mean; - tmpData1 = tmpData1 * input_scale - mean; - tmpData2 = tmpData2 * input_scale - mean; - tmpData3 = tmpData3 * input_scale - mean; - - vxc_float4 norm; - norm = scale_f0 * vari * tmpData0 + bias_f0; - bias_f0 = read_imagef(bias, coord_bias); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - coord_bias.x += 4; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - - norm = scale_f1 * vari * tmpData1 + bias_f1; - bias_f1 = read_imagef(bias, coord_bias); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - - norm = scale_f0 * vari * tmpData2 + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - - norm = scale_f1 * vari * tmpData3 + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - } -} -/***************************layernorm float16 to uint8**************************/ -__kernel void layer_norm_F16toU8_2D( - image2d_t input, image2d_t bias, image2d_t scale, - image2d_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), 0, 0); - vxc_short8 src0, src1; - vxc_float sum = 0, sqr = 0; - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - for(coord.x = 8; coord.x < (width+8); coord.x += 8) - { - vxc_half8 val0_h; - _viv_asm(COPY, val0_h, src0, 16); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - sum += sumsqr.x; - sqr += sumsqr.y; - } - vxc_float mean; - mean = sum * dimRatio; - vxc_float vari; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_float4 bias_f; - for(coord.x = 0; coord.x < width; coord.x += 4) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - bias_f = read_imagef(bias, coord.xw); - vxc_half8 in_h, scale_h; - _viv_asm(COPY, in_h, src0, 16); - _viv_asm(COPY, scale_h, src1, 16); - vxc_float4 in_f, scale_f; - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - vxc_float4 sub, norm; - sub = in_f - mean; - norm = scale_f * vari * sub + bias_f; - norm = norm * outputScale + output_zp; - int4 output_int4; - output_int4 
= convert_int4_rte(norm); - vxc_uchar8 dst; - VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - } -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_bf16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_3.vx similarity index 100% rename from src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_bf16.vx rename to src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_3.vx diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_i16.vx deleted file mode 100644 index e461f28..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_i16.vx +++ /dev/null @@ -1,168 +0,0 @@ -#include "cl_viv_vx_ext.h" - -/**************************layernorm float16***********************************/ -_viv_uniform int width; -_viv_uniform float dimRatio; -_viv_uniform float dimRatio_scale; -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform float e2InScale; -_viv_uniform float outputScale; -_viv_uniform float output_zp; -_viv_uniform float input_scale; -_viv_uniform int inputZP; - -__kernel void layer_norm_I16toI16( - image2d_array_t input, image2d_t bias, image2d_t scale, - image2d_array_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - int4 coord_in = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - vxc_short8 src0, src1, dst; - vxc_float sum = 0, sqr = 0; - for(; coord_in.x < width;) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.x += 8; - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - sum += sumsqr.x; - sqr = sqr + sumsqr.y * e2InScale; - } - vxc_float mean; - mean = sum * dimRatio_scale; - vxc_float vari; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_half8 scale_h; - vxc_int4 tmpVal0, tmpVal1; - - int2 coord_bias = (int2)(0, 0); - - for(coord_in.x = 0; coord_in.x < width; coord_in.x += 8, coord.x += 8) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_bias.x = coord_in.x; - VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - 
uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - - vxc_float4 sub, norm; - sub = tmpData0 * input_scale - mean; - norm = scale_f0 * vari * sub + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - sub = tmpData1 * input_scale - mean; - norm = scale_f1 * vari * sub + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord, dst, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel void layer_norm_I16toI16_2D( - image2d_t input, image2d_t bias, image2d_t scale, - image2d_t output, float eps) -{ - int2 coord = (int2)(0, get_global_id(1)); - - vxc_short8 src0, src1, dst; - vxc_float sum = 0, sqr = 0; - for(; coord.x < width;) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - sum += sumsqr.x; - sqr = sqr + sumsqr.y * e2InScale; - } - vxc_float mean, vari; - mean = sum * dimRatio_scale; - vari = sqr * dimRatio - mean * mean; - vari += eps; - vari = rsqrt(vari); - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_half8 scale_h; - vxc_int4 tmpVal0, tmpVal1; - - int2 coord_bias = (int2)(0, 0); - - for(coord.x = 0; coord.x < width; coord.x += 8) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_bias.x = coord.x; - VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - - vxc_float4 sub, norm; - sub = tmpData0 * input_scale - mean; - norm = scale_f0 * vari * sub + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - sub = tmpData1 * input_scale - mean; - norm = scale_f1 * vari * sub + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32.vx deleted file mode 100644 index 221e93e..0000000 --- 
a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32.vx +++ /dev/null @@ -1,276 +0,0 @@ -#include "cl_viv_vx_ext.h" - -/**************************layernorm float16***********************************/ -_viv_uniform int width; -_viv_uniform float dimRatio; -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4; - -__kernel void layer_norm_F16F32toF16( - image2d_array_t input, image2d_t bias, image2d_t scale, - image2d_array_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - int4 coord_out = coord; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - vxc_short8 src0; - vxc_float sum = 0, sqr = 0; - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.z, baseAddr); - - for(coord.x = 8; coord.x < (width+8); coord.x += 8) - { - vxc_half8 val0_h; - _viv_asm(COPY, val0_h, src0, 16); - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - sum += sumsqr.x; - sqr += sumsqr.y; - } - vxc_float mean; - mean = sum * dimRatio; - vxc_float vari; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_float4 bias_f, scale_f, in_f; - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0)); - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0)); - for(coord.x = 0; coord.x < width; coord.x += 4) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - bias_f = vload4(0, bias_ptr + coord.x); - scale_f = vload4(0, scale_ptr + coord.x); - vxc_half8 in_h; - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - vxc_float4 sub, norm; - sub = in_f - mean; - norm = scale_f * vari * sub + bias_f; - half4 norm_h; - _viv_asm(CONV, norm_h, norm); - vxc_half8 dst; - VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniExtractHalf4_dp4x4); - vxc_short8 dstval; - _viv_asm(COPY, dstval, dst, 16); - coord_out.x = coord.x; - VXC_OP4_NoDest(img_store_3d, output, coord_out, dstval, \ - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); - } -} -/*****************************layernorm uint8 to uint8****************************/ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniSumU8_16x1; -_viv_uniform VXC_512Bits uniSqrSum_16x1; -_viv_uniform float input_scale; -_viv_uniform int inputZP; -_viv_uniform float outputScale; -_viv_uniform float output_zp; -_viv_uniform int sumInZp; -_viv_uniform int tmpZp1; -_viv_uniform int tmpZp2; 
-_viv_uniform float e2InScale; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; -_viv_uniform float dimRatio_scale; - -__kernel void layer_norm_U8F32toU8( - image2d_array_t input, image2d_t bias, image2d_t scale, - image2d_array_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - int4 coord_out = coord; - - vxc_uchar16 src0, src2; - float sum = 0, sqr = 0; - vxc_float4 bias_f0, bias_f1, bias_f2, bias_f3, scale_f0, scale_f1, scale_f2, scale_f3; - int tmpSum = 0, tmpSqr = 0; - vxc_int4 tmpSum1; - vxc_int4 tmpSqr1; - short zp = inputZP; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.z, baseAddr); - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1.x); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); - } - sum = (tmpSum + sumInZp) * input_scale; - sqr = (tmpSqr + tmpZp2) * e2InScale; - - float mean, vari; - mean = sum * dimRatio; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0)); - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0)); - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - bias_f0 = vload4(0, bias_ptr); - bias_f1 = vload4(1, bias_ptr); - bias_f2 = vload4(2, bias_ptr); - bias_f3 = vload4(3, bias_ptr); - scale_f0 = vload4(0, scale_ptr); - scale_f1 = vload4(1, scale_ptr); - scale_f2 = vload4(2, scale_ptr); - scale_f3 = vload4(3, scale_ptr); - bias_ptr += 16; - scale_ptr += 16; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - tmpData0 *= input_scale; - tmpData1 *= input_scale; - tmpData2 *= input_scale; - tmpData3 *= input_scale; - - vxc_float4 norm; - tmpData0 -= mean; - norm = scale_f0 * vari * tmpData0 + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - - tmpData1 -= mean; - norm = scale_f1 * vari * tmpData1 + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - - tmpData2 -= mean; - norm = scale_f2 * vari * tmpData2 + 
bias_f2; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - - tmpData3 -= mean; - norm = scale_f3 * vari * tmpData3 + bias_f3; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - coord_out.x = coord.x; - VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \ - VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel void layer_norm_I16F32toI16( - image2d_array_t input, image2d_t bias, image2d_t scale, - image2d_array_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - int4 coord_in = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - vxc_short8 src0, dst; - vxc_float sum = 0, sqr = 0; - for(; coord_in.x < width;) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.x += 8; - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - sum += sumsqr.x; - sqr = sqr + sumsqr.y * e2InScale; - } - vxc_float mean; - mean = sum * dimRatio_scale; - vxc_float vari; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_int4 tmpVal0, tmpVal1; - - int2 coord_bias = (int2)(0, 0); - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord_bias); - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord_bias); - for(coord_in.x = 0; coord_in.x < width; coord_in.x += 8, coord.x += 8) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = vload4(0, bias_ptr); - bias_f1 = vload4(1, bias_ptr); - scale_f0 = vload4(0, scale_ptr); - scale_f1 = vload4(1, scale_ptr); - bias_ptr += 8; - scale_ptr += 8; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - - vxc_float4 sub, norm; - sub = tmpData0 * input_scale - mean; - norm = scale_f0 * vari * sub + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - sub = tmpData1 * input_scale - mean; - norm = scale_f1 * vari * sub + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord, dst, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_2d.vx deleted file mode 100644 index 8010726..0000000 --- 
a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_2d.vx +++ /dev/null @@ -1,237 +0,0 @@ -#include "cl_viv_vx_ext.h" - -/**************************layernorm float16***********************************/ -_viv_uniform int width; -_viv_uniform float dimRatio; -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4; - -__kernel void layer_norm_F16F32toF16_2D( - image2d_t input, image2d_t bias, image2d_t scale, - image2d_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), 0, 0); - vxc_short8 src0, src1; - vxc_float sum = 0, sqr = 0; - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - - for(coord.x = 8; coord.x < (width+8); coord.x += 8) - { - vxc_half8 val0_h; - _viv_asm(COPY, val0_h, src0, 16); - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - sum += sumsqr.x; - sqr += sumsqr.y; - } - vxc_float mean; - mean = sum * dimRatio; - vxc_float vari; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_float4 bias_f, scale_f, in_f; - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw); - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw); - for(coord.x = 0; coord.x < width; coord.x += 4) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - bias_f = vload4(0, bias_ptr + coord.x); - scale_f = vload4(0, scale_ptr + coord.x); - - vxc_half8 in_h; - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - vxc_float4 sub, norm; - sub = in_f - mean; - norm = scale_f * vari * sub + bias_f; - half4 norm_h; - _viv_asm(CONV, norm_h, norm); - vxc_half8 dst; - VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniExtractHalf4_dp4x4); - vxc_short8 dstval; - _viv_asm(COPY, dstval, dst, 16); - VXC_WriteImage(output, coord.xy, dstval, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - } -} -/*****************************layernorm uint8 to uint8****************************/ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniSumU8_16x1; -_viv_uniform VXC_512Bits uniSqrSum_16x1; -_viv_uniform float input_scale; -_viv_uniform int inputZP; -_viv_uniform float outputScale; -_viv_uniform float output_zp; -_viv_uniform int sumInZp; -_viv_uniform int tmpZp1; -_viv_uniform int tmpZp2; -_viv_uniform float e2InScale; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; -_viv_uniform float dimRatio_scale; - -__kernel void layer_norm_U8F32toU8_2D( - image2d_t input, image2d_t bias, image2d_t scale, - image2d_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), 0, 0); - vxc_uchar16 src0, src2; - float sum = 0, sqr = 0; - vxc_float4 bias_f0, bias_f1, bias_f2, bias_f3, scale_f0, scale_f1, scale_f2, scale_f3; - int tmpSum 
= 0, tmpSqr = 0; - vxc_int4 tmpSum1; - vxc_int4 tmpSqr1; - short zp = inputZP; - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1.x); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); - } - sum = (tmpSum + sumInZp) * input_scale; - sqr = (tmpSqr + tmpZp2) * e2InScale; - - float mean, vari; - mean = sum * dimRatio; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw); - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw); - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - bias_f0 = vload4(0, bias_ptr); - bias_f1 = vload4(1, bias_ptr); - bias_f2 = vload4(2, bias_ptr); - bias_f3 = vload4(3, bias_ptr); - scale_f0 = vload4(0, scale_ptr); - scale_f1 = vload4(1, scale_ptr); - scale_f2 = vload4(2, scale_ptr); - scale_f3 = vload4(3, scale_ptr); - bias_ptr += 16; - scale_ptr += 16; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - tmpData0 = tmpData0 * input_scale - mean; - tmpData1 = tmpData1 * input_scale - mean; - tmpData2 = tmpData2 * input_scale - mean; - tmpData3 = tmpData3 * input_scale - mean; - - vxc_float4 norm; - norm = scale_f0 * vari * tmpData0 + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - - norm = scale_f1 * vari * tmpData1 + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - - norm = scale_f2 * vari * tmpData2 + bias_f2; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - - norm = scale_f3 * vari * tmpData3 + bias_f3; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - } -} - -__kernel void layer_norm_I16F32toI16_2D( - image2d_t input, image2d_t bias, image2d_t scale, - image2d_t output, float eps) -{ - int4 coord = (int4)(0, get_global_id(1), 0, 0); - - vxc_short8 src0, src1, dst; - vxc_float sum = 0, sqr = 0; - for(; coord.x < width;) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.x += 8; - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - sum += sumsqr.x; - sqr = sqr + sumsqr.y * 
e2InScale; - } - vxc_float mean, vari; - mean = sum * dimRatio_scale; - vari = sqr * dimRatio - mean * mean; - vari += eps; - vari = rsqrt(vari); - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_half8 scale_h; - vxc_int4 tmpVal0, tmpVal1; - - Image img1 = create_image_from_image2d(bias, 4); - Image img2 = create_image_from_image2d(scale, 4); - - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw); - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw); - for(coord.x = 0; coord.x < width; coord.x += 8) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = vload4(0, bias_ptr); - bias_f1 = vload4(1, bias_ptr); - scale_f0 = vload4(0, scale_ptr); - scale_f1 = vload4(1, scale_ptr); - bias_ptr += 8; - scale_ptr += 8; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - - vxc_float4 sub, norm; - sub = tmpData0 * input_scale - mean; - norm = scale_f0 * vari * sub + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - sub = tmpData1 * input_scale - mean; - norm = scale_f1 * vari * sub + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx deleted file mode 100644 index a76cb4f..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx +++ /dev/null @@ -1,239 +0,0 @@ -#include "cl_viv_vx_ext.h" - -/*****************************layernorm uint8 to fp16****************************/ -_viv_uniform int width; -_viv_uniform float dimRatio; -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniSumU8_16x1; -_viv_uniform VXC_512Bits uniSqrSum_16x1; -_viv_uniform float input_scale; -_viv_uniform int inputZP; -_viv_uniform int sumInZp; -_viv_uniform int tmpZp1; -_viv_uniform int tmpZp2; -_viv_uniform float e2InScale; -_viv_uniform VXC_512Bits UniPackFP16even_2x8; - -__kernel void layer_norm_U8toF16( - image2d_array_t input, - image2d_t bias, - image2d_t scale, - image2d_array_t output, - float eps) -{ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); - int4 coord_out = coord; - vxc_uchar16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0; - vxc_int4 tmpSum1; - vxc_int4 tmpSqr1; - - int8 input_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.z, baseAddr_a); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.z, baseAddr); - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_OP4(img_load_3d, 
src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1.x); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); - } - sum = (tmpSum + sumInZp) * input_scale; - sqr = (tmpSqr + tmpZp2) * e2InScale; - - float mean, vari; - mean = sum * dimRatio; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - int2 coord_bias = (int2)(0, 0); - vxc_half8 scale_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_short8 src1, outval; - short zp = inputZP; - half4 tmpVal0, tmpVal1; - vxc_half8 dst; - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_bias.x = coord.x; - - scale_f0 = read_imagef(scale, coord_bias); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - scale_f1 = read_imagef(scale, coord_bias); - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - tmpData0 *= input_scale; - tmpData1 *= input_scale; - tmpData2 *= input_scale; - tmpData3 *= input_scale; - - vxc_float4 norm; - tmpData0 -= mean; - norm = scale_f0 * vari * tmpData0 + bias_f0; - - scale_f0 = read_imagef(scale, coord_bias); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - _viv_asm(CONV, tmpVal0, norm); - - tmpData1 -= mean; - norm = scale_f1 * vari * tmpData1 + bias_f1; - - scale_f1 = read_imagef(scale, coord_bias); - bias_f1 = read_imagef(bias, coord_bias); - - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - UniPackFP16even_2x8); - _viv_asm(COPY, outval, dst, 16); - coord_out.x = coord.x; - VXC_OP4_NoDest(img_store_3d, output, coord_out, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - - tmpData2 -= mean; - norm = scale_f0 * vari * tmpData2 + bias_f0; - _viv_asm(CONV, tmpVal0, norm); - - tmpData3 -= mean; - norm = scale_f1 * vari * tmpData3 + bias_f1; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - UniPackFP16even_2x8); - _viv_asm(COPY, outval, dst, 16); - coord_out.x += 8; - VXC_OP4_NoDest(img_store_3d, output, coord_out, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel void layer_norm_U8toF16_2D( - image2d_t input, - image2d_t bias, - image2d_t scale, - image2d_t output, - float eps) -{ - int4 coord = (int4)(0, get_global_id(1), 0, 0); - vxc_uchar16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0; - vxc_int4 tmpSum1; - vxc_int4 tmpSqr1; - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - 
tmpSum += (tmpSum1.x); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); - } - sum = (tmpSum + sumInZp) * input_scale; - sqr = (tmpSqr + tmpZp2) * e2InScale; - - float mean, vari; - mean = sum * dimRatio; - vari = sqr*dimRatio - mean*mean; - vari += eps; - vari = rsqrt(vari); - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - int2 coord_bias = (int2)(0, 0); - vxc_half8 scale_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_short8 src1, outval; - short zp = inputZP; - half4 tmpVal0, tmpVal1; - vxc_half8 dst; - - int2 coord_out = (int2)(get_global_id(0), get_global_id(1)); - - for(coord.x = 0; coord.x < width; coord.x += 16) - { - coord_bias.x = coord.x; - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - scale_f0 = read_imagef(scale, coord_bias); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - scale_f1 = read_imagef(scale, coord_bias); - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - tmpData0 *= input_scale; - tmpData1 *= input_scale; - tmpData2 *= input_scale; - tmpData3 *= input_scale; - - vxc_float4 norm; - tmpData0 -= mean; - norm = scale_f0 * vari * tmpData0 + bias_f0; - - scale_f0 = read_imagef(scale, coord_bias); - bias_f0 = read_imagef(bias, coord_bias); - - coord_bias.x += 4; - _viv_asm(CONV, tmpVal0, norm); - - tmpData1 -= mean; - norm = scale_f1 * vari * tmpData1 + bias_f1; - - scale_f1 = read_imagef(scale, coord_bias); - bias_f1 = read_imagef(bias, coord_bias); - - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - UniPackFP16even_2x8); - _viv_asm(COPY, outval, dst, 16); - coord_out.x = coord.x; - VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - tmpData2 -= mean; - norm = scale_f0 * vari * tmpData2 + bias_f0; - _viv_asm(CONV, tmpVal0, norm); - - tmpData3 -= mean; - norm = scale_f1 * vari * tmpData3 + bias_f1; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - UniPackFP16even_2x8); - _viv_asm(COPY, outval, dst, 16); - coord_out.x += 8; - VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_f16.vx deleted file mode 100644 index d494b6d..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_f16.vx +++ /dev/null @@ -1,430 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; -_viv_uniform int width; - -_viv_uniform int height; - -_viv_uniform int height_depth; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; -_viv_uniform VXC_512Bits 
uniConvertInt32toUint8_2x8; -_viv_uniform float outputScale; -_viv_uniform float output_zp; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32( - image2d_array_t input, image2d_t output) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, 0); - vxc_short8 src0; - vxc_half8 in_h; - vxc_float4 sumsqr; - vxc_float4 tmpSumSqr = (vxc_float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.w, baseAddr_a); - - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - _viv_asm(COPY, in_h, src0, 16); - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - tmpSumSqr += sumsqr; - } - } - - lcl_sum[lidx] = tmpSumSqr.x; - lcl_sqr[lidx] = tmpSumSqr.y; - barrier(CLK_LOCAL_MEM_FENCE); - - int2 coord_out = (int2)(get_group_id(0) << 2, gidz); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - float sum = 0; - float sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32_2D( - image2d_array_t input, image2d_t output) -{ - int gidx = get_global_id(0) << 3; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int gidy = gidz * height; - - int2 coord = (int2)(gidx, gidy); - vxc_short8 src0; - vxc_half8 in_h; - vxc_float4 sumsqr; - vxc_float4 tmpSumSqr = (vxc_float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int endH = gidy + height; - if(gidx < width) - { - for(; coord.y < endH;) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - _viv_asm(COPY, in_h, src0, 16); - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniFp16SumSqr_dp8x2); - tmpSumSqr += sumsqr; - } - } - - lcl_sum[lidx] = tmpSumSqr.x; - lcl_sqr[lidx] = tmpSumSqr.y; - barrier(CLK_LOCAL_MEM_FENCE); - - int2 coord_out = (int2)(get_group_id(0) << 2, gidz); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - - float sum = 0; - float sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int2 coord_sum = (int2)(0, gidz); - int4 coord_para = coord; - coord_para.z = (ushort)gidz / (ushort)(height_depth); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; 
- vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_sum); - coord_sum.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - int4 coord_bias = coord_para; - - int8 input_desc, scale_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; - _viv_asm(MOV, coord_para.w, baseAddr_c); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - vxc_half8 dst; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - coord_para.y = coord.y; - coord_bias.y = coord.y; - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - - vxc_float4 sub, norm; - sub = tmpData0 - mean_vari.s0; - norm = scale_f0 * mean_vari.s1 * sub + bias_f0; - _viv_asm(CONV, tmpVal0, norm); - sub = tmpData1 - mean_vari.s0; - norm = scale_f1 * mean_vari.s1 * sub + bias_f1; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16_2D( - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_t output, float eps) -{ - int2 coord = (int2)(get_global_id(0), 0); - int2 coord_bias = (int2)(0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_bias); - coord_bias.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - coord_bias = coord; - - vxc_float4 tmpData0, tmpData1; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - vxc_half8 dst; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_bias.y = coord.y; - 
VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - - vxc_float4 sub, norm; - sub = tmpData0 - mean_vari.s0; - norm = scale_f0 * mean_vari.s1 * sub + bias_f0; - _viv_asm(CONV, tmpVal0, norm); - sub = tmpData1 - mean_vari.s0; - norm = scale_f1 * mean_vari.s1 * sub + bias_f1; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int2 coord_sum = (int2)(0, gidz); - int4 coord_para = coord; - coord_para.z = (ushort)gidz / (ushort)(height_depth); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_sum); - coord_sum.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - int4 coord_bias = coord_para; - - int8 input_desc, scale_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; - _viv_asm(MOV, coord_para.w, baseAddr_c); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - vxc_float4 tmpData0, tmpData1; - vxc_uchar16 outval; - vxc_int4 tmpVal0, tmpVal1; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - coord_para.y = coord.y; - coord_bias.y = coord.y; - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - 
uniConvertSecFp16Fp32_4x4); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - - vxc_float4 sub, norm; - sub = tmpData0 - mean_vari.s0; - norm = scale_f0 * mean_vari.s1 * sub + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - sub = tmpData1 - mean_vari.s0; - norm = scale_f1 * mean_vari.s1 * sub + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8_2D( - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_t output, float eps) -{ - int2 coord = (int2)(get_global_id(0), 0); - int2 coord_bias = (int2)(0, 0); - vxc_short8 src0; - vxc_short8 src1; - vxc_half8 scale_h, in_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_bias); - coord_bias.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - coord_bias = coord; - - vxc_float4 tmpData0, tmpData1; - vxc_uchar16 outval; - vxc_int4 tmpVal0, tmpVal1; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_bias.y = coord.y; - VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, in_h, src0, 16); - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - - vxc_float4 sub, norm; - sub = tmpData0 - mean_vari.s0; - norm = scale_f0 * mean_vari.s1 * sub + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - sub = tmpData1 - mean_vari.s0; - norm = scale_f1 * mean_vari.s1 * sub + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_i16.vx deleted file mode 100644 index 7c92a66..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_i16.vx +++ /dev/null @@ -1,268 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; -_viv_uniform float e2InScale; 
-_viv_uniform int width; - -_viv_uniform float input_scale; -_viv_uniform int height; - -_viv_uniform int height_depth; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform float outputScale; -_viv_uniform float output_zp; - -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform int inputZP; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32( - image2d_array_t input, image2d_t output) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, 0); - vxc_short8 src0; - float4 tmpSumSqr = (float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.w, baseAddr_a); - - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - tmpSumSqr += sumsqr; - } - tmpSumSqr.x *= input_scale; - tmpSumSqr.y *= e2InScale; - } - lcl_sum[lidx] = tmpSumSqr.x; - lcl_sqr[lidx] = tmpSumSqr.y; - barrier(CLK_LOCAL_MEM_FENCE); - - int2 coord_out = (int2)(get_group_id(0) << 2, gidz); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - float4 data = (float4)(0); - for(int i = 0; i < 4; i++) - { - data.x += dot(tmp_sum[i], one); - data.y += dot(tmp_sqr[i], one); - } - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32_2D( - image2d_t input, image2d_t output) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int gidy = gidz * height; - - int2 coord = (int2)(gidx, gidy); - vxc_short8 src0; - float4 tmpSumSqr = (float4)(0); - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int endH = gidy + height; - if(gidx < width) - { - for(; coord.y < endH;) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord.y++; - vxc_float4 sumsqr; - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ - uniInt16SumSqr_dp8x2); - tmpSumSqr += sumsqr; - } - tmpSumSqr.x *= input_scale; - tmpSumSqr.y *= e2InScale; - } - lcl_sum[lidx] = tmpSumSqr.x; - lcl_sqr[lidx] = tmpSumSqr.y; - barrier(CLK_LOCAL_MEM_FENCE); - - int2 coord_out = (int2)(get_group_id(0) << 2, gidz); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - float4 data = (float4)(0); - for(int i = 0; i < 4; i++) - { - data.x += dot(tmp_sum[i], one); - data.y += dot(tmp_sqr[i], one); - } - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16( - image2d_array_t input, image2d_array_t bias, 
image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int2 coord_sum = (int2)(0, gidz); - int4 coord_para = coord; - coord_para.z = (ushort)gidz / (ushort)(height_depth); - vxc_short8 src0, src1, outval; - vxc_half8 scale_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_sum); - coord_sum.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - int4 coord_bias = coord_para; - - int8 input_desc, scale_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; - _viv_asm(MOV, coord_para.w, baseAddr_c); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, norm; - vxc_int4 tmpVal0, tmpVal1; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - coord_para.y = coord.y; - coord_bias.y = coord.y; - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - tmpData0 = tmpData0 * input_scale - mean_vari.s0; - tmpData1 = tmpData1 * input_scale - mean_vari.s0; - - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16_2D( - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_t output, float eps) -{ - int2 coord = (int2)(get_global_id(0), 0); - int2 coord_bias = (int2)(0, 0); - vxc_short8 src0, src1, outval; - vxc_half8 scale_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_bias); - coord_bias.x += 4; - } - mean_vari *= dimRatio; - 
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - coord_bias = coord; - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, norm; - vxc_int4 tmpVal0, tmpVal1; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_bias.y = coord.y; - VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - tmpData0 = tmpData0 * input_scale - mean_vari.s0; - tmpData1 = tmpData1 * input_scale - mean_vari.s0; - - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_u8.vx deleted file mode 100644 index 4c9e46b..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_wh_u8.vx +++ /dev/null @@ -1,423 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniSumU8_16x1; -_viv_uniform VXC_512Bits uniSqrSum_16x1; -_viv_uniform int sumInZp; -_viv_uniform int tmpZp1; -_viv_uniform float e2InScale; -_viv_uniform float rowSumScale; -_viv_uniform int width; - -_viv_uniform float input_scale; -_viv_uniform int height; - -_viv_uniform int height_depth; -_viv_uniform float dimRatio; -_viv_uniform int group_num; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform float outputScale; -_viv_uniform float output_zp; - -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; -_viv_uniform int inputZP; - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32( - image2d_array_t input, image2d_t output) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int4 coord = (int4)(gidx, 0, gidz, 0); - vxc_uchar16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord.w, baseAddr_a); - - if(gidx < width) - { - for(coord.y = 0; coord.y < height;) - { - 
VXC_OP4(img_load_3d, src0, input, coord.xywz, 0, - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); - } - sqr += (tmpSqr * e2InScale + rowSumScale); - sum = (tmpSum + sumInZp) * input_scale; - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int2 coord_out = (int2)(get_group_id(0) << 2, gidz); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32_2D( - image2d_t input, image2d_t output) -{ - int gidx = get_global_id(0) << 4; - int lidx = get_local_id(0); - int gidz = get_global_id(1); - int gidy = gidz * height; - - int2 coord = (int2)(gidx, gidy); - vxc_uchar16 src0; - float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; - - __local float lcl_sum[16]; - __local float lcl_sqr[16]; - - int endH = gidy + height; - if(gidx < width) - { - for(; coord.y < endH;) - { - VXC_ReadImage(src0, input, coord, 0, - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); - tmpSum += (tmpSum1); - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); - } - sqr += (tmpSqr * e2InScale + rowSumScale); - sum = (tmpSum + sumInZp) * input_scale; - } - lcl_sum[lidx] = sum; - lcl_sqr[lidx] = sqr; - barrier(CLK_LOCAL_MEM_FENCE); - - int2 coord_out = (int2)(get_group_id(0) << 2, gidz); - if(lidx == 0) - { - float4 one = (float4)(1, 1, 1, 1); - __local float4* tmp_sum = (__local float4*)lcl_sum; - __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; - for(int i = 0; i < 4; i++) - { - sum += dot(tmp_sum[i], one); - sqr += dot(tmp_sqr[i], one); - } - float4 data = (float4)(sum, sqr, 0, 0); - write_imagef(output, coord_out, data); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF16( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int2 coord_sum = (int2)(0, gidz); - int4 coord_para = coord; - coord_para.z = (ushort)gidz / (ushort)(height_depth); - vxc_uchar16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_sum); - coord_sum.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - int4 coord_bias = coord_para; - - int8 input_desc, scale_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord_in.z * input_desc.s4 + 
input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; - _viv_asm(MOV, coord_para.w, baseAddr_c); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, norm; - half4 tmpVal0, tmpVal1; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, 0, - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - coord_para.y = coord.y; coord_bias.y = coord.y; - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0, - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - tmpData0 = tmpData0 * input_scale - mean_vari.s0; - tmpData1 = tmpData1 * input_scale - mean_vari.s0; - - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; - _viv_asm(CONV, tmpVal1, norm); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF16_2D( - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_t output, float eps) -{ - int2 coord = (int2)(get_global_id(0), 0); - int2 coord_bias = (int2)(0, 0); - vxc_uchar16 src0; - vxc_short8 src1, outval; - vxc_half8 scale_h, dst; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_bias); - coord_bias.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - coord_bias = coord; - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, norm; - half4 tmpVal0, tmpVal1; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_ReadImage(src0, input, coord, 0,\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_bias.y = coord.y; - VXC_ReadImage(src1, scale, coord, 0,\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - 
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - tmpData0 = tmpData0 * input_scale - mean_vari.s0; - tmpData1 = tmpData1 * input_scale - mean_vari.s0; - - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; - _viv_asm(CONV, tmpVal0, norm); - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; - _viv_asm(CONV, tmpVal1, norm); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8( - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, - image2d_array_t output, float eps) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); - int2 coord_sum = (int2)(0, gidz); - int4 coord_para = coord; - coord_para.z = (ushort)gidz / (ushort)(height_depth); - vxc_uchar16 src0 , outval; - vxc_short8 src1; - vxc_half8 scale_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_sum); - coord_sum.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - int4 coord_bias = coord_para; - - int8 input_desc, scale_desc, output_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.z, baseAddr_a); - - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; - _viv_asm(MOV, coord_para.w, baseAddr_c); - - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord.z, baseAddr); - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, norm; - vxc_int4 tmpVal0, tmpVal1; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_OP4(img_load_3d, src0, input, coord_in, 0, - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_in.y ++; - coord_para.y = coord.y; - coord_bias.y = coord.y; - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0, - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - tmpData0 = tmpData0 * input_scale - mean_vari.s0; - tmpData1 = tmpData1 * input_scale - mean_vari.s0; - - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 
0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8_2D( - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_t output, float eps) -{ - int2 coord = (int2)(get_global_id(0), 0); - int2 coord_bias = (int2)(0, 0); - vxc_uchar16 src0, outval; - vxc_short8 src1; - vxc_half8 scale_h; - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; - vxc_float4 mean_vari = (vxc_float4)(0); - - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_bias); - coord_bias.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - coord_bias = coord; - - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, norm; - vxc_int4 tmpVal0, tmpVal1; - - for(coord.y = 0; coord.y < height; coord.y++) - { - VXC_ReadImage(src0, input, coord, 0,\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_bias.y = coord.y; - VXC_ReadImage(src1, scale, coord, 0,\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - bias_f0 = read_imagef(bias, coord_bias); - coord_bias.x += 4; - bias_f1 = read_imagef(bias, coord_bias); - coord_bias.x = coord.x; - - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - tmpData0 = tmpData0 * input_scale - mean_vari.s0; - tmpData1 = tmpData1 * input_scale - mean_vari.s0; - - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); - - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_bf16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_bf16.vx index 433dc4f..a25eb64 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_bf16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_bf16.vx @@ -76,6 +76,7 @@ __kernel void gemm_BF16BF16toBF16(image2d_array_t inputA, sum3 += (tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); } coord_b.y = gidy; + coord_b.z = get_global_id(2); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_b.w, baseAddr); @@ -153,6 +154,7 @@ __kernel void gemm_transa_BF16BF16toBF16( sum3 = (sum3 + tempA0.w * tempB0); } coord_b.y = gidy; + coord_b.z = get_global_id(2); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_b.w, baseAddr); @@ -187,7 +189,7 @@ __kernel void gemm_transb_BF16BF16toBF16(image2d_array_t inputA, int adjointB, uint M, uint K, uint N) { - int4 
coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0); int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx index bd211d4..bdedc7a 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16.vx @@ -82,6 +82,7 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA, sum3 += (tempA3); } coord_b.y = gidy; + coord_b.z = get_global_id(2); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_b.w, baseAddr); @@ -170,6 +171,7 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA, sum3 += (tempA3 + tempB3); } coord_b.y = gidy; + coord_b.z = get_global_id(2); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_b.w, baseAddr); @@ -244,7 +246,7 @@ __kernel void gemm_F32F32toF32( sum2 += (tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3); sum3 += (tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); } - coord_b = (int4)(gidx, gidy, get_global_id(2), 0); + coord_b = (int4)(gidx, gidy, get_global_id(2), get_global_id(2)); write_imagef(output, coord_b, sum0); coord_b.y++; write_imagef(output, coord_b, sum1); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16f16_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16f16_u8.vx index 1929119..0e9bcd5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16f16_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16f16_u8.vx @@ -82,6 +82,7 @@ __kernel void gemm_F16F16to##dst_type_name( \ vxc_int4 tmpOut0, tmpOut1; \ write_type outC; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ @@ -171,6 +172,7 @@ __kernel void gemm_F16F16to##dst_type_name( \ vxc_int4 tmpOut0, tmpOut1; \ write_type outC; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16i16_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16i16_i16.vx index 7cdf087..520f70d 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16i16_i16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16i16_i16.vx @@ -79,6 +79,7 @@ __kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \ } \ vxc_int4 tmpOut0, tmpOut1; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ @@ -167,6 +168,7 @@ __kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \ } \ vxc_int4 tmpOut0, tmpOut1; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, 
sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_f16.vx index 515d2fb..322f474 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_f16.vx @@ -75,6 +75,7 @@ __kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \ sum2 *= input1Scale; \ sum3 *= input1Scale; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ @@ -171,6 +172,7 @@ __kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \ sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \ } \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_u8.vx index 39ddada..b9e803e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_f16u8_u8.vx @@ -70,6 +70,7 @@ __kernel void gemm_F16##src1_type_name##to##src1_type_name(image2d_array_t input } \ vxc_int4 tmpOut0, tmpOut1; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx index 7792e92..b4db308 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx @@ -79,6 +79,7 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \ } \ vxc_int4 tmpOut0, tmpOut1; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transA.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transA.vx index 2fb3d26..d55fa59 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transA.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transA.vx @@ -65,6 +65,7 @@ __kernel void gemm_transa_##src0_type_name##src1_type_name##to##dst_type_name( \ } \ vxc_int4 tmpOut0, tmpOut1; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ @@ -150,6 +151,7 @@ __kernel void gemm_transa_##src0_type_name##F16to##src0_type_name( \ } \ vxc_int4 tmpOut0, tmpOut1; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ @@ -232,6 +234,7 @@ 
__kernel void gemm_transa_F16F16toF16( sum3 = (sum3 + tempA0.w * tempB0); } coord_b.y = gidy; + coord_b.z = get_global_id(2); _viv_asm(COPY, output_desc, output, sizeof(output_desc)); int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_b.w, baseAddr); @@ -257,4 +260,4 @@ __kernel void gemm_transa_F16F16toF16( _viv_asm(COPY, outC, valC, 16); VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); -} \ No newline at end of file +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16.vx index 8548fe7..2e7aab1 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16.vx @@ -15,7 +15,7 @@ __kernel void gemm_transb_F16F16toF16(image2d_array_t inputA, int adjointB, uint M, uint K, uint N) { - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0); int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16_mix.vx index 1c6ad3d..e7be4f4 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_f16_mix.vx @@ -148,7 +148,7 @@ __kernel void gemm_transb_F16U8toU8(image2d_array_t inputA, int adjointB, uint M, uint K, uint N) { - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0); int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_u8_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_u8_mix.vx index 71bd242..50e992f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_u8_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_transB_u8_mix.vx @@ -19,7 +19,7 @@ __kernel void gemm_transb_U8U8toF16(image2d_array_t inputA, int adjointB, uint M, uint K, uint N) { - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0); int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0); @@ -154,7 +154,7 @@ __kernel void gemm_transb_U8U8toU8(image2d_array_t inputA, int adjointB, uint M, uint K, uint N) { - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0); int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 
0 : get_global_id(2)), 0); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8.vx index 1b1e92f..64fe053 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8.vx @@ -74,6 +74,7 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \ } \ vxc_int4 tmpOut0, tmpOut1; \ coord_b.y = get_global_id(1); \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_f16.vx index 021ff4b..f8aa096 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_f16.vx @@ -85,6 +85,7 @@ __kernel void gemm_##src0_type_name##F16toF16( \ sum2 *= input0Scale; \ sum3 *= input0Scale; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)gidz * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ @@ -190,6 +191,7 @@ __kernel void gemm_##src0_type_name##F16toF16( \ sum2 *= input0Scale; \ sum3 *= input0Scale; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)gidz * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_u8.vx index 6cdf89e..7617e4e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8f16_u8.vx @@ -79,6 +79,7 @@ __kernel void gemm_##src0_type_name##F16to##src0_type_name( \ } \ vxc_int4 tmpOut0, tmpOut1; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ @@ -177,6 +178,7 @@ __kernel void gemm_##src0_type_name##F16to##src0_type_name( \ } \ vxc_int4 tmpOut0, tmpOut1; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8u8_f16.vx index 3816d56..ce3838e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8u8_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_u8u8_f16.vx @@ -10,6 +10,7 @@ _viv_uniform int bc2zero; _viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4; _viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4; _viv_uniform float input01Scale; +_viv_uniform float mulKIn0In1Zp; #define GEMM_QINT_TO_F16(src0_type_name, read_type) \ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \ @@ -23,10 +24,8 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \ \ int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \ int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0); \ - vxc_float4 sum0 = (vxc_float4)(0); \ - vxc_float4 sum1 = (vxc_float4)(0); \ - vxc_float4 sum2 = (vxc_float4)(0); \ - vxc_float4 sum3 = (vxc_float4)(0); \ + vxc_float4 sum0 = (vxc_float4)(mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp); \ + vxc_float4 sum1 = sum0, sum2 = sum0, sum3 = sum0; \ \ int8 inputA_desc, inputB_desc, output_desc; \ _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \ @@ -84,6 +83,7 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \ sum2 *= input01Scale; \ sum3 *= input01Scale; \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ @@ -185,6 +185,7 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \ sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \ } \ coord_b.y = gidy; \ + coord_b.z = get_global_id(2); \ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \ _viv_asm(MOV, coord_b.w, baseAddr); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/mod.vx b/src/tim/vx/internal/src/libnnext/ops/vx/mod.vx new file mode 100644 index 0000000..163c840 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/mod.vx @@ -0,0 +1,185 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniConvertFstToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertSecToFp32_4x4; + +_viv_uniform float in_scale0; +_viv_uniform float in_scale1; +_viv_uniform float out_scale; +_viv_uniform float in0Tail; +_viv_uniform float in1Tail; +_viv_uniform float out_zp; + +#define MOD_PROCESS(dst_type, save_type, read_type, copy_type, conv_mode, IN0_SCALE, IN0_TAIL,\ + IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET, read_fun, write_fun) \ + save_type data; \ + read_type read_data0, read_data1; \ + copy_type tmpData0, tmpData1; \ + vxc_float4 in0Val1, in0Val2, in1Val1, in1Val2; \ + vxc_float4 tmpVal1, tmpVal2; \ + dst_type tmpOut1, tmpOut2; \ + read_fun(read_data0, input0, coord, VXC_5BITOFFSET_XY(0,0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, tmpData0, read_data0, 16); \ + read_fun(read_data1, input1, coord, VXC_5BITOFFSET_XY(0,0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, tmpData1, read_data1, 16); \ + VXC_DP4x4(in0Val1, tmpData0, tmpData0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \ + VXC_DP4x4(in0Val2, tmpData0, tmpData0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \ + VXC_DP4x4(in1Val1, tmpData1, tmpData1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \ + VXC_DP4x4(in1Val2, tmpData1, tmpData1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \ + in0Val1 = in0Val1 * IN0_SCALE + IN0_TAIL; \ + in0Val2 = in0Val2 * IN0_SCALE + IN0_TAIL; \ + in1Val1 = in1Val1 * IN1_SCALE + IN1_TAIL; \ + in1Val2 = in1Val2 * IN1_SCALE + IN1_TAIL; \ + if (isfmod) \ + { \ + tmpVal1 = fmod(in0Val1, in1Val1) * OUT_SCALE + OUT_OFFSET; \ + tmpVal2 = fmod(in0Val2, in1Val2) * OUT_SCALE + OUT_OFFSET; \ + } \ + else \ + { \ + tmpVal1 = (in0Val1 - in1Val1 * floor(in0Val1 / in1Val1)) * OUT_SCALE + OUT_OFFSET; \ + tmpVal2 = (in0Val2 - in1Val2 * floor(in0Val2 / in1Val2)) * OUT_SCALE + OUT_OFFSET; \ + } \ + _viv_asm(conv_mode, tmpOut1, tmpVal1); \ + 
_viv_asm(conv_mode, tmpOut2, tmpVal2); \ + VXC_DP2x8(data, tmpOut1, tmpOut2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \ + write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + +#define TENSOR_MOD(src0_name, src1_name, dst_name, dst_type, save_type, read_type, copy_type, \ + conv_mode, IN0_SCALE, IN0_TAIL, IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET) \ +__kernel void mod_##src0_name##src1_name##to##dst_name \ + ( \ + image2d_array_t input0, \ + image2d_array_t input1, \ + image2d_array_t output, \ + int isfmod \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + MOD_PROCESS(dst_type, save_type, read_type, copy_type, conv_mode, IN0_SCALE, IN0_TAIL,\ + IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \ +} + + +TENSOR_MOD(F16, F16, F16, half4, vxc_short8, vxc_short8,\ + vxc_half8, CONV, 1, 0, 1, 0, 1, 0) +TENSOR_MOD(F16, F16, I16, short4, vxc_short8, vxc_short8,\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp) +TENSOR_MOD(F16, F16, I8, char4, vxc_char8, vxc_short8,\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp) +TENSOR_MOD(F16, F16, U8, uchar4, vxc_uchar8, vxc_short8,\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp) + +TENSOR_MOD(I16, I16, I16, short4, vxc_short8, vxc_short8,\ + vxc_short8, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp) +TENSOR_MOD(I16, I16, F16, half4, vxc_short8, vxc_short8,\ + vxc_short8, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0) + +TENSOR_MOD(I8, I8, I8, char4, vxc_char8, vxc_char16,\ + vxc_char16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp) +TENSOR_MOD(I8, I8, F16, half4, vxc_short8, vxc_char16,\ + vxc_char16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0) + +TENSOR_MOD(U8, U8, U8, uchar4, vxc_uchar8, vxc_uchar16,\ + vxc_uchar16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp) +TENSOR_MOD(U8, U8, F16, half4, vxc_short8, vxc_uchar16,\ + vxc_uchar16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0) + + +#define TENSOR_MOD_2D(src0_name, src1_name, dst_name, dst_type, save_type, read_type, copy_type, \ + conv_mode, IN0_SCALE, IN0_TAIL, IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET) \ +__kernel void mod_##src0_name##src1_name##to##dst_name##_2D \ + ( \ + image2d_array_t input0, \ + image2d_array_t input1, \ + image2d_array_t output, \ + int isfmod \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + MOD_PROCESS(dst_type, save_type, read_type, copy_type, conv_mode, IN0_SCALE, IN0_TAIL,\ + IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET, VXC_ReadImage, VXC_WriteImage); \ +} + + +TENSOR_MOD_2D(F16, F16, F16, half4, vxc_short8, vxc_short8,\ + vxc_half8, CONV, 1, 0, 1, 0, 1, 0) +TENSOR_MOD_2D(F16, F16, I16, short4, vxc_short8, vxc_short8,\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp) +TENSOR_MOD_2D(F16, F16, I8, char4, vxc_char8, vxc_short8,\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp) +TENSOR_MOD_2D(F16, F16, U8, uchar4, vxc_uchar8, vxc_short8,\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp) + +TENSOR_MOD_2D(I16, I16, I16, short4, vxc_short8, vxc_short8,\ + vxc_short8, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp) +TENSOR_MOD_2D(I16, I16, F16, half4, vxc_short8, vxc_short8,\ + vxc_short8, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0) + +TENSOR_MOD_2D(I8, I8, I8, char4, vxc_char8, vxc_char16,\ + vxc_char16, CONV_SAT_RTE, 
in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp) +TENSOR_MOD_2D(I8, I8, F16, half4, vxc_short8, vxc_char16,\ + vxc_char16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0) + +TENSOR_MOD_2D(U8, U8, U8, uchar4, vxc_uchar8, vxc_uchar16,\ + vxc_uchar16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp) +TENSOR_MOD_2D(U8, U8, F16, half4, vxc_short8, vxc_uchar16,\ + vxc_uchar16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0) + + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +#define MOD_BF16_PROCESS(read_fun, write_fun) \ + vxc_short8 read_data0, read_data1, vec0; \ + vxc_float4 in0Val1, in0Val2, in1Val1, in1Val2; \ + vxc_float4 tmpVal1, tmpVal2; \ + vxc_ushort8 dst0, dst1; \ + vxc_ushort8 vect; \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + read_fun(read_data0, input0, coord, VXC_5BITOFFSET_XY(0,0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(vec0, read_data0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, in0Val1, vec0, 16); \ + VXC_DP2x8(vec0, read_data0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, in0Val2, vec0, 16); \ + read_fun(read_data1, input1, coord, VXC_5BITOFFSET_XY(0,0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(vec0, read_data1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, in1Val1, vec0, 16); \ + VXC_DP2x8(vec0, read_data1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, in1Val2, vec0, 16); \ + tmpVal1 = fmod(in0Val1, in1Val1); \ + tmpVal2 = fmod(in0Val2, in1Val2); \ + _viv_asm(COPY, dst0, tmpVal1, 16); \ + _viv_asm(COPY, dst1, tmpVal2, 16); \ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + write_fun(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void mod_BF16BF16toBF16 + ( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output, + int isfmod + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + MOD_BF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray); +} + +__kernel void mod_BF16BF16toBF16_2D + ( + image2d_array_t input0, + image2d_array_t input1, + image2d_array_t output, + int isfmod + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + MOD_BF16_PROCESS(VXC_ReadImage, VXC_WriteImage); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow.vx new file mode 100644 index 0000000..19873f1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pow.vx @@ -0,0 +1,247 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4; +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform float input0_scale; +_viv_uniform float input1_scale; +_viv_uniform float input0_tail; +_viv_uniform float input1_tail; +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define POW_SH_IMPL(name, src0_type, copy0_type, src1_type, copy1_type, dst_type, copy2_type, conv_type) \ +__kernel void pow_##name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output \ + ) \ +{ \ + int4 
coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \ + \ + src0_type src0; \ + copy0_type data0; \ + src0_type src1; \ + copy0_type data1; \ + VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data0, src0, 16); \ + VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data1, src1, 16); \ + float4 x0, x1; \ + float4 y0, y1; \ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); \ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); \ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); \ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); \ + x0 = x0 * input0_scale + input0_tail; \ + x1 = x1 * input0_scale + input0_tail; \ + y0 = y0 * input1_scale + input1_tail; \ + y1 = y1 * input1_scale + input1_tail; \ + float4 s0 = sign(x0); \ + float4 s1 = sign(x1); \ + int4 t0 = convert_int4(y0) & 1; \ + int4 t1 = convert_int4(y1) & 1; \ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; \ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; \ + x0 = s0 * exp2(y0 * log2(fabs(x0))); \ + x1 = s1 * exp2(y1 * log2(fabs(x1))); \ + x0 = x0 * output_scale + output_zp; \ + x1 = x1 * output_scale + output_zp; \ + \ + conv_type tmpVal0, tmpVal1; \ + _viv_asm(CONV_RTE, tmpVal0, x0); \ + _viv_asm(CONV_RTE, tmpVal1, x1); \ + dst_type dst0; \ + \ + copy2_type dst; \ + VXC_DP2x8(dst0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + _viv_asm(COPY, dst, dst0, 16); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +POW_SH_IMPL(F16_F16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(F16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_short8, vxc_short8, int4) +POW_SH_IMPL(F16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, int4) +POW_SH_IMPL(F16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, int4) +POW_SH_IMPL(F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(F16_I16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, int4) +POW_SH_IMPL(I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, vxc_short8, int4) +POW_SH_IMPL(I16_I16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, int4) +POW_SH_IMPL(F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(F16_I8toI8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_char16, vxc_char16, int4) +POW_SH_IMPL(I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(I8_F16toI8, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, int4) +POW_SH_IMPL(I8_I8toF16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(I8_I8toI8, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, int4) 
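For reference, the POW_SH_IMPL instantiations above and below all expand to the same per-lane arithmetic: dequantize each input with an affine map (q * scale + tail), evaluate a sign-corrected power via exp2/log2, then requantize with the output scale and zero point. A minimal scalar C sketch of that math is given here; the helper names and the scale/tail/zp values are illustrative examples only, not the driver's actual quantization parameters.

/* Scalar C model of the math the quantized pow_* kernels implement per lane.
 * Names are illustrative; real scales and zero points come from tensor
 * quantization parameters supplied by the host side. */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Dequantize: the kernels fold (q - zero_point) * scale into q * scale + tail. */
static float dequant(int32_t q, float scale, float tail)
{
    return (float)q * scale + tail;
}

/* Sign-corrected pow: x^y = s * exp2(y * log2(|x|)), where s is -1 only when
 * x < 0 and trunc(y) is odd -- the same selection the .vx code makes with
 * t = convert_int4(y) & 1 and the nested ternaries on s0/s1. */
static float pow_signed(float x, float y)
{
    float s = (x < 0.0f) ? (((int)y & 1) ? -1.0f : 1.0f) : 1.0f;
    return s * exp2f(y * log2f(fabsf(x)));
}

/* Requantize with round-to-nearest, as convert_int4_rte / CONV_SAT_RTE do
 * (saturation to the destination type is omitted in this sketch). */
static int32_t requant(float v, float out_scale, float out_zp)
{
    return (int32_t)lrintf(v * out_scale + out_zp);
}

int main(void)
{
    /* Example quantization parameters only. */
    float in0_scale = 0.05f, in0_tail = -1.0f;
    float in1_scale = 0.10f, in1_tail = 0.0f;
    float out_scale = 4.0f,  out_zp   = 128.0f;

    int32_t q0 = 30, q1 = 20;                    /* raw quantized inputs */
    float x = dequant(q0, in0_scale, in0_tail);  /* 0.5 */
    float y = dequant(q1, in1_scale, in1_tail);  /* 2.0 */
    float r = pow_signed(x, y);                  /* 0.25 */
    printf("pow(%g, %g) = %g -> q = %d\n", x, y, r,
           requant(r, out_scale, out_zp));       /* q = 129 */
    return 0;
}

The mod.vx kernels earlier in this diff wrap their arithmetic in the same dequantize/requantize pattern, switching between fmod(x, y) and the floor-based remainder x - y * floor(x / y) according to the isfmod flag.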
+POW_SH_IMPL(F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(F16_U8toU8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4) +POW_SH_IMPL(U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(U8_F16toU8, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, int4) +POW_SH_IMPL(U8_U8toF16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8, half4) +POW_SH_IMPL(U8_U8toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4) + +#define POW_SH_IMPL_2D(name, src0_type, copy0_type, src1_type, copy1_type, dst_type, copy2_type, conv_type) \ +__kernel void pow_##name##_2D \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + src0_type src0; \ + copy0_type data0; \ + src0_type src1; \ + copy0_type data1; \ + VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data0, src0, 16); \ + VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, data1, src1, 16); \ + float4 x0, x1; \ + float4 y0, y1; \ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); \ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); \ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); \ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); \ + x0 = x0 * input0_scale + input0_tail; \ + x1 = x1 * input0_scale + input0_tail; \ + y0 = y0 * input1_scale + input1_tail; \ + y1 = y1 * input1_scale + input1_tail; \ + float4 s0 = sign(x0); \ + float4 s1 = sign(x1); \ + int4 t0 = convert_int4(y0) & 1; \ + int4 t1 = convert_int4(y1) & 1; \ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; \ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; \ + x0 = s0 * exp2(y0 * log2(fabs(x0))); \ + x1 = s1 * exp2(y1 * log2(fabs(x1))); \ + x0 = x0 * output_scale + output_zp; \ + x1 = x1 * output_scale + output_zp; \ + \ + conv_type tmpVal0, tmpVal1; \ + _viv_asm(CONV_RTE, tmpVal0, x0); \ + _viv_asm(CONV_RTE, tmpVal1, x1); \ + dst_type dst0; \ + \ + copy2_type dst; \ + VXC_DP2x8(dst0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + _viv_asm(COPY, dst, dst0, 16); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +POW_SH_IMPL_2D(F16_F16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(F16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_short8, vxc_short8, int4) +POW_SH_IMPL_2D(F16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, int4) +POW_SH_IMPL_2D(F16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, int4) +POW_SH_IMPL_2D(F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(F16_I16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, int4) +POW_SH_IMPL_2D(I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, vxc_short8, int4) +POW_SH_IMPL_2D(I16_I16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, int4) +POW_SH_IMPL_2D(F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(F16_I8toI8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_char16, vxc_char16, int4) +POW_SH_IMPL_2D(I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(I8_F16toI8, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, int4) +POW_SH_IMPL_2D(I8_I8toF16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(I8_I8toI8, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, int4) +POW_SH_IMPL_2D(F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(F16_U8toU8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4) +POW_SH_IMPL_2D(U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(U8_F16toU8, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, int4) +POW_SH_IMPL_2D(U8_U8toF16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8, half4) +POW_SH_IMPL_2D(U8_U8toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4) + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +__kernel void pow_BF16_BF16toBF16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_ushort8 src0, src1, dst, tmpData; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + 
VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + + VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, x0, tmpData, 16); + VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, x1, tmpData, 16); + + VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, y0, tmpData, 16); + VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, y1, tmpData, 16); + + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + _viv_asm(COPY, src0, tmpDst0, 16); + _viv_asm(COPY, src1, tmpDst1, 16); + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pow_BF16_BF16toBF16_2D + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_ushort8 src0, src1, dst, tmpData; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + float4 x0, x1; + float4 y0, y1; + float4 tmpDst0, tmpDst1; + + VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, x0, tmpData, 16); + VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, x1, tmpData, 16); + + VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, y0, tmpData, 16); + VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, y1, tmpData, 16); + + float4 s0 = sign(x0); + float4 s1 = sign(x1); + int4 t0 = convert_int4(y0) & 1; + int4 t1 = convert_int4(y1) & 1; + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; + tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); + tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); + + _viv_asm(COPY, src0, tmpDst0, 16); + _viv_asm(COPY, src1, tmpDst1, 16); + VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16.vx deleted file mode 100644 index 8180085..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16.vx +++ /dev/null @@ -1,338 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; -_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertSecUint8SubZpToFp32_4x4_2; - -_viv_uniform int input_ZP1; - -_viv_uniform float output_ZP; -_viv_uniform float outputScale; - -__kernel void pow_F16F16toF16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, src1; - vxc_short8 dst; - vxc_half8 data0, data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16F16toF16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, src1; - vxc_short8 dst; - vxc_half8 data0, data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16F16toU8( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, src1; - vxc_uchar8 dst; - vxc_half8 data0, data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16F16toU8_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, src1; - vxc_uchar8 dst; - vxc_half8 data0, data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16U8toF16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0; - vxc_uchar8 src1; - vxc_short8 dst; - vxc_half8 data0; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in1_zp; - _viv_asm(COPY, in1_zp, input_ZP1, 4); - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2); - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(data0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, dst, data0, 16); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16U8toF16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0; - vxc_uchar8 src1; - vxc_short8 dst; - vxc_half8 data0; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in1_zp; - _viv_asm(COPY, in1_zp, input_ZP1, 4); - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2); - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(data0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); - _viv_asm(COPY, dst, data0, 16); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16U8toU8( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0; - vxc_uchar8 src1, dst; - vxc_half8 data0; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in1_zp; - _viv_asm(COPY, in1_zp, input_ZP1, 4); - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2); - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16U8toU8_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0; - vxc_uchar8 src1, dst; - vxc_half8 data0; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in1_zp; - _viv_asm(COPY, in1_zp, input_ZP1, 4); - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2); - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16_i16.vx deleted file mode 100644 index f877637..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16_i16.vx +++ /dev/null @@ -1,322 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2; - -_viv_uniform float outScale_fl; - -__kernel void pow_F16F16toI16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, src1, dst; - vxc_half8 data0, data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - 
VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16F16toI16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, src1, dst; - vxc_half8 data0, data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16I16toF16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, src1, dst; - vxc_half8 data0; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16I16toF16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, src1, dst; - vxc_half8 data0; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16I16toI16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, src1, dst; - vxc_half8 data0; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16I16toI16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, src1, dst; - vxc_half8 data0; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; -_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; -_viv_uniform VXC_512Bits uniExtractOddData_2x8; - -__kernel void pow_BF16BF16toBF16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_ushort8 src0, src1, dst, tmpData; - vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - - VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); - _viv_asm(COPY, x0, tmpData, 16); - VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); - _viv_asm(COPY, x1, tmpData, 16); - - VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); - _viv_asm(COPY, y0, tmpData, 16); - VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); - _viv_asm(COPY, y1, tmpData, 16); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - _viv_asm(COPY, src0, tmpDst0, 16); - _viv_asm(COPY, src1, tmpDst1, 16); - VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_BF16BF16toBF16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_ushort8 src0, src1, dst, tmpData; - vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - - VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); - _viv_asm(COPY, x0, tmpData, 16); - VXC_DP2x8(tmpData, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); - _viv_asm(COPY, x1, tmpData, 16); - - VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); - _viv_asm(COPY, y0, tmpData, 16); - VXC_DP2x8(tmpData, src1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); - _viv_asm(COPY, y1, tmpData, 16); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - _viv_asm(COPY, src0, tmpDst0, 16); - _viv_asm(COPY, src1, tmpDst1, 16); - VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16_i8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16_i8.vx deleted file mode 100644 index 4b1e7fc..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pow_fp16_i8.vx +++ /dev/null @@ -1,239 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform float outScale_fl; - -__kernel void pow_F16F16toI8( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, src1; - vxc_char8 dst; - vxc_half8 data0, data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16F16toI8_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, src1; - vxc_char8 dst; - vxc_half8 data0, data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16I8toF16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, dst; - vxc_char8 src1; - vxc_half8 data0; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16I8toF16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, dst; - vxc_char8 src1; - vxc_half8 data0; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16I8toI8( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0; - vxc_char8 src1, dst; - vxc_half8 data0; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_F16I8toI8_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0; - vxc_char8 src1, dst; - vxc_half8 data0; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data0, src0, 16); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow_i16.vx deleted file mode 100644 index f336106..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pow_i16.vx +++ /dev/null @@ -1,227 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform float outScale_fl; - -__kernel void pow_I16F16toF16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, src1, dst; - vxc_half8 data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I16F16toF16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, src1, dst; - vxc_half8 data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I16F16toI16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, src1, dst; - vxc_half8 data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I16F16toI16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, src1, dst; - vxc_half8 data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I16I16toI16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_short8 src0, src1, dst; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I16I16toI16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_short8 src0, src1, dst; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow_i8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow_i8.vx deleted file mode 100644 index 89ecade..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pow_i8.vx +++ /dev/null @@ -1,231 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform float outScale_fl; - -__kernel void pow_I8F16toF16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_char8 src0; - vxc_short8 src1, dst; - vxc_half8 data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I8F16toF16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_char8 src0; - vxc_short8 src1, dst; - vxc_half8 data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I8F16toI8( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_char8 src0, dst; - vxc_short8 src1; - vxc_half8 data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? 
convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I8F16toI8_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_char8 src0, dst; - vxc_short8 src1; - vxc_half8 data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I8I8toI8( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_char8 src0, src1, dst; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_I8I8toI8_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_char8 src0, src1, dst; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pow_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pow_u8.vx deleted file mode 100644 index 44e7ca3..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pow_u8.vx +++ /dev/null @@ -1,349 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertSecUint8SubZpToFp32_4x4; -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; - -_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4_2; -_viv_uniform VXC_512Bits uniConvertSecUint8SubZpToFp32_4x4_2; - -_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; - -_viv_uniform int input_ZP0; -_viv_uniform int input_ZP1; -_viv_uniform float output_ZP; -_viv_uniform float outputScale; - -__kernel void pow_U8F16toF16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_uchar8 src0; - vxc_short8 src1; - vxc_short8 dst; - vxc_half8 data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in0_zp; - _viv_asm(COPY, in0_zp, input_ZP0, 4); - 
VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_U8F16toF16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_uchar8 src0; - vxc_short8 src1; - vxc_short8 dst; - vxc_half8 data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in0_zp; - _viv_asm(COPY, in0_zp, input_ZP0, 4); - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_U8F16toU8( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_uchar8 src0; - vxc_short8 src1; - vxc_uchar8 dst; - vxc_half8 data1; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in0_zp; - _viv_asm(COPY, in0_zp, input_ZP0, 4); - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_U8F16toU8_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_uchar8 src0; - vxc_short8 src1; - vxc_uchar8 dst; - vxc_half8 data1; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, data1, src1, 16); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in0_zp; - _viv_asm(COPY, in0_zp, input_ZP0, 4); - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2); - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); - - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_U8U8toU8( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_uchar8 src0, src1, dst; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in0_zp, in1_zp; - _viv_asm(COPY, in0_zp, input_ZP0, 4); - _viv_asm(COPY, in1_zp, input_ZP1, 4); - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2); - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_U8U8toU8_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_uchar8 src0, src1, dst; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in0_zp, in1_zp; - _viv_asm(COPY, in0_zp, input_ZP0, 4); - _viv_asm(COPY, in1_zp, input_ZP1, 4); - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2); - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP); - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_U8U8toF16( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - vxc_uchar8 src0; - vxc_uchar8 src1; - vxc_short8 dst; - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in0_zp, in1_zp; - _viv_asm(COPY, in0_zp, input_ZP0, 4); - _viv_asm(COPY, in1_zp, input_ZP1, 4); - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - vxc_half8 tmpVal; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(tmpVal, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpVal, 16); - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pow_U8U8toF16_2D( - image2d_array_t input0, - image2d_array_t input1, - image2d_array_t output) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - vxc_uchar8 src0; - vxc_uchar8 src1; - vxc_short8 dst; - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - float4 x0, x1; - float4 y0, y1; - float4 tmpDst0, tmpDst1; - short in0_zp, in1_zp; - _viv_asm(COPY, in0_zp, input_ZP0, 4); - _viv_asm(COPY, in1_zp, input_ZP1, 4); - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4); - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4); - - float4 s0 = sign(x0); - float4 s1 = sign(x1); - int4 t0 = convert_int4(y0) & 1; - int4 t1 = convert_int4(y1) & 1; - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0))); - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1))); - - half4 tmpVal0, tmpVal1; - vxc_half8 tmpVal; - _viv_asm(CONV, tmpVal0, tmpDst0); - _viv_asm(CONV, tmpVal1, tmpDst1); - VXC_DP2x8(tmpVal, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpVal, 16); - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx index 602f6f5..5cb3ebb 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_0.vx @@ -11,20 +11,28 @@ _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform float output_scale; _viv_uniform float output_zp; -#define RESIZE_BILINEAR_4X1(input, mean, output) \ - VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ - \ - VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ +#define RESIZE_BILINEAR_4X1(mean, output) \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.w; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + coord_in.x = coord.x; \ \ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ uniVecShift10); \ @@ -52,9 +60,7 @@ _viv_uniform float output_zp; #define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ ( \ - __read_only image2d_array_t input0, \ - __read_only image2d_array_t input1, \ - __read_only image2d_array_t input2, \ + __read_only image2d_array_t input, \ __write_only image2d_array_t output0, \ __write_only image2d_array_t output1, \ __write_only image2d_array_t output2, \ @@ -96,23 +102,32 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ vxc_uchar16 line0Y; \ vxc_uchar16 line1Y; \ int4 coord; \ + int4 coord_in = (int4)(0, 0, 0, 0); \ sx = sx + *xOffset; \ - coord.xyz = sx.xyz; \ - coord.w = sy + *yOffset; 
\ - int2 coord1 = (int2)(sx.w, coord.w); \ - VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ - \ - VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + coord = sx.xyzw; \ + coord_in.y = sy + *yOffset; \ + coord_in.x = coord.x; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.w; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + coord_in.x = coord.x; \ \ int4 test01, temp1; \ int4 test02, temp2; \ @@ -151,8 +166,8 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ VXC_WriteImage(output0, coord_out, dst, \ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - RESIZE_BILINEAR_4X1(input1, gMean, output1) \ - RESIZE_BILINEAR_4X1(input2, bMean, output2) \ + RESIZE_BILINEAR_4X1(gMean, output1) \ + RESIZE_BILINEAR_4X1(bMean, output2) \ } PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8) PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8) @@ -160,9 +175,7 @@ PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8) #define PRE_PROCESS_RGB888_PLANAR_8BITS(dst_name, write_type) \ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ ( \ - __read_only image2d_array_t input0, \ - __read_only image2d_array_t input1, \ - __read_only image2d_array_t input2, \ + __read_only image2d_array_t input, \ __write_only image2d_array_t output0, \ __write_only image2d_array_t output1, \ __write_only image2d_array_t output2, \ @@ -205,18 +218,25 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ coord.xyz = sx.xyz; \ coord.w = sy + *yOffset; \ int2 coord1 = (int2)(sx.w, coord.w); \ - VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ - \ - VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + int4 coord_in = (int4)(coord.xw, 0, 0); \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ 
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord1.x; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ \ int4 test01, temp1; \ @@ -252,18 +272,26 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ \ VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - VXC_ReadImage(line0Y, input1, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input1, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input1, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input1, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ - \ - VXC_ReadImage(line1Y, input1, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + coord_in.x = coord.x; \ + coord_in.z = 1; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input1, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input1, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input1, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord1.x; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ \ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ @@ -289,18 +317,26 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \ \ VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ \ - VXC_ReadImage(line0Y, input2, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input2, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line0Y, input2, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - 
VXC_ReadImage(line0Y, input2, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ - \ - VXC_ReadImage(line1Y, input2, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + coord_in.x = coord.x; \ + coord_in.z = 2; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input2, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.y; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input2, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord.z; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(line1Y, input2, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = coord1.x; \ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ \ VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx index 5a9942c..b0714e4 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_1.vx @@ -1,12 +1,4 @@ -/* - ============================================================================ - Name : GrayScale.vx - Author : Sam - Version : - Copyright : Your copyright notice - Description : - ============================================================================ - */ + #include "cl_viv_vx_ext.h" _viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8; @@ -18,9 +10,7 @@ _viv_uniform float output_zp; #define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ ( \ - __read_only image2d_array_t input0, \ - __read_only image2d_array_t input1, \ - __read_only image2d_array_t input2, \ + __read_only image2d_array_t input, \ __write_only image2d_array_t output0, \ __write_only image2d_array_t output1, \ __write_only image2d_array_t output2, \ @@ -40,9 +30,12 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ vxc_uchar16 src0, src1, src2; \ dst_type dst0, dst1; \ \ - VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + int4 coord_in = (int4)(coord.xy, 0, 0); \ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ coord.x = coord.z + 8; \ float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * 
output_scale - output_zp, \ @@ -90,9 +83,7 @@ PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8) #define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ ( \ - __read_only image2d_array_t input0, \ - __read_only image2d_array_t input1, \ - __read_only image2d_array_t input2, \ + __read_only image2d_array_t input, \ __write_only image2d_array_t output0, \ __write_only image2d_array_t output1, \ __write_only image2d_array_t output2, \ @@ -112,9 +103,12 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \ vxc_uchar16 src0, src1, src2; \ write_type dst; \ \ - VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + int4 coord_in = (int4)(coord.xy, 0, 0); \ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord_in.z ++; \ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \ rMean * output_scale - output_zp, output_scale); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx index a82a3ba..1ac60fe 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_2.vx @@ -8,9 +8,7 @@ _viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4; __kernel void pre_process_rgb888_planar_4over3_U8toU8 ( - __read_only image2d_array_t input0, - __read_only image2d_array_t input1, - __read_only image2d_array_t input2, + __read_only image2d_array_t input, __write_only image2d_array_t output0, __write_only image2d_array_t output1, __write_only image2d_array_t output2, @@ -24,17 +22,21 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8 float f32Var ) { - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0); int4 coord_out; vxc_uchar16 src0, src1, src2, src3; vxc_uchar16 dst0, dst1, dst2; - VXC_ReadImage(src0, input0, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input0, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src2, input0, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src3, input0, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.z ++; coord_out.xy = (coord_in.xy >> 2) * 3; coord_out.zw = coord_in.yy + (int2)(1, 
2); @@ -51,10 +53,15 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8 VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src0, input1, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src2, input1, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src3, input1, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.z ++; VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8); VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8); @@ -69,10 +76,14 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8 VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src0, input2, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input2, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src2, input2, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src3, input2, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8); VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8); @@ -90,9 +101,7 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8 __kernel void pre_process_rgb888_planar_half_U8toU8 ( - __read_only image2d_array_t input0, - __read_only image2d_array_t input1, - __read_only image2d_array_t input2, + __read_only image2d_array_t input, __write_only image2d_array_t output0, __write_only image2d_array_t output1, __write_only image2d_array_t output2, @@ -106,17 +115,22 @@ __kernel void pre_process_rgb888_planar_half_U8toU8 float f32Var ) { - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0); vxc_uchar16 src0, src1, src2; - VXC_ReadImage(src0, input0, coord_in.xy, 0, VXC_MODIFIER(0, 
15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, input1, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src2, input2, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src0, input, coord_in, 0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.z ++; + VXC_ReadImage2DArray(src1, input, coord_in, 0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_in.z ++; + VXC_ReadImage2DArray(src2, input, coord_in, 0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_in.zw = coord_in.xy >> 1; + int2 coord = coord_in.xy >> 1; - VXC_WriteImage(output0, coord_in.zw, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output1, coord_in.zw, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage(output2, coord_in.zw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output0, coord, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output1, coord, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output2, coord, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_0.vx new file mode 100644 index 0000000..107846e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_0.vx @@ -0,0 +1,330 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniVecShift10; +_viv_uniform VXC_512Bits uniAddRShift; +_viv_uniform VXC_512Bits uniGetTempVal; +_viv_uniform VXC_512Bits uniExtractBytes; + +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define RESIZE_BILINEAR_4X1(input, mean, output) \ + VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + tmp_dst = tmp_dst * 
output_scale - mean * output_scale + output_zp; \ + _viv_asm(CONV, dst0, tmp_dst); \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst1, 8); \ + VXC_WriteImage(output, coord_out, dst, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + +#define RGB888_PLANAR_SEP_16BITS(dst_name, dst_type, conv_type, copy_type) \ +__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __read_only image2d_array_t input2, \ + __write_only image2d_array_t output0, \ + __write_only image2d_array_t output1, \ + __write_only image2d_array_t output2, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float f32Var \ + ) \ +{ \ + int2 ratioXY = (int2)(*xRatio, *yRatio); \ + \ + int4 xPos = get_global_id(0); \ + int yPos = get_global_id(1); \ + \ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \ + xPos += (int4)(0, 1, 2, 3); \ + \ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \ + int4 sx = fx0 & 0xffff8000; \ + fx0 -= sx; \ + sx = sx >> 15; \ + \ + vxc_short4 fx; \ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAddRShift); \ + \ + int fy = yPos * ratioXY.y + ratioSufXY.y; \ + int sy = fy & 0xffff8000; \ + \ + fy -= sy; \ + sy = sy >> 15; \ + \ + fy = (fy + (1<< 4)) >> 5; \ + \ + vxc_uchar16 line0Y; \ + vxc_uchar16 line1Y; \ + int4 coord; \ + sx = sx + *xOffset; \ + coord.xyz = sx.xyz; \ + coord.w = sy + *yOffset; \ + int2 coord1 = (int2)(sx.w, coord.w); \ + VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 test01, temp1; \ + int4 test02, temp2; \ + int4 tt; \ + vxc_uchar4 val; \ + int2 coord_out = (int2)(xPos.x, yPos); \ + \ + vxc_uchar8 line1, line2; \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + \ + vxc_float4 tmp_dst; \ + vxc_uchar4 u8_dst; \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + \ + conv_type dst0; \ + dst_type dst1; \ + copy_type dst; \ + tmp_dst = tmp_dst * output_scale 
- rMean * output_scale + output_zp; \ + _viv_asm(CONV, dst0, tmp_dst); \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst1, 8); \ + VXC_WriteImage(output0, coord_out, dst, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + RESIZE_BILINEAR_4X1(input1, gMean, output1) \ + RESIZE_BILINEAR_4X1(input2, bMean, output2) \ +} +RGB888_PLANAR_SEP_16BITS(F16, vxc_half8, half4, vxc_short8) +RGB888_PLANAR_SEP_16BITS(I16, vxc_short8, int4, vxc_short8) + +#define RGB888_PLANAR_SEP_8BITS(dst_name, write_type) \ +__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __read_only image2d_array_t input2, \ + __write_only image2d_array_t output0, \ + __write_only image2d_array_t output1, \ + __write_only image2d_array_t output2, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float f32Var \ + ) \ +{ \ + int2 ratioXY = (int2)(*xRatio, *yRatio); \ + int4 xPos = get_global_id(0); \ + int yPos = get_global_id(1); \ + \ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \ + xPos += (int4)(0, 1, 2, 3); \ + \ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \ + int4 sx = fx0 & 0xffff8000; \ + fx0 -= sx; \ + sx = sx >> 15; \ + \ + vxc_short4 fx; \ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \ + \ + int fy = yPos * ratioXY.y + ratioSufXY.y; \ + int sy = fy & 0xffff8000; \ + \ + fy -= sy; \ + sy = sy >> 15; \ + fy = (fy + (1<< 4)) >> 5; \ + \ + vxc_uchar16 line0Y; \ + vxc_uchar16 line1Y; \ + int4 coord; \ + sx = sx + *xOffset; \ + coord.xyz = sx.xyz; \ + coord.w = sy + *yOffset; \ + int2 coord1 = (int2)(sx.w, coord.w); \ + VXC_ReadImage(line0Y, input0, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input0, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input0, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input0, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int4 test01, temp1; \ + int4 test02, temp2; \ + int2 coord_out = (int2)(xPos.x, yPos); \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + \ + vxc_float4 tmp_dst; \ + vxc_uchar4 u8_dst; \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + 
uniConvertIntergetoF32_4x4); \ + \ + int4 dst0; \ + write_type dst; \ + tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \ + dst0 = convert_int4_rte(tmp_dst); \ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + \ + VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line0Y, input1, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input1, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input1, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input1, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input1, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input1, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input1, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input1, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + tmp_dst = tmp_dst * output_scale - gMean * output_scale + output_zp; \ + dst0 = convert_int4_rte(tmp_dst); \ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + \ + VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line0Y, input2, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input2, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input2, coord.zw, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line0Y, input2, coord1, 0, VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(line1Y, input2, coord.xw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input2, coord.yw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input2, coord.zw, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(line1Y, input2, coord1, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniGetTempVal); \ + temp1 = temp1 + test01; \ + \ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniVecShift10); \ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + 
uniGetTempVal); \ + temp2 = temp2 + test02; \ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniExtractBytes); \ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvertIntergetoF32_4x4); \ + tmp_dst = tmp_dst * output_scale - bMean * output_scale + output_zp; \ + dst0 = convert_int4_rte(tmp_dst); \ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \ + uniExtract8Data_2x8); \ + \ + VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +RGB888_PLANAR_SEP_8BITS(U8, vxc_uchar16) +RGB888_PLANAR_SEP_8BITS(I8, vxc_char16) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx new file mode 100644 index 0000000..ff55851 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_1.vx @@ -0,0 +1,143 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8; +_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8; + +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define RGB888_PLANAR_SEP_COPY_16BITS(dst_name, dst_type, copy_type) \ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __read_only image2d_array_t input2, \ + __write_only image2d_array_t output0, \ + __write_only image2d_array_t output1, \ + __write_only image2d_array_t output2, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float f32Var \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + coord.xy += (int2)(*xOffset, *yOffset); \ + vxc_uchar16 src0, src1, src2; \ + dst_type dst0, dst1; \ + \ + VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + coord.x = coord.z + 8; \ + float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \ + rMean * output_scale - output_zp, output_scale); \ + \ + half4 paramData_f16; \ + copy_type tmp_dst; \ + _viv_asm(CONV, paramData_f16, paramData0); \ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevLo_2x8); \ + VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevHi_2x8); \ + _viv_asm(COPY, tmp_dst, dst0, 16); \ + VXC_WriteImage(output0, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, tmp_dst, dst1, 16); \ + VXC_WriteImage(output0, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \ + gMean * output_scale - output_zp, output_scale); \ + _viv_asm(CONV, paramData_f16, paramData1); \ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevLo_2x8); \ + VXC_DP2x8(dst1, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevHi_2x8); \ + _viv_asm(COPY, 
tmp_dst, dst0, 16); \ + VXC_WriteImage(output1, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, tmp_dst, dst1, 16); \ + VXC_WriteImage(output1, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \ + bMean * output_scale - output_zp, output_scale); \ + _viv_asm(CONV, paramData_f16, paramData2); \ + VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevLo_2x8); \ + VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniDataMeanStddevHi_2x8); \ + _viv_asm(COPY, tmp_dst, dst0, 16); \ + VXC_WriteImage(output2, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, tmp_dst, dst1, 16); \ + VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +RGB888_PLANAR_SEP_COPY_16BITS(F16, vxc_half8, vxc_short8) +RGB888_PLANAR_SEP_COPY_16BITS(I16, vxc_short8, vxc_short8) + +#define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __read_only image2d_array_t input2, \ + __write_only image2d_array_t output0, \ + __write_only image2d_array_t output1, \ + __write_only image2d_array_t output2, \ + global int *xRatio, \ + global int *yRatio, \ + global int *xOffset, \ + global int *yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float f32Var \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ + \ + coord.xy += (int2) (*xOffset, *yOffset); \ + vxc_uchar16 src0, src1, src2; \ + write_type dst; \ + \ + VXC_ReadImage(src0, input0, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input1, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src2, input2, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \ + rMean * output_scale - output_zp, output_scale); \ + \ + half4 paramData_f16; \ + _viv_asm(CONV, paramData_f16, paramData0); \ + \ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevLo_2x8); \ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevHi_2x8); \ + VXC_WriteImage(output0, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \ + gMean * output_scale - output_zp, output_scale); \ + _viv_asm(CONV, paramData_f16, paramData1); \ + \ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevLo_2x8); \ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevHi_2x8); \ + VXC_WriteImage(output1, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \ + bMean * output_scale - output_zp, output_scale); \ + _viv_asm(CONV, paramData_f16, paramData2); \ + \ + VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevLo_2x8); \ + VXC_DP2x8(dst, 
src2, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uniDataMeanStddevHi_2x8); \ + VXC_WriteImage(output2, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ +} +PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16) +PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_2.vx new file mode 100644 index 0000000..bbfed6e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb888_planar_sep_2.vx @@ -0,0 +1,122 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8; +_viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8; +_viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4; +_viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4; +_viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4; + +__kernel void pre_process_rgb888_planar_sep_4over3_U8toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __read_only image2d_array_t input2, + __write_only image2d_array_t output0, + __write_only image2d_array_t output1, + __write_only image2d_array_t output2, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float rMean, + float gMean, + float bMean, + float f32Var + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + int4 coord_out; + + vxc_uchar16 src0, src1, src2, src3; + vxc_uchar16 dst0, dst1, dst2; + + VXC_ReadImage(src0, input0, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input0, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src2, input0, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src3, input0, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_out.xy = (coord_in.xy >> 2) * 3; + coord_out.zw = coord_in.yy + (int2)(1, 2); + + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8); + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); + + VXC_WriteImage(output0, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(src0, input1, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input1, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src2, input1, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, 
VXC_RM_TowardZero, 0)); + VXC_ReadImage(src3, input1, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8); + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); + + VXC_WriteImage(output1, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + + VXC_ReadImage(src0, input2, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input2, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src2, input2, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src3, input2, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8); + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); + + VXC_WriteImage(output2, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output2, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pre_process_rgb888_planar_sep_half_U8toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __read_only image2d_array_t input2, + __write_only image2d_array_t output0, + __write_only image2d_array_t output1, + __write_only image2d_array_t output2, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float rMean, + float gMean, + float bMean, + float f32Var + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_uchar16 src0, src1, src2; + + VXC_ReadImage(src0, input0, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); 
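+    /* the G and B planes are read from input1/input2 the same way below; writing the
+       even-indexed pixels (s02468ace) at the halved coordinate (coord_in >> 1) downsamples
+       each plane by 2x */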
+ VXC_ReadImage(src1, input1, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src2, input2, coord_in.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_in.zw = coord_in.xy >> 1; + + VXC_WriteImage(output0, coord_in.zw, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output1, coord_in.zw, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output2, coord_in.zw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_nhwc_bound.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_nhwc_bound.vx new file mode 100644 index 0000000..f0303f4 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_nhwc_bound.vx @@ -0,0 +1,153 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniResize_x2_nhwc2_0_4x8; +_viv_uniform int2 x_coord; + +__kernel void resize_bilinear_nhwc_bound_U8toU8_2x + ( + __read_only image2d_array_t input, + image2d_array_t output, + __write_only image2d_array_t output1 + ) +{ + int4 coord_out = (int4)(1, get_global_id(0), get_global_id(0), get_global_id(0)); + int2 coord_in = (int2)(1, get_global_id(0)); + coord_in.y = ((coord_out.y * 2 - 1) >> 2); + coord_in.y = coord_out.y == 0 ? -1 : coord_in.y; + + vxc_uchar16 in0, in1, in2, in3, result; + + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_out.z = coord_out.y + 1; + + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8); + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); + VXC_DP4x8(result, in2, in1, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8); + VXC_WriteImage(output, coord_out.xz, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); + + coord_in.x = x_coord.x; + + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_out.x = x_coord.y; + + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8); + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); + VXC_DP4x8(result, in2, in1, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8); + VXC_WriteImage(output, coord_out.xz, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l10_4x4; +__kernel void resize_bilinear_nhwc_bound_U8toU8_3x + ( + __read_only image2d_array_t input, + image2d_array_t output, + __write_only image2d_array_t output1 + ) +{ + int4 coord_out = (int4)(1, get_global_id(0), get_global_id(0), get_global_id(0)); + int2 coord_in = (int2)(1, get_global_id(0)); + coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6; + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y; + + vxc_uchar16 in0, in1, in2, in3, result; + + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_out.zw = coord_out.yy + (int2)(1, 2); + + VXC_DP4x4(result, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4); + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xz, in1, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); + VXC_DP4x4(result, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4); + VXC_WriteImage(output, coord_out.xw, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); + + coord_in.x = x_coord.x; + + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_out.x = x_coord.y; + + VXC_DP4x4(result, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4); + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xz, in1, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); + VXC_DP4x4(result, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4); + VXC_WriteImage(output, coord_out.xw, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l00_4x8; +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l10_4x8; +__kernel void resize_bilinear_nhwc_bound_U8toU8_4x + ( + __read_only image2d_array_t input, + image2d_array_t output, + __write_only image2d_array_t output1 + ) +{ + int4 coord_out = (int4)(1, get_global_id(0), get_global_id(0), get_global_id(0)); + int2 coord_in = (int2)(1, get_global_id(0)); + coord_in.y = (coord_out.y * 2 - 3) >> 3; + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y; + + vxc_uchar16 in0, in1, in2, in3, dst0, dst1; + + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_out.z = coord_out.y + 1; + + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8); + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_out.x += 2; + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_out.x -= 2; + + coord_out.zw = coord_out.zz + (int2)(1, 2); + VXC_DP4x8(dst0, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8); + VXC_DP4x8(dst1, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8); + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_out.x += 2; + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + coord_in.x = x_coord.x; + + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_out.x = x_coord.y; + coord_out.z = coord_out.y + 1; + + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8); + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_out.x -= 2; + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_out.x += 2; + + coord_out.zw = coord_out.zz + (int2)(1, 2); + VXC_DP4x8(dst0, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8); + VXC_DP4x8(dst1, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8); + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + coord_out.x -= 2; + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/select.vx b/src/tim/vx/internal/src/libnnext/ops/vx/select.vx index 8553903..ce788a4 100644 --- 
a/src/tim/vx/internal/src/libnnext/ops/vx/select.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/select.vx @@ -1,24 +1,25 @@ #include "cl_viv_vx_ext.h" _viv_uniform VXC_512Bits uniConvConditiontoDst_2x8; -_viv_uniform VXC_512Bits uniConvIntIn0toDst_2x8; -_viv_uniform VXC_512Bits uniConvIntIn1toDst_2x8; -_viv_uniform VXC_512Bits uniU8SubZP_MulM_PStoF16In0_2x8; -_viv_uniform VXC_512Bits uniU8SubZP_MulM_PStoF16In1_2x8; -_viv_uniform int input0Zp; -_viv_uniform int input1Zp; -_viv_uniform int outputZP; -_viv_uniform VXC_512Bits uniU8AddZP_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8; +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp #define SELECT_INT(type_name, read_fun, write_fun) \ - type_name tmp, src0, src1, dst, value; \ + type_name src0, src1, dst, value; \ vxc_char8 value_tmp; \ - read_fun(tmp, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ + vxc_ushort8 mp0, mp1; \ + _viv_asm(COPY, mp0, multAndoutZP0, 16); \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_DP2x8(src0, tmp, tmp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvIntIn0toDst_2x8); \ - read_fun(tmp, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_DP2x8(src1, tmp, tmp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvIntIn1toDst_2x8); \ + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniU8MulAndPostShift0_Lo_2x8); \ + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniU8MulAndPostShift1_Lo_2x8); \ read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ VXC_DP2x8(value, value_tmp, value_tmp,\ @@ -38,6 +39,7 @@ __kernel void select_##cond_name##_##src_name##_##src_name##to##dst_name( \ } SELECT_INT_FUN(I8, I8, I8, vxc_char8) +SELECT_INT_FUN(I8, U8, U8, vxc_uchar8) SELECT_INT_FUN(I8, I16, I16, vxc_short8) #define SELECT_INT_FUN_2D(cond_name, src_name, dst_name, type_name) \ @@ -52,6 +54,7 @@ __kernel void select_##cond_name##_##src_name##_##src_name##to##dst_name##_2D( \ } SELECT_INT_FUN_2D(I8, I8, I8, vxc_char8) +SELECT_INT_FUN_2D(I8, U8, U8, vxc_uchar8) SELECT_INT_FUN_2D(I8, I16, I16, vxc_short8) #define SELECT_HALF(read_fun, write_fun) \ @@ -88,45 +91,109 @@ __kernel void select_I8_F16_F16toF16_2D( SELECT_HALF(VXC_ReadImage, VXC_WriteImage) } -#define SELECT_U8(read_fun, write_fun) \ - vxc_uchar8 tmp, src0, src1, dst; \ - vxc_char8 value; \ - vxc_half8 tmp1; \ - vxc_uchar16 input0_ZP, input1_ZP, output_ZP; \ - _viv_asm(COPY, input0_ZP, input0Zp, 4); \ - _viv_asm(COPY, input1_ZP, input1Zp, 4); \ - _viv_asm(COPY, output_ZP, outputZP, 4); \ - read_fun(tmp, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ +#define SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, read_fun, write_fun) \ + vxc_short8 src0, src1, dst, value; \ + vxc_half8 value0, value1; \ + src0_type r0; \ + src1_type r1; \ + copy0_type v0; \ + copy1_type v1; \ + vxc_char8 value_tmp; \ + vxc_ushort8 mp0, mp1; \ + _viv_asm(COPY, mp0, multAndoutZP0, 16); \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_DP2x8(tmp1, tmp, input0_ZP, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0),\ - uniU8SubZP_MulM_PStoF16In0_2x8); \ - VXC_DP2x8(src0, tmp1, output_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8AddZP_2x8); \ - read_fun(tmp, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ + _viv_asm(COPY, v0, src0, 16); \ + read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - VXC_DP2x8(tmp1, tmp, input1_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniU8SubZP_MulM_PStoF16In1_2x8); \ - VXC_DP2x8(src1, tmp1, output_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8AddZP_2x8); \ - read_fun(value, condition, coord, VXC_5BITOFFSET_XY(0, 0), \ + _viv_asm(COPY, v1, src1, 16); \ + VXC_DP2x8(value0, v0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniU8MulAndPostShift0_Lo_2x8); \ + _viv_asm(COPY, src0, value0, 16); \ + VXC_DP2x8(value1, v1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniU8MulAndPostShift1_Lo_2x8); \ + _viv_asm(COPY, src1, value1, 16); \ + read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(value, value_tmp, value_tmp,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \ dst = (value != 0 ? src0 : src1); \ write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -__kernel void select_I8_U8_U8toU8( +#define SELECT_HYBRID_TOF16_FUN(name, src0_type, copy0_type, src1_type, copy1_type) \ +__kernel void select_##name( \ + __read_only image2d_array_t condition, \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, \ + VXC_ReadImage2DArray, VXC_WriteImage2DArray) \ +} +SELECT_HYBRID_TOF16_FUN(I8_F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16) +SELECT_HYBRID_TOF16_FUN(I8_U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8) +SELECT_HYBRID_TOF16_FUN(I8_F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16) +SELECT_HYBRID_TOF16_FUN(I8_I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8) +SELECT_HYBRID_TOF16_FUN(I8_F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +SELECT_HYBRID_TOF16_FUN(I8_I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8) + +#define SELECT_HYBRID_TOF16_FUN_2D(name, src0_type, copy0_type, src1_type, copy1_type) \ +__kernel void select_##name( \ + __read_only image2d_array_t condition, \ + __read_only image2d_array_t input0, \ + __read_only image2d_array_t input1, \ + __write_only image2d_array_t output) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, \ + VXC_ReadImage, VXC_WriteImage) \ +} +SELECT_HYBRID_TOF16_FUN_2D(I8_F16_U8toF16_2D, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16) +SELECT_HYBRID_TOF16_FUN_2D(I8_U8_F16toF16_2D, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8) +SELECT_HYBRID_TOF16_FUN_2D(I8_F16_I8toF16_2D, vxc_short8, vxc_half8, vxc_char16, vxc_char16) +SELECT_HYBRID_TOF16_FUN_2D(I8_I8_F16toF16_2D, vxc_char16, vxc_char16, vxc_short8, vxc_half8) +SELECT_HYBRID_TOF16_FUN_2D(I8_F16_I16toF16_2D, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +SELECT_HYBRID_TOF16_FUN_2D(I8_I16_F16toF16_2D, vxc_short8, vxc_short8, vxc_short8, vxc_half8) + +#define SELECT_HALF_TO_QINT(read_fun, write_fun, dst_type) \ + vxc_short8 src0, src1, tmp_dst, value; \ + vxc_half8 data; \ + dst_type dst; 
\ + vxc_char8 value_tmp; \ + read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(value, value_tmp, value_tmp,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \ + tmp_dst = (value != 0 ? src0 : src1); \ + _viv_asm(COPY, data, tmp_dst, 16); \ + vxc_ushort8 mp0; \ + _viv_asm(COPY, mp0, multAndoutZP0, 16); \ + VXC_DP2x8(dst, data, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift0_Lo_2x8); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + +__kernel void select_I8_F16_F16toU8( __read_only image2d_array_t condition, __read_only image2d_array_t input0, __read_only image2d_array_t input1, __write_only image2d_array_t output) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - SELECT_U8(VXC_ReadImage2DArray, VXC_WriteImage2DArray) + SELECT_HALF_TO_QINT(VXC_ReadImage2DArray, VXC_WriteImage2DArray, vxc_uchar16) } -__kernel void select_I8_U8_U8toU8_2D( +__kernel void select_I8_F16_F16toU8_2D( __read_only image2d_array_t condition, __read_only image2d_array_t input0, __read_only image2d_array_t input1, __write_only image2d_array_t output) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); - SELECT_U8(VXC_ReadImage, VXC_WriteImage) + SELECT_HALF_TO_QINT(VXC_ReadImage, VXC_WriteImage, vxc_uchar16) } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/slice.vx b/src/tim/vx/internal/src/libnnext/ops/vx/slice.vx index 5717266..ff07885 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/slice.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/slice.vx @@ -174,7 +174,7 @@ __kernel void slice_##name0##_I32to##name1##_2D \ SLICE_8BITSTO8BITS_2D(I8, I8, vxc_char16, vxc_char16) SLICE_8BITSTO8BITS_2D(U8, U8, vxc_uchar16, vxc_uchar16) -#define SLICE_16BITS_TO(name0, name1, src_type, copy_type, dst_type) \ +#define SLICE_16BITS_TO(name0, name1, src_type, copy_type, dst_type, save_type) \ __kernel void slice_##name0##_I32to##name1 \ ( \ __read_only image2d_array_t input0, \ @@ -186,7 +186,7 @@ __kernel void slice_##name0##_I32to##name1 \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ src_type src; \ copy_type src0; \ - dst_type dst; \ + dst_type result; \ int4 coord_in; \ Image begin_img = create_image_from_image2d(input1, 4); \ uchar* begin_ptr = begin_img.ptr; \ @@ -198,15 +198,19 @@ __kernel void slice_##name0##_I32to##name1 \ \ vxc_ushort8 multiplier; \ _viv_asm(COPY, multiplier, multAndoutZP, 16); \ - VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + VXC_DP2x8(result, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ uniU8MulAndPostShift_Lo_2x8); \ + save_type dst; \ + _viv_asm(COPY, dst, result, 16); \ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ } -SLICE_16BITS_TO(F16, I8, vxc_half8, vxc_short8, vxc_char16) -SLICE_16BITS_TO(F16, U8, vxc_half8, vxc_short8, vxc_uchar16) -SLICE_16BITS_TO(F16, I16, vxc_half8, vxc_short8, vxc_short8) +SLICE_16BITS_TO(F16, I8, vxc_half8, vxc_short8, vxc_char16, vxc_char16) +SLICE_16BITS_TO(F16, U8, vxc_half8, vxc_short8, vxc_uchar16, vxc_uchar16) +SLICE_16BITS_TO(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8) +SLICE_16BITS_TO(I16, I16, 
vxc_short8, vxc_short8, vxc_short8, vxc_short8) +SLICE_16BITS_TO(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_short8) -#define SLICE_16BITS_TO_2D(name0, name1, src_type, copy_type, dst_type) \ +#define SLICE_16BITS_TO_2D(name0, name1, src_type, copy_type, dst_type, save_type) \ __kernel void slice_##name0##_I32to##name1##_2D \ ( \ __read_only image2d_array_t input0, \ @@ -218,7 +222,7 @@ __kernel void slice_##name0##_I32to##name1##_2D \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ src_type src; \ copy_type src0; \ - dst_type dst; \ + dst_type result; \ int2 coord_in; \ Image begin_img = create_image_from_image2d(input1, 4); \ uchar* begin_ptr = begin_img.ptr; \ @@ -230,10 +234,14 @@ __kernel void slice_##name0##_I32to##name1##_2D \ \ vxc_ushort8 multiplier; \ _viv_asm(COPY, multiplier, multAndoutZP, 16); \ - VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + VXC_DP2x8(result, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ uniU8MulAndPostShift_Lo_2x8); \ + save_type dst; \ + _viv_asm(COPY, dst, result, 16); \ VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ } -SLICE_16BITS_TO_2D(F16, I8, vxc_half8, vxc_short8, vxc_char16) -SLICE_16BITS_TO_2D(F16, U8, vxc_half8, vxc_short8, vxc_uchar16) -SLICE_16BITS_TO_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8) \ No newline at end of file +SLICE_16BITS_TO_2D(F16, I8, vxc_half8, vxc_short8, vxc_char16, vxc_char16) +SLICE_16BITS_TO_2D(F16, U8, vxc_half8, vxc_short8, vxc_uchar16, vxc_uchar16) +SLICE_16BITS_TO_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8) +SLICE_16BITS_TO_2D(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8) +SLICE_16BITS_TO_2D(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index 0dd28ed..2aedbce 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -2967,6 +2967,846 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(\n\ \n\ "; /* end of conv1d_ovxlib_k1024_vx*/ +static const char cumsum_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;\n\ +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int channel;\n\ +_viv_uniform int input_zp;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float in_out_zp_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +__kernel void cumsum_F16toF16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int 
rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_AXIS2(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_##in_name##to##out_name##_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ + \\\n\ + for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_8BITS_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_AXIS2(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void cumsum_I16toI16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ +\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float 
tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_F16toF16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_AXIS1(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_##in_name##to##out_name##_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_8BITS_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_AXIS1(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void 
cumsum_I16toI16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_F16toF16_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, tmpsum, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + for(; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_QINT_AXIS0(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_##in_name##to##out_name##_axis0( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + vxc_short8 rowSum; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0); \\\n\ + short zp = (short)input_zp; \\\n\ + \\\n\ + for(; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, 
sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32B_4x4); \\\n\ + \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +\n\ +CUMSUM_QINT_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_QINT_AXIS0(I8, I8, vxc_char16, vxc_char16)\n\ +CUMSUM_QINT_AXIS0(I16, I16, vxc_short8, vxc_short8)\n\ +"; /* end of cumsum_vx*/ + +static const char cumsum_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;\n\ +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int input_zp;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float in_out_zp_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +__kernel void cumsum_F16toF16_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ +\n\ + for(; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_AXIS1_2D(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_##in_name##to##out_name##_axis1_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0); \\\n\ + int4 sum1 = (int4)(0); \\\n\ + int4 sum2 = (int4)(0); \\\n\ + int4 sum3 = (int4)(0); \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32D_4x4); \\\n\ + \\\n\ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +\n\ +CUMSUM_8BITS_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_AXIS1_2D(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void cumsum_I16toI16_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_F16toF16_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, tmpsum, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + for(; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 
0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumHorzF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_QINT_AXIS0_2D(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_##in_name##to##out_name##_axis0_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + vxc_short8 rowSum; \\\n\ + int4 sum0, sum1; \\\n\ + sum0 ^= sum0; \\\n\ + sum1 ^= sum1; \\\n\ + short zp = (short)input_zp; \\\n\ + \\\n\ + for(; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSubZpI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumHorzI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumHorzI16toI32B_4x4); \\\n\ + \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +\n\ +CUMSUM_QINT_AXIS0_2D(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_QINT_AXIS0_2D(I8, I8, vxc_char16, vxc_char16)\n\ +CUMSUM_QINT_AXIS0_2D(I16, I16, vxc_short8, vxc_short8)\n\ +"; /* end of cumsum_2d_vx*/ + +static const char cumsum_bf16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int channel;\n\ +\n\ +__kernel void cumsum_BF16toBF16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float4 sum0 = (float4)(0), sum1 = (float4)(0);\n\ +\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ +\n\ + sum0 += data0;\n\ + sum1 += data1;\n\ + _viv_asm(COPY, dst0, sum0, 16);\n\ + _viv_asm(COPY, dst1, sum1, 16);\n\ + VXC_DP2x8(dst, 
dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_BF16toBF16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float4 sum0 = (float4)(0), sum1 = (float4)(0);\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ + sum0 += data0;\n\ + sum1 += data1;\n\ + _viv_asm(COPY, dst0, sum0, 16);\n\ + _viv_asm(COPY, dst1, sum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_BF16toBF16_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float preSum = 0;\n\ + float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ + float4 q = (float4)(1.0, 1.0, 1.0, 0);\n\ +\n\ + for(; coord.x < width; coord.x += 8)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ +\n\ + float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one));\n\ + float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one));\n\ + tmpSum1 += tmpSum0.w;\n\ +\n\ + tmpSum0 += preSum;\n\ + tmpSum1 += preSum;\n\ +\n\ + preSum = tmpSum1.w;\n\ +\n\ + _viv_asm(COPY, dst0, tmpSum0, 16);\n\ + _viv_asm(COPY, dst1, tmpSum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_BF16toBF16_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ +\n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float4 sum0 = (float4)(0), sum1 = (float4)(0);\n\ +\n\ + for(; coord.y < height; coord.y++)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ +\n\ + sum0 += data0;\n\ + sum1 += data1;\n\ +\n\ + _viv_asm(COPY, dst0, sum0, 16);\n\ + _viv_asm(COPY, dst1, sum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_BF16toBF16_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float preSum = 0;\n\ + float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ + float4 q = (float4)(1.0, 1.0, 1.0, 0);\n\ +\n\ + for(; coord.x < width; coord.x += 8)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ +\n\ + float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one));\n\ + float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one));\n\ + tmpSum1 += tmpSum0.w;\n\ +\n\ + tmpSum0 += preSum;\n\ + tmpSum1 += preSum;\n\ +\n\ + preSum = tmpSum1.w;\n\ +\n\ + _viv_asm(COPY, dst0, tmpSum0, 16);\n\ + _viv_asm(COPY, dst1, tmpSum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of cumsum_bf16_vx*/ + +static const char cumsum_f16_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int channel;\n\ +\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS2(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ + { \\\n\ + 
VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS2(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS2(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS2(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS1(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS1(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS1(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS1(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS0(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis0( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, tmpsum, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4); \\\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8); \\\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS0(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS0(I16, 
vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS0(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS1_2D(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis1_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS1_2D(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS1_2D(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS1_2D(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS0_2D(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis0_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, tmpsum, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16A_4x4); \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16B_4x4); \\\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16C_2x8); \\\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumHorzF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16)\n\ +"; /* end of cumsum_f16_u8_vx*/ + static const char custom_softmax_vx[] = "/*\n\ ============================================================================\n\ Name : Softmax2.vx\n\ @@ -5237,6 +6077,22 @@ float4 eltwise_unary_celu(float4 val)\n\ return val < 0 ? 
x : val;\n\ }\n\ \n\ +float4 eltwise_unary_rcp(float4 val)\n\ +{\n\ + return 1.0f / val;\n\ +}\n\ +\n\ +float4 eltwise_unary_sign(float4 val)\n\ +{\n\ + return sign(val);\n\ +}\n\ +\n\ +float4 eltwise_unary_softsign(float4 val)\n\ +{\n\ + float4 _rcp = 1.0f / (1.0f + fabs(val));\n\ + return val * _rcp;\n\ +}\n\ +\n\ _viv_uniform float inputScale;\n\ _viv_uniform float inputTail;\n\ _viv_uniform float outputScale;\n\ @@ -5281,83 +6137,6 @@ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;\n\ _viv_asm(COPY, dst, dst2, 16); \\\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -//EXP\n\ -ELTSISE_UNARY_2D(exp, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(exp, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(exp, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(exp, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(exp, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(exp, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(exp, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(exp, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(exp, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(exp, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//SIN\n\ -ELTSISE_UNARY_2D(sin, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(sin, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(sin, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(sin, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(sin, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(sin, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//COS\n\ -ELTSISE_UNARY_2D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//LOG\n\ -ELTSISE_UNARY_2D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(log, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, 
vxc_uchar8)\n\ -ELTSISE_UNARY_2D(log, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(log, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(log, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(log, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(log, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(log, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(log, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//SELU\n\ -ELTSISE_UNARY_2D(selu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(selu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(selu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(selu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(selu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(selu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(selu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(selu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(selu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(selu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//NEG\n\ -ELTSISE_UNARY_2D(neg, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(neg, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(neg, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(neg, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(neg, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(neg, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(neg, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(neg, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(neg, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(neg, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//CELU\n\ -ELTSISE_UNARY_2D(celu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(celu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(celu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(celu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(celu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_2D(celu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(celu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_2D(celu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_2D(celu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_2D(celu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ \n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ @@ -5392,21 +6171,39 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ VXC_DP2x8(dst, src0, src1, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ +#define ADD_ELTSISE_UNARY_2D(func_name) \\\n\ +ELTSISE_UNARY_2D(func_name, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) \\\n\ +ELTSISE_UNARY_2D(func_name, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) \\\n\ +ELTSISE_UNARY_2D(func_name, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) \\\n\ +ELTSISE_UNARY_2D(func_name, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) \\\n\ +ELTSISE_UNARY_2D(func_name, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) \\\n\ +ELTSISE_UNARY_2D(func_name, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) \\\n\ +ELTSISE_UNARY_2D(func_name, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) \\\n\ +ELTSISE_UNARY_2D(func_name, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) \\\n\ +ELTSISE_UNARY_2D(func_name, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) \\\n\ +ELTSISE_UNARY_2D(func_name, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) \\\n\ +ELTSISE_UNARY_BF16_2D(func_name)\n\ +\n\ //EXP\n\ -ELTSISE_UNARY_BF16_2D(exp)\n\ +ADD_ELTSISE_UNARY_2D(exp)\n\ //SIN\n\ -ELTSISE_UNARY_BF16_2D(sin)\n\ +ADD_ELTSISE_UNARY_2D(sin)\n\ //COS\n\ -ELTSISE_UNARY_BF16_2D(cos)\n\ +ADD_ELTSISE_UNARY_2D(cos)\n\ //LOG\n\ -ELTSISE_UNARY_BF16_2D(log)\n\ +ADD_ELTSISE_UNARY_2D(log)\n\ //SELU\n\ -ELTSISE_UNARY_BF16_2D(selu)\n\ +ADD_ELTSISE_UNARY_2D(selu)\n\ //NEG\n\ -ELTSISE_UNARY_BF16_2D(neg)\n\ +ADD_ELTSISE_UNARY_2D(neg)\n\ //CELU\n\ -ELTSISE_UNARY_BF16_2D(celu)\n\ -"; /* end of eltwise_unary_2d_1_vx*/ +ADD_ELTSISE_UNARY_2D(celu)\n\ +//RCP\n\ +ADD_ELTSISE_UNARY_2D(rcp)\n\ +//SIGN\n\ +ADD_ELTSISE_UNARY_2D(sign)\n\ +//SOFTSIGN\n\ +ADD_ELTSISE_UNARY_2D(softsign)"; /* end of eltwise_unary_2d_1_vx*/ static const char eltwise_unary_3d_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -5706,6 +6503,22 @@ float4 eltwise_unary_celu(float4 val)\n\ return val < 0 ? 
x : val;\n\ }\n\ \n\ +float4 eltwise_unary_rcp(float4 val)\n\ +{\n\ + return 1.0f / val;\n\ +}\n\ +\n\ +float4 eltwise_unary_sign(float4 val)\n\ +{\n\ + return sign(val);\n\ +}\n\ +\n\ +float4 eltwise_unary_softsign(float4 val)\n\ +{\n\ + float4 _rcp = 1.0f / (1.0f + fabs(val));\n\ + return val * _rcp;\n\ +}\n\ +\n\ _viv_uniform float inputScale;\n\ _viv_uniform float inputTail;\n\ _viv_uniform float outputScale;\n\ @@ -5750,83 +6563,6 @@ __kernel void func_name##_##src_type_name##to##dst_type_name( \\\n\ _viv_asm(COPY, dst, dst2, 16); \\\n\ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -//EXP\n\ -ELTSISE_UNARY_3D(exp, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(exp, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(exp, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(exp, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(exp, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(exp, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(exp, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(exp, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(exp, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(exp, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//SIN\n\ -ELTSISE_UNARY_3D(sin, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(sin, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(sin, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(sin, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(sin, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(sin, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//COS\n\ -ELTSISE_UNARY_3D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//LOG\n\ -ELTSISE_UNARY_3D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(log, F16, U8, vxc_short8, 
vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(log, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(log, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(log, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(log, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(log, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(log, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(log, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//SELU\n\ -ELTSISE_UNARY_3D(selu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(selu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(selu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(selu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(selu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(selu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(selu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(selu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(selu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(selu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//NEG\n\ -ELTSISE_UNARY_3D(neg, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(neg, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(neg, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(neg, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(neg, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(neg, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(neg, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(neg, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(neg, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(neg, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -//CELU\n\ -ELTSISE_UNARY_3D(celu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(celu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(celu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(celu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(celu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ -ELTSISE_UNARY_3D(celu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(celu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ -ELTSISE_UNARY_3D(celu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ -ELTSISE_UNARY_3D(celu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ -ELTSISE_UNARY_3D(celu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ \n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ @@ -5860,20 +6596,39 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ 
VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ +#define ADD_ELTSISE_UNARY_3D(func_name) \\\n\ +ELTSISE_UNARY_3D(func_name, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) \\\n\ +ELTSISE_UNARY_3D(func_name, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) \\\n\ +ELTSISE_UNARY_3D(func_name, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) \\\n\ +ELTSISE_UNARY_3D(func_name, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) \\\n\ +ELTSISE_UNARY_3D(func_name, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) \\\n\ +ELTSISE_UNARY_3D(func_name, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) \\\n\ +ELTSISE_UNARY_3D(func_name, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) \\\n\ +ELTSISE_UNARY_3D(func_name, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) \\\n\ +ELTSISE_UNARY_3D(func_name, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) \\\n\ +ELTSISE_UNARY_3D(func_name, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) \\\n\ +ELTSISE_UNARY_BF16(func_name)\n\ +\n\ //EXP\n\ -ELTSISE_UNARY_BF16(exp)\n\ +ADD_ELTSISE_UNARY_3D(exp)\n\ //SIN\n\ -ELTSISE_UNARY_BF16(sin)\n\ +ADD_ELTSISE_UNARY_3D(sin)\n\ //COS\n\ -ELTSISE_UNARY_BF16(cos)\n\ +ADD_ELTSISE_UNARY_3D(cos)\n\ //LOG\n\ -ELTSISE_UNARY_BF16(log)\n\ +ADD_ELTSISE_UNARY_3D(log)\n\ //SELU\n\ -ELTSISE_UNARY_BF16(selu)\n\ +ADD_ELTSISE_UNARY_3D(selu)\n\ //NEG\n\ -ELTSISE_UNARY_BF16(neg)\n\ +ADD_ELTSISE_UNARY_3D(neg)\n\ //CELU\n\ -ELTSISE_UNARY_BF16(selu)\n\ +ADD_ELTSISE_UNARY_3D(celu)\n\ +//RCP\n\ +ADD_ELTSISE_UNARY_3D(rcp)\n\ +//SIGN\n\ +ADD_ELTSISE_UNARY_3D(sign)\n\ +//SOFTSIGN\n\ +ADD_ELTSISE_UNARY_3D(softsign)\n\ "; /* end of eltwise_unary_3d_1_vx*/ static const char erf_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -8234,1661 +8989,139 @@ __kernel void get_matrix_F16toF32\n\ }\n\ "; /* end of get_matrix_vx*/ -static const char group_normalization_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char group_normalization_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ _viv_uniform int height;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ \n\ -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4;\n\ -\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform int output_ZP;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_F16(\n\ - image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - vxc_float4 sumsqr;\n\ - vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - 
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - tmpSumSqr += sumsqr;\n\ - }\n\ - }\n\ -\n\ - lcl_sum[lidx] = tmpSumSqr.x;\n\ - lcl_sqr[lidx] = tmpSumSqr.y;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - float sum = 0;\n\ - float sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_F16_2D(\n\ - image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ -\n\ - int2 coord = (int2)(gidx, get_global_id(1));\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - vxc_float4 sumsqr = (vxc_float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - if(gidx < width)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - }\n\ -\n\ - lcl_sum[lidx] = sumsqr.x;\n\ - lcl_sqr[lidx] = sumsqr.y;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - float sum = 0;\n\ - float sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ - float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h, in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ -\n\ - vxc_float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_vari * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toF16_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ - float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h, in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ - vxc_float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_vari * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toU8(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ - float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - 
vxc_half8 scale_h, in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_uchar16 outval;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - float alpha = outputScale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ -\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toU8_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ - float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h, in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_uchar16 outval;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - float alpha = outputScale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, outval, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of group_normalization_f16_vx*/ - -static const char group_normalization_f16_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4;\n\ -\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform int output_ZP;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ - float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ -\n\ - vxc_float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_vari * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toF16_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ - float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ - vxc_float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_vari * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toU8(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ - float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_uchar16 outval;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - float alpha = outputScale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ -\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toU8_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ - float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_uchar16 outval;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - float alpha = outputScale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of group_normalization_f16_scale_vx*/ - -static const char group_normalization_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -\n\ -_viv_uniform float inFlScale_s2;\n\ -_viv_uniform float input_fl_scale;\n\ -_viv_uniform float inOut_fl_scale;\n\ -_viv_uniform float output_fl_scale;\n\ -\n\ -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I16(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int is2D)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ - vxc_short8 src0;\n\ - float sum = 0, sqr = 0;\n\ - vxc_float4 sumsqr = (vxc_float4)(0);\n\ - vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - //tmpSumSqr += sumsqr;\n\ - tmpSumSqr.x += sumsqr.x;\n\ - sqr += (sumsqr.y * inFlScale_s2);\n\ - }\n\ - sum = tmpSumSqr.x * input_fl_scale;\n\ - //sqr = tmpSumSqr.y * inFlScale_s2;\n\ - }\n\ -\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I16_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int is2D)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ -\n\ - int2 coord = (int2)(gidx, gidz);\n\ - 
vxc_short8 src0;\n\ - float sum = 0, sqr = 0;\n\ - vxc_float4 sumsqr = (vxc_float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - if(gidx < width)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - sqr = sumsqr.y * inFlScale_s2;\n\ - sum = sumsqr.x * input_fl_scale;\n\ - //sqr = tmpSumSqr.y * inFlScale_s2;\n\ - }\n\ -\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toF16(\n\ - image2d_array_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int is2D,\n\ - float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ -\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toF16_2D(\n\ - image2d_array_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int 
is2D,\n\ - float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toI16(\n\ - image2d_array_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int is2D,\n\ - float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_short8 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = 
convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toInt16_2x8);\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toI16_2D(\n\ - image2d_array_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int is2D,\n\ - float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_short8 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toInt16_2x8);\n\ - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of group_normalization_i16_vx*/ - -static const char group_normalization_i16_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -\n\ -_viv_uniform float input_fl_scale;\n\ -_viv_uniform float inOut_fl_scale;\n\ -_viv_uniform float output_fl_scale;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8;\n\ -\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toF16(\n\ - image2d_array_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int is2D,\n\ - float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ 
- scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ -\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toF16_2D(\n\ - image2d_array_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int is2D,\n\ - float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_short8 src0;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toI16(\n\ - image2d_array_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int is2D,\n\ - float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_short8 src0, src2;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = 
read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toInt16_2x8);\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toI16_2D(\n\ - image2d_array_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int is2D,\n\ - float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_short8 src0, src2;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toInt16_2x8);\n\ - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of group_normalization_i16_scale_vx*/ - -static const char group_normalization_i8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniSumInt8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSumInt8_16x1;\n\ -_viv_uniform float inFlScale_s2;\n\ -_viv_uniform float input_fl_scale;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4;\n\ -_viv_uniform VXC_512Bits 
uniConvertTrdInt8Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4;\n\ -\n\ -_viv_uniform float inOut_fl_scale;\n\ -_viv_uniform float output_fl_scale;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I8(\n\ - image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ - vxc_char16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1);\n\ - tmpSum += (tmpSum1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1);\n\ - tmpSqr += (tmpSqr1);\n\ - }\n\ - sqr = tmpSqr * inFlScale_s2;\n\ - sum = tmpSum * input_fl_scale;\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I8_2D(\n\ - image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ -\n\ - int2 coord = (int2)(gidx, gidz);\n\ - vxc_char16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum1, tmpSqr1;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - if(gidx < width)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1);\n\ - sqr = tmpSqr1 * inFlScale_s2;\n\ - sum = tmpSum1 * input_fl_scale;\n\ - }\n\ -\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel 
__attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_char16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ -\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toF16_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_char16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - 
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toI8(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_char16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvertFthInt8Fp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toI8_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_char16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of group_normalization_i8_vx*/ - -static const char group_normalization_i8_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform float input_fl_scale;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4;\n\ -_viv_uniform VXC_512Bits 
uniConvertTrdInt8Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4;\n\ -\n\ -_viv_uniform float inOut_fl_scale;\n\ -_viv_uniform float output_fl_scale;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_char16 src0;\n\ - vxc_short8 outval;\n\ - vxc_half8 dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ -\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toF16_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_char16 src0;\n\ - vxc_short8 outval;\n\ - vxc_half8 dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, 
tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toI8(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_char16 src0, src2;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - 
VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toI8_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_char16 src0, src2;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -"; /* end of group_normalization_i8_scale_vx*/ - -static const char group_normalization_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ _viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform int sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform float rowSumScale;\n\ -_viv_uniform float scale_inOut;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform int output_ZP;\n\ +_viv_uniform float input_scale2;\n\ +_viv_uniform float input_zp;\n\ +_viv_uniform float sum_x_tail;\n\ +_viv_uniform float sum_x2_tail0;\n\ +_viv_uniform float sum_x2_tail1;\n\ \n\ +_viv_uniform VXC_512Bits uniSumX_16x1;\n\ +_viv_uniform VXC_512Bits uniSumX2_16x1;\n\ _viv_uniform VXC_512Bits uniResetFp32_4x4;\n\ _viv_uniform int 
group_stride;\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_U8(\n\ - image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ - vxc_uchar16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);\n\ - }\n\ - sqr += (tmpSqr * e2InScale + rowSumScale);\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ +#define GROUP_NORM_SUMS_8BITS_IMPL(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 4; \\\n\ + int lidx = get_local_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(gidx, 0, gidz, 0); \\\n\ + src_type src0; \\\n\ + float2 sums_f32 = 0; \\\n\ + int2 sums = 0, sum_x_x2; \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + if (gidx < width) \\\n\ + { \\\n\ + for(coord.y = 0; coord.y < height;) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + sums = sums + sum_x_x2; \\\n\ + } \\\n\ + sums_f32 = convert_float2(sums); \\\n\ + sums_f32.y = sums_f32.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sums_f32.x; \\\n\ + sums_f32.x = sums_f32.x * input_scale + sum_x_tail; \\\n\ + } \\\n\ + lcl_sum[lidx] = sums_f32.x; \\\n\ + lcl_sqr[lidx] = sums_f32.y; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \\\n\ + if(lidx == 0) 
\\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + float sum_x = 0,sum_x2 = 0; \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum_x += dot(tmp_sum[i], one); \\\n\ + sum_x2 += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + float4 data = (float4)(sum_x, sum_x2, 0, 0); \\\n\ + write_imagef(output, coord_out, data); \\\n\ + } \\\n\ }\n\ +GROUP_NORM_SUMS_8BITS_IMPL(U8, vxc_uchar16)\n\ +GROUP_NORM_SUMS_8BITS_IMPL(I8, vxc_char16)\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_U8_2D(\n\ - image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ -\n\ - int2 coord = (int2)(gidx, get_global_id(1));\n\ - vxc_uchar16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSqr, tmpSum1, tmpSqr1;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ - if(gidx < width)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr = tmpSqr1 + tmpZp1 * tmpSum1;\n\ - sqr = (tmpSqr * e2InScale + rowSumScale);\n\ - sum = (tmpSum1 + sumInZp) * input_scale;\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ +#define GROUP_NORM_SUMS_8BITS_IMPL_2D(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 4; \\\n\ + int lidx = get_local_id(0); \\\n\ + \\\n\ + int2 coord = (int2)(gidx, get_global_id(1)); \\\n\ + src_type src0; \\\n\ + float2 sums = 0; \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + if(gidx < width) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP16x1(sums, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sums, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + sums.y = sums.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sums.x; \\\n\ + sums.x = sums.x * input_scale + sum_x_tail; \\\n\ + } \\\n\ + lcl_sum[lidx] = sums.x; \\\n\ + lcl_sqr[lidx] = sums.y; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + float sum_x = 0,sum_x2 = 0; \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { 
\\\n\ + sum_x += dot(tmp_sum[i], one); \\\n\ + sum_x2 += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + float4 data = (float4)(sum_x, sum_x2, 0, 0); \\\n\ + write_imagef(output, coord_out, data); \\\n\ + } \\\n\ }\n\ +GROUP_NORM_SUMS_8BITS_IMPL_2D(U8, vxc_uchar16)\n\ +GROUP_NORM_SUMS_8BITS_IMPL_2D(I8, vxc_char16)\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_meanvari(\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_means(\n\ image2d_t input, image2d_t output, float eps, float group_ratio)\n\ {\n\ int gidx = get_global_id(0);\n\ int lidx = get_local_id(0);\n\ \n\ int2 coord = (int2)(gidx, get_global_id(1));\n\ - vxc_uchar16 src0;\n\ + vxc_uchar16 src0 = 1;\n\ float2 sum_sqr = (float2)(0);\n\ - vxc_float4 mean_vari;\n\ + float4 mean_vari;\n\ VXC_DP4x4(mean_vari, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniResetFp32_4x4);\n\ \n\ __local float2 lcl_data[16];\n\ @@ -9925,388 +9158,792 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_meanvar }\n\ }\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_uchar16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4;\n\ +#define CONVERT_INPUT_TO_F32() \\\n\ +VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ +VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ +VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ +VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4);\n\ \n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = scale_inOut * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +#define GROUP_NORM_8BITS_IMPL(name, src_type, dst_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_array_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + src_type src0; \\\n\ + dst_type dst; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f; \\\n\ + \\\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\ + VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + vxc_int4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + \\\n\ + CONVERT_INPUT_TO_F32() \\\n\ + norm = tmpData0 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData1 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + norm = tmpData2 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData3 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ +GROUP_NORM_8BITS_IMPL(U8_F16toU8, vxc_uchar16, vxc_uchar16)\n\ +GROUP_NORM_8BITS_IMPL(I8_F16toI8, vxc_char16, vxc_char16)\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), 
gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_uchar16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = scale_inOut * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +#define GROUP_NORM_8BITS_IMPL_2D(name, src_type, dst_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_array_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int2 coord = (int2)(get_global_id(0), gidz); \\\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \\\n\ + src_type src0; \\\n\ + dst_type dst; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f; \\\n\ + \\\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\ + VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + vxc_int4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3, 
norm; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + \\\n\ + CONVERT_INPUT_TO_F32() \\\n\ + norm = tmpData0 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData1 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + norm = tmpData2 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData3 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ +GROUP_NORM_8BITS_IMPL_2D(U8_F16toU8, vxc_uchar16, vxc_uchar16)\n\ +GROUP_NORM_8BITS_IMPL_2D(I8_F16toI8, vxc_char16, vxc_char16)\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toU8(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_uchar16 src0, src2;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = scale_inOut * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +#define GROUP_NORM_8BITS_F32_IMPL(name, src_type, dst_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ 
+ __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + src_type src0; \\\n\ + dst_type dst; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f; \\\n\ + \\\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + scale_f = read_imagef(scale, coord_para.xy); \\\n\ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + vxc_int4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + \\\n\ + CONVERT_INPUT_TO_F32() \\\n\ + norm = tmpData0 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData1 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + norm = tmpData2 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData3 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ +GROUP_NORM_8BITS_F32_IMPL(U8_F32toU8, vxc_uchar16, vxc_uchar16)\n\ +GROUP_NORM_8BITS_F32_IMPL(I8_F32toI8, vxc_char16, vxc_char16)\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toU8_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_uchar16 src0, src2;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = scale_inOut * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of group_normalization_u8_vx*/ +#define GROUP_NORM_8BITS_F32_IMPL_2D(name, src_type, dst_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int2 coord = (int2)(get_global_id(0), gidz); \\\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \\\n\ + src_type src0; \\\n\ + dst_type dst; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f; \\\n\ + \\\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + scale_f = read_imagef(scale, coord_para.xy); \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + vxc_int4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + \\\n\ + CONVERT_INPUT_TO_F32() \\\n\ + norm = tmpData0 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData1 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + norm = tmpData2 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData3 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GROUP_NORM_8BITS_F32_IMPL_2D(U8_F32toU8, vxc_uchar16, vxc_uchar16)\n\ +GROUP_NORM_8BITS_F32_IMPL_2D(I8_F32toI8, vxc_char16, vxc_char16)\n\ +"; /* end of group_normalization_0_vx*/ -static const char group_normalization_u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char group_normalization_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float input_zp;\n\ +\n\ +#define GROUP_NORM_8BITSTOF16_IMPL(name, src_type) \\\n\ +__kernel 
__attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_array_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + src_type src0; \\\n\ + vxc_short8 src1, outval; \\\n\ + vxc_half8 scale_h, dst; \\\n\ + float scale_vari, bias_val; \\\n\ + vxc_float4 bias_f, scale_f; \\\n\ + \\\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\ + VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + float alpha = scale_vari * input_scale; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData1 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + norm = alpha * tmpData2 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData3 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GROUP_NORM_8BITSTOF16_IMPL(U8_F16toF16, vxc_uchar16)\n\ +GROUP_NORM_8BITSTOF16_IMPL(I8_F16toF16, vxc_char16)\n\ +\n\ +\n\ +#define GROUP_NORM_8BITSTOF16_IMPL_2D(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_array_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int2 coord = (int2)(get_global_id(0), gidz); \\\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * 
rSpaceOrg), gidz, 0, 0); \\\n\
+ src_type src0; \\\n\
+ vxc_short8 src1, outval; \\\n\
+ vxc_half8 scale_h, dst; \\\n\
+ float scale_vari, bias_val; \\\n\
+ vxc_float4 bias_f, scale_f; \\\n\
+ \\\n\
+ vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\
+ VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ bias_f = read_imagef(bias, coord_para.xy); \\\n\
+ VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ _viv_asm(COPY, scale_h, src1, 16); \\\n\
+ VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\
+ \\\n\
+ scale_vari = scale_f.s0 * mean_vari.s1; \\\n\
+ vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\
+ half4 tmpVal0, tmpVal1; \\\n\
+ float alpha = scale_vari * input_scale; \\\n\
+ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \\\n\
+ bias_val = bias_val - input_zp * alpha; \\\n\
+ \\\n\
+ VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\
+ VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\
+ VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\
+ VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\
+ norm = alpha * tmpData0 + bias_val; \\\n\
+ _viv_asm(CONV, tmpVal0, norm); \\\n\
+ norm = alpha * tmpData1 + bias_val; \\\n\
+ _viv_asm(CONV, tmpVal1, norm); \\\n\
+ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\
+ _viv_asm(COPY, outval, dst, 16); \\\n\
+ VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ coord.x += 8; \\\n\
+ norm = alpha * tmpData2 + bias_val; \\\n\
+ _viv_asm(CONV, tmpVal0, norm); \\\n\
+ norm = alpha * tmpData3 + bias_val; \\\n\
+ _viv_asm(CONV, tmpVal1, norm); \\\n\
+ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\
+ _viv_asm(COPY, outval, dst, 16); \\\n\
+ VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+}\n\
+GROUP_NORM_8BITSTOF16_IMPL_2D(U8_F16toF16, vxc_uchar16)\n\
+GROUP_NORM_8BITSTOF16_IMPL_2D(I8_F16toF16, vxc_char16)\n\
+\n\
+#define GROUP_NORM_8TOF16_F32_IMPL(name, src_type) \\\n\
+__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __read_only image2d_t bias, \\\n\
+ __read_only image2d_t scale, \\\n\
+ __read_only image2d_t meanVari, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ float eps, int is2D, float rSpaceOrg, int pStride) \\\n\
+{ \\\n\
+ int gidy = get_global_id(1); \\\n\
+ int gidz = get_global_id(2); \\\n\
+ int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\
+ int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\
+ src_type src0; \\\n\
+ vxc_short8 outval; \\\n\
+ vxc_half8 dst; \\\n\
+ float scale_vari, bias_val; \\\n\
+ vxc_float4 bias_f, scale_f; \\\n\
+ \\\n\
+ vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\
+ bias_f = read_imagef(bias, coord_para.xy); \\\n\
+ scale_f = read_imagef(scale, coord_para.xy); \\\n\
+ VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ scale_vari = scale_f.s0 * 
mean_vari.s1; \\\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + float alpha = scale_vari * input_scale; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData1 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + norm = alpha * tmpData2 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData3 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GROUP_NORM_8TOF16_F32_IMPL(U8_F32toF16, vxc_uchar16)\n\ +GROUP_NORM_8TOF16_F32_IMPL(I8_F32toF16, vxc_char16)\n\ +\n\ +#define GROUP_NORM_8TOF16_F32_IMPL_2D(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int2 coord = (int2)(get_global_id(0), gidz); \\\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \\\n\ + src_type src0; \\\n\ + vxc_short8 outval; \\\n\ + vxc_half8 dst; \\\n\ + float scale_vari, bias_val; \\\n\ + vxc_float4 bias_f, scale_f; \\\n\ + \\\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + scale_f = read_imagef(scale, coord_para.xy); \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + float alpha = scale_vari * input_scale; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * 
tmpData1 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + norm = alpha * tmpData2 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData3 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GROUP_NORM_8TOF16_F32_IMPL_2D(U8_F32toF16, vxc_uchar16)\n\ +GROUP_NORM_8TOF16_F32_IMPL_2D(I8_F32toF16, vxc_char16)\n\ +"; /* end of group_normalization_1_vx*/ + +static const char group_normalization_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ _viv_uniform int height;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniSum_X_X2_8x2;\n\ _viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ +_viv_uniform float input_scale2;\n\ +_viv_uniform float input_zp;\n\ +_viv_uniform float sum_x_tail;\n\ +_viv_uniform float sum_x2_tail0;\n\ +_viv_uniform float sum_x2_tail1;\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_uchar16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ \n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +#define GROUP_NORM_SUMS_16BITS_IMPL(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 3; \\\n\ + int lidx = get_local_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(gidx, 0, gidz, 0); \\\n\ + vxc_short8 src0; \\\n\ + src_type in_h; \\\n\ + float4 sumsqr; \\\n\ + float4 tmpSumSqr = (float4)(0); \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + if(gidx < width) \\\n\ + { \\\n\ + for(coord.y = 0; coord.y < height;) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \\\n\ + tmpSumSqr += sumsqr; \\\n\ + } \\\n\ + tmpSumSqr.y = tmpSumSqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * tmpSumSqr.x; \\\n\ + tmpSumSqr.x = tmpSumSqr.x * input_scale + sum_x_tail; \\\n\ + } \\\n\ + \\\n\ + lcl_sum[lidx] = tmpSumSqr.x; \\\n\ + lcl_sqr[lidx] = tmpSumSqr.y; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + \\\n\ + float sum = 0; \\\n\ + float sqr = 0; \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + \\\n\ + float4 data = (float4)(sum, sqr, 0, 0); \\\n\ + write_imagef(output, coord_out, data); \\\n\ + } \\\n\ }\n\ +GROUP_NORM_SUMS_16BITS_IMPL(F16, vxc_half8)\n\ +GROUP_NORM_SUMS_16BITS_IMPL(I16, vxc_short8)\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = 
get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_uchar16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +#define GROUP_NORM_SUMS_16BITS_IMPL_2D(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sums_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 3; \\\n\ + int lidx = get_local_id(0); \\\n\ + \\\n\ + int2 coord = (int2)(gidx, get_global_id(1)); \\\n\ + vxc_short8 src0; \\\n\ + src_type in_h; \\\n\ + float4 sumsqr = (float4)(0); \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + \\\n\ + if(gidx < width) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \\\n\ + sumsqr.y = sumsqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sumsqr.x; \\\n\ + sumsqr.x = sumsqr.x * input_scale + sum_x_tail; \\\n\ + } \\\n\ + \\\n\ + lcl_sum[lidx] = sumsqr.x; \\\n\ + lcl_sqr[lidx] = sumsqr.y; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, 
get_global_id(1), 0, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + \\\n\ + float sum = 0; \\\n\ + float sqr = 0; \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + \\\n\ + float4 data = (float4)(sum, sqr, 0, 0); \\\n\ + write_imagef(output, coord_out, data); \\\n\ + } \\\n\ }\n\ +GROUP_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8)\n\ +GROUP_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8)\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidy = get_global_id(1);\n\ - int gidz = get_global_id(2);\n\ - int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ - int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ - vxc_uchar16 src0;\n\ - vxc_short8 outval;\n\ - vxc_half8 dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +#define GROUP_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_array_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidy = 
get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + vxc_short8 src0; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + src_type in_h; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f; \\\n\ + \\\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\ + VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + float4 tmpData0, tmpData1; \\\n\ + copy_type outval; \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + dst_type dst; \\\n\ + \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + \\\n\ + float4 norm; \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ + norm = alpha * tmpData1 + bias_val; \\\n\ + _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ +GROUP_NORM_16BITS_IMPL(F16_F16toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +GROUP_NORM_16BITS_IMPL(F16_F16toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +GROUP_NORM_16BITS_IMPL(F16_F16toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +GROUP_NORM_16BITS_IMPL(F16_F16toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +GROUP_NORM_16BITS_IMPL(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +GROUP_NORM_16BITS_IMPL(I16_F16toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toF16_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int2 coord = (int2)(get_global_id(0), gidz);\n\ - int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ - vxc_uchar16 src0;\n\ - vxc_short8 outval;\n\ - vxc_half8 dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f;\n\ -\n\ - vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ - bias_f = read_imagef(bias, coord_para.xy);\n\ - scale_f = read_imagef(scale, coord_para.xy);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - 
scale_vari * mean_vari.s0);\n\
-\n\
- VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\
- VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\
- VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\
- VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\
- norm = alpha * tmpData0 + bias_val;\n\
- _viv_asm(CONV, tmpVal0, norm);\n\
- norm = alpha * tmpData1 + bias_val;\n\
- _viv_asm(CONV, tmpVal1, norm);\n\
- VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\
- _viv_asm(COPY, outval, dst, 16);\n\
- VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- coord.x += 8;\n\
- norm = alpha * tmpData2 + bias_val;\n\
- _viv_asm(CONV, tmpVal0, norm);\n\
- norm = alpha * tmpData3 + bias_val;\n\
- _viv_asm(CONV, tmpVal1, norm);\n\
- VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\
- _viv_asm(COPY, outval, dst, 16);\n\
- VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+#define GROUP_NORM_16BITS_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \\\n\
+__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \\\n\
+ __read_only image2d_array_t input, \\\n\
+ __read_only image2d_t bias, \\\n\
+ __read_only image2d_array_t scale, \\\n\
+ __read_only image2d_t meanVari, \\\n\
+ __write_only image2d_array_t output, \\\n\
+ float eps, int is2D, float rSpaceOrg, int pStride) \\\n\
+{ \\\n\
+ int gidz = get_global_id(1); \\\n\
+ int2 coord = (int2)(get_global_id(0), gidz); \\\n\
+ int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \\\n\
+ vxc_short8 src0; \\\n\
+ vxc_short8 src1; \\\n\
+ vxc_half8 scale_h; \\\n\
+ src_type in_h; \\\n\
+ float scale_vari, bias_val; \\\n\
+ float4 bias_f, scale_f; \\\n\
+ \\\n\
+ float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\
+ VXC_ReadImage(src1, scale, coord_para.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\
+ bias_f = read_imagef(bias, coord_para.xy); \\\n\
+ VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ \\\n\
+ _viv_asm(COPY, scale_h, src1, 16); \\\n\
+ VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\
+ \\\n\
+ scale_vari = scale_f.s0 * mean_vari.s1; \\\n\
+ float4 tmpData0, tmpData1; \\\n\
+ copy_type outval; \\\n\
+ conv_type tmpVal0, tmpVal1; \\\n\
+ float alpha = input_scale * output_scale * scale_vari; \\\n\
+ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\
+ bias_val = bias_val - input_zp * alpha; \\\n\
+ dst_type dst; \\\n\
+ \\\n\
+ _viv_asm(COPY, in_h, src0, 16); \\\n\
+ VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\
+ VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\
+ float4 norm; \\\n\
+ norm = alpha * tmpData0 + bias_val; \\\n\
+ _viv_asm(CONV, tmpVal0, norm); \\\n\
+ norm = alpha * tmpData1 + bias_val; \\\n\
+ _viv_asm(CONV, tmpVal1, norm); \\\n\
+ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\
+ _viv_asm(COPY, outval, dst, 
16); \\\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -"; /* end of group_normalization_u8_f16_vx*/ +GROUP_NORM_16BITS_IMPL_2D(F16_F16toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +GROUP_NORM_16BITS_IMPL_2D(F16_F16toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +GROUP_NORM_16BITS_IMPL_2D(F16_F16toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +GROUP_NORM_16BITS_IMPL_2D(F16_F16toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +GROUP_NORM_16BITS_IMPL_2D(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +GROUP_NORM_16BITS_IMPL_2D(I16_F16toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +\n\ +#define GROUP_NORM_16BITS_F32_IMPL(name, src_type, dst_type, copy_type, conv_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); \\\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); \\\n\ + vxc_short8 src0; \\\n\ + src_type in_h; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f; \\\n\ + \\\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + scale_f = read_imagef(scale, coord_para.xy); \\\n\ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + float4 tmpData0, tmpData1; \\\n\ + copy_type outval; \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + dst_type dst; \\\n\ + \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + \\\n\ + float4 norm; \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData1 + bias_val; \\\n\ + _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GROUP_NORM_16BITS_F32_IMPL(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +GROUP_NORM_16BITS_F32_IMPL(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +GROUP_NORM_16BITS_F32_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +GROUP_NORM_16BITS_F32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +GROUP_NORM_16BITS_F32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +GROUP_NORM_16BITS_F32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +\n\ +#define GROUP_NORM_16BITS_F32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name##_2D( \\\n\ + 
__read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int is2D, float rSpaceOrg, int pStride) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int2 coord = (int2)(get_global_id(0), gidz); \\\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); \\\n\ + vxc_short8 src0; \\\n\ + src_type in_h; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f; \\\n\ + \\\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + scale_f = read_imagef(scale, coord_para.xy); \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + float4 tmpData0, tmpData1; \\\n\ + copy_type outval; \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + dst_type dst; \\\n\ + \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + float4 norm; \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData1 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +GROUP_NORM_16BITS_F32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +GROUP_NORM_16BITS_F32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +\n\ +"; /* end of group_normalization_2_vx*/ static const char grucell_activation_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -10555,7 +10192,8 @@ __kernel void grucell_activation_z_h_F16_F16toF16_##act_name( \\\n\ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func)\n\ +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func)\n\ +GRUCELL_F16_F16TOF16(HSIGMOID, hard_sigmoid)\n\ \n\ _viv_uniform float hstate_in_scale;\n\ _viv_uniform float hstate_in_tail;\n\ @@ -10604,9 +10242,12 @@ __kernel void grucell_activation_z_h_##name0##_F16to##name1##_##act_name( \\\n\ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ -GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8)\n\ -GRUCELL_QNT_F16TO_QNT(I16, 
I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8)\n\ +GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8)\n\ +GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8)\n\ +GRUCELL_QNT_F16TO_QNT(U8, U8, HSIGMOID, hard_sigmoid, vxc_uchar8, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_QNT(I8, I8, HSIGMOID, hard_sigmoid, vxc_char8, vxc_char8)\n\ +GRUCELL_QNT_F16TO_QNT(I16, I16, HSIGMOID, hard_sigmoid, vxc_short8, vxc_short8)\n\ "; /* end of grucell_activation_z_h_vx*/ static const char grucell_cdnn_activation_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -11455,7 +11096,8 @@ __kernel void grucell_h_times_activation_r_F16_F16toF16_##act_name( \\\n\ _viv_asm(COPY, dst, dst1, 8); \\\n\ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func)\n\ +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func)\n\ +GRUCELL_F16_F16TOF16(HSIGMOID, hard_sigmoid)\n\ \n\ _viv_uniform float hstate_in_scale;\n\ _viv_uniform float hstate_in_tail;\n\ @@ -11492,9 +11134,12 @@ __kernel void grucell_h_times_activation_r_##name0##_F16toF16_##act_name( \\\n\ _viv_asm(COPY, dst, dst1, 8); \\\n\ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -GRUCELL_QNT_F16TO_F16(U8, SIGMOID, sigmoid_func, vxc_uchar8)\n\ -GRUCELL_QNT_F16TO_F16(I8, SIGMOID, sigmoid_func, vxc_char8)\n\ -GRUCELL_QNT_F16TO_F16(I16, SIGMOID, sigmoid_func, vxc_short8)\n\ +GRUCELL_QNT_F16TO_F16(U8, SIGMOID, sigmoid_func, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_F16(I8, SIGMOID, sigmoid_func, vxc_char8)\n\ +GRUCELL_QNT_F16TO_F16(I16, SIGMOID, sigmoid_func, vxc_short8)\n\ +GRUCELL_QNT_F16TO_F16(U8, HSIGMOID, hard_sigmoid, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_F16(I8, HSIGMOID, hard_sigmoid, vxc_char8)\n\ +GRUCELL_QNT_F16TO_F16(I16, HSIGMOID, hard_sigmoid, vxc_short8)\n\ "; /* end of grucell_h_times_activation_r_vx*/ static const char grucell_reset_after_activation_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -11790,1141 +11435,181 @@ __kernel void hswish_BF16toBF16_2D(\n\ }\n\ "; /* end of hswish_vx*/ -static const char instance_normalization_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char instance_normalization_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ _viv_uniform int height;\n\ -_viv_uniform float dimRatio;\n\ +_viv_uniform float inv_multiplier;\n\ _viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ \n\ -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniSum_X_X2_16x2;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float input_scale2;\n\ +_viv_uniform float input_zp;\n\ +_viv_uniform float sum_x_tail;\n\ +_viv_uniform float sum_x2_tail0;\n\ +_viv_uniform float sum_x2_tail1;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16(\n\ - image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, gidz);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - vxc_float4 sumsqr;\n\ - vxc_float4 tmpSumSqr = 
(vxc_float4)(0);\n\ +_viv_uniform VXC_512Bits uniSumX_16x1;\n\ +_viv_uniform VXC_512Bits uniSumX2_16x1;\n\ \n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - tmpSumSqr += sumsqr;\n\ - }\n\ - }\n\ -\n\ - lcl_sum[lidx] = tmpSumSqr.x;\n\ - lcl_sqr[lidx] = tmpSumSqr.y;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - float sum = 0;\n\ - float sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16_2D(\n\ - image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ -\n\ - int2 coord = (int2)(gidx, gidy);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - vxc_float4 sumsqr;\n\ - vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int endH = gidy + height;\n\ - if(gidx < width)\n\ - {\n\ - for(; coord.y < endH;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - tmpSumSqr += sumsqr;\n\ - }\n\ - }\n\ -\n\ - lcl_sum[lidx] = tmpSumSqr.x;\n\ - lcl_sqr[lidx] = tmpSumSqr.y;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - float sum = 0;\n\ - float sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - 
vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h, in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ - bias_f = read_imagef(bias, coord_para);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ -\n\ - coord_in.y ++;\n\ -\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ -\n\ - vxc_float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_vari * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16_2D(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int4 coord = (int4)(get_global_id(0), gidy, 0, 0);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - int endH = gidy + height;\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h, in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ - bias_f = read_imagef(bias, coord_para);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = 
rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - for(; coord.y < endH; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ -\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ - vxc_float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_vari * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of instance_normalization_f16_vx*/ - -static const char instance_normalization_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -\n\ -_viv_uniform float inFlScale_s2;\n\ -_viv_uniform float input_fl_scale;\n\ -_viv_uniform float inOut_fl_scale;\n\ -_viv_uniform float output_fl_scale;\n\ -\n\ -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I16(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, gidz);\n\ - vxc_short8 src0;\n\ - float sum = 0, sqr = 0;\n\ - vxc_float4 sumsqr = (vxc_float4)(0);\n\ - vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ +#define INSTANCE_NORM_SUMS_8BITS_IMPL(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 4; \\\n\ + int lidx = get_local_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(gidx, 0, gidz, gidz); \\\n\ + src_type src0; \\\n\ + float2 sums_f32 = 0; \\\n\ + int2 sums = 0, sum_x_x2; \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + if(gidx 
< width) \\\n\ + { \\\n\ + for(coord.y = 0; coord.y < height;) \\\n\ + { \\\n\ VXC_OP4(img_load_3d, src0, input, coord, 0, \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - //tmpSumSqr += sumsqr;\n\ - tmpSumSqr.x += sumsqr.x;\n\ - sqr += (sumsqr.y * inFlScale_s2);\n\ - }\n\ - sum = tmpSumSqr.x * input_fl_scale;\n\ - //sqr = tmpSumSqr.y * inFlScale_s2;\n\ - }\n\ -\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + sums = sums + sum_x_x2; \\\n\ + } \\\n\ + sums_f32 = convert_float2(sums); \\\n\ + sums_f32.y = sums_f32.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sums_f32.x; \\\n\ + sums_f32.x = sums_f32.x * input_scale + sum_x_tail; \\\n\ + } \\\n\ + lcl_sum[lidx] = sums_f32.x; \\\n\ + lcl_sqr[lidx] = sums_f32.y; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + float sum = 0, sqr = 0; \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + float4 data = (float4)(sum, sqr, 0, 0); \\\n\ + write_imagef(output, coord_out, data); \\\n\ + } \\\n\ }\n\ +INSTANCE_NORM_SUMS_8BITS_IMPL(U8, vxc_uchar16)\n\ +INSTANCE_NORM_SUMS_8BITS_IMPL(I8, vxc_char16)\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I16_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ -\n\ - int2 coord = (int2)(gidx, gidy);\n\ - vxc_short8 src0;\n\ - float sum = 0, sqr = 0;\n\ - vxc_float4 sumsqr = (vxc_float4)(0);\n\ - vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int endH = gidy + height;\n\ - if(gidx < width)\n\ - {\n\ - for(; coord.y < endH;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, 0,\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - //tmpSumSqr += sumsqr;\n\ - tmpSumSqr.x += sumsqr.x;\n\ - sqr += (sumsqr.y * inFlScale_s2);\n\ - }\n\ - sum = tmpSumSqr.x * input_fl_scale;\n\ - //sqr = tmpSumSqr.y * inFlScale_s2;\n\ - }\n\ -\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = 
sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ +#define INSTANCE_NORM_SUMS_8BITS_IMPL_2D(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 4; \\\n\ + int lidx = get_local_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + int gidy = gidz * height; \\\n\ + \\\n\ + int2 coord = (int2)(gidx, gidy); \\\n\ + src_type src0; \\\n\ + float2 sums_f32 = 0; \\\n\ + int2 sums = 0, sum_x_x2; \\\n\ + int endH = gidy + height; \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + if (gidx < width) \\\n\ + { \\\n\ + for(; coord.y < endH;) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord, 0, \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + sums = sums + sum_x_x2; \\\n\ + } \\\n\ + sums_f32 = convert_float2(sums); \\\n\ + sums_f32.y = sums_f32.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * sums_f32.x; \\\n\ + sums_f32.x = sums_f32.x * input_scale + sum_x_tail; \\\n\ + } \\\n\ + lcl_sum[lidx] = sums_f32.x; \\\n\ + lcl_sqr[lidx] = sums_f32.y; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + float sum = 0, sqr = 0; \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + float4 data = (float4)(sum, sqr, 0, 0); \\\n\ + write_imagef(output, coord_out, data); \\\n\ + } \\\n\ }\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toF16(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, 0,\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ -\n\ - for(int i = 0; i < 
group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, 0, \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ -\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toF16_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int4 coord = (int4)(get_global_id(0), gidy, 0, 0);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - int endH = gidy + height;\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, 0,\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - for(; coord.y < endH; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, 0,\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - 
uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toI16(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - vxc_short8 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, 0,\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, 0, \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toInt16_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, src2, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16toI16_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - 
int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int2 coord = (int2)(get_global_id(0), gidy);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - int endH = gidy + height;\n\ - vxc_short8 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, 0,\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - for(; coord.y < endH; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, 0,\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toInt16_2x8);\n\ - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of instance_normalization_i16_vx*/ - -static const char instance_normalization_i8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniSumInt8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSumInt8_16x1;\n\ -_viv_uniform float inFlScale_s2;\n\ -_viv_uniform float input_fl_scale;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertTrdInt8Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4;\n\ -\n\ -_viv_uniform float inOut_fl_scale;\n\ -_viv_uniform float output_fl_scale;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8(\n\ - image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, gidz);\n\ - vxc_char16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + 
input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1);\n\ - tmpSum += (tmpSum1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1);\n\ - tmpSqr += (tmpSqr1);\n\ - }\n\ - sqr = tmpSqr * inFlScale_s2;\n\ - sum = tmpSum * input_fl_scale;\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8_2D(\n\ - image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ -\n\ - int2 coord = (int2)(gidx, gidy);\n\ - vxc_char16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int endH = gidy + height;\n\ - if(gidx < width)\n\ - {\n\ - for(; coord.y < endH;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1);\n\ - tmpSum += (tmpSum1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1);\n\ - tmpSqr += (tmpSqr1);\n\ - }\n\ - sqr = tmpSqr * inFlScale_s2;\n\ - sum = tmpSum * input_fl_scale;\n\ - }\n\ -\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - vxc_char16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - 
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ - bias_f = read_imagef(bias, coord_para);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - coord_para = coord;\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_para.z, baseAddr);\n\ -\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_para.xy = coord.xy;\n\ - coord.y++;\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ -\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_para.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16_2D(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int4 coord = (int4)(get_global_id(0), gidy, 0, 0);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - int endH = gidy + height;\n\ - vxc_char16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, 
scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - for(; coord.y < endH;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_para = coord;\n\ - coord.y++;\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_para.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - vxc_char16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val 
= (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8_2D(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int2 coord = (int2)(get_global_id(0), gidy);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - int endH = gidy + height;\n\ - vxc_char16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - for(; coord.y < endH; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, 
src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -"; /* end of instance_normalization_i8_vx*/ - -static const char instance_normalization_scale_f32_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int height;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform float scale_inOut;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform int output_ZP;\n\ -_viv_uniform float inOut_fl_scale;\n\ -_viv_uniform float output_fl_scale;\n\ -_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8;\n\ -\n\ -#define INSTANCENORM_8BITS_F32(src1_type_name, read_type) \\\n\ -__kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \\\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, \\\n\ - image2d_array_t output, float eps, int rsFlg) \\\n\ +INSTANCE_NORM_SUMS_8BITS_IMPL_2D(U8, vxc_uchar16)\n\ +INSTANCE_NORM_SUMS_8BITS_IMPL_2D(I8, vxc_char16)\n\ +\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4;\n\ +#define INSTANCE_NORM_8BITS_IMPL(name, src_type, dst_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ - int2 coord_para = (int2)(gidz, 0); \\\n\ - read_type src0, src2; \\\n\ + int2 coord_para = (int2)(0, gidz); \\\n\ + src_type src0; \\\n\ + dst_type dst; \\\n\ float scale_vari, bias_val; \\\n\ - vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ - \\\n\ - Image img1 = create_image_from_image2d(bias, 4); \\\n\ - Image img2 = create_image_from_image2d(scale, 4); \\\n\ - Image img3 = create_image_from_image2d(meanVari, 4); \\\n\ - __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ - __global 
float* scal_ptr = (__global float*)img2.ptr; \\\n\ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); \\\n\ - __global float4* vari_ptr = (__global float4*)sumVari_ptr; \\\n\ - \\\n\ - float bval = bias_ptr[gidz]; \\\n\ - float sval = scal_ptr[gidz]; \\\n\ + float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ \\\n\ + scale_f = read_imagef(scale, coord_para); \\\n\ + bias_f = read_imagef(bias, coord_para); \\\n\ for(int i = 0; i < group_num; i++) \\\n\ { \\\n\ - mean_vari += vari_ptr[i]; \\\n\ + mean_vari += read_imagef(meanVari, coord_para); \\\n\ + coord_para.x += 4; \\\n\ } \\\n\ - mean_vari *= dimRatio; \\\n\ + mean_vari *= inv_multiplier; \\\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ \\\n\ - scale_vari = sval * mean_vari.s1; \\\n\ - short zp = inputZP; \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ vxc_int4 tmpVal0, tmpVal1; \\\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ - float alpha = scale_inOut * scale_vari; \\\n\ - bias_val = (bval - scale_vari * mean_vari.s0) * outputScale + output_ZP; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ \\\n\ int8 input_desc, output_desc; \\\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ @@ -12936,240 +11621,543 @@ __kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \\\n\ \\\n\ for(coord.y = 0; coord.y < height; coord.y++) \\\n\ { \\\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0, \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_in.y ++; \\\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ - uniConvert1stUint8SubZpToFp32_4x4); \\\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ - uniConvert2ndUint8SubZpToFp32_4x4); \\\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ - uniConvert3rdUint8SubZpToFp32_4x4); \\\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ - uniConvert4thUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ norm = tmpData0 * alpha + bias_val; \\\n\ tmpVal0 = convert_int4_rte(norm); \\\n\ norm = tmpData1 * alpha + bias_val; \\\n\ tmpVal1 = convert_int4_rte(norm); \\\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ norm = tmpData2 * alpha + bias_val; \\\n\ tmpVal0 = convert_int4_rte(norm); \\\n\ norm = tmpData3 * alpha + bias_val; \\\n\ tmpVal1 = convert_int4_rte(norm); \\\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ - VXC_OP4_NoDest(img_store_3d, output, 
coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ } \\\n\ }\n\ -INSTANCENORM_8BITS_F32(U8, vxc_uchar16)\n\ -INSTANCENORM_8BITS_F32(I8, vxc_char16)\n\ +INSTANCE_NORM_8BITS_IMPL(U8_F32toU8, vxc_uchar16, vxc_uchar16)\n\ +INSTANCE_NORM_8BITS_IMPL(I8_F32toI8, vxc_char16, vxc_char16)\n\ \n\ -#define INSTANCENORM_8BITS_F32_2D(src1_type_name, read_type) \\\n\ -__kernel void instance_norm_##src1_type_name##F32to##src1_type_name##_2D( \\\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, \\\n\ - image2d_array_t output, float eps, int rsFlg) \\\n\ +#define INSTANCE_NORM_8BITS_IMPL_2D(name, src_type, dst_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int gidy = gidz * height; \\\n\ int2 coord = (int2)(get_global_id(0), gidy); \\\n\ - int2 coord_para = (int2)(gidz, 0); \\\n\ + int2 coord_para = (int2)(0, gidz); \\\n\ int endH = gidy + height; \\\n\ - read_type src0, src2; \\\n\ + src_type src0; \\\n\ + dst_type dst; \\\n\ float scale_vari, bias_val; \\\n\ - vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ - \\\n\ - Image img1 = create_image_from_image2d(bias, 4); \\\n\ - Image img2 = create_image_from_image2d(scale, 4); \\\n\ - Image img3 = create_image_from_image2d(meanVari, 4); \\\n\ - __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ - __global float* scal_ptr = (__global float*)img2.ptr; \\\n\ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); \\\n\ - __global float4* vari_ptr = (__global float4*)sumVari_ptr; \\\n\ - \\\n\ - float bval = bias_ptr[gidz]; \\\n\ - float sval = scal_ptr[gidz]; \\\n\ + float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ \\\n\ + scale_f = read_imagef(scale, coord_para); \\\n\ + bias_f = read_imagef(bias, coord_para); \\\n\ for(int i = 0; i < group_num; i++) \\\n\ { \\\n\ - mean_vari += vari_ptr[i]; \\\n\ + mean_vari += read_imagef(meanVari, coord_para); \\\n\ + coord_para.x += 4; \\\n\ } \\\n\ - \\\n\ - mean_vari *= dimRatio; \\\n\ + mean_vari *= inv_multiplier; \\\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ \\\n\ - scale_vari = sval * mean_vari.s1; \\\n\ - short zp = inputZP; \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ vxc_int4 tmpVal0, tmpVal1; \\\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ - float alpha = scale_inOut * scale_vari; \\\n\ - bias_val = (bval - scale_vari * mean_vari.s0) * outputScale + output_ZP; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ \\\n\ for(; coord.y < endH; coord.y++) \\\n\ { \\\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ - 
uniConvert1stUint8SubZpToFp32_4x4); \\\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ - uniConvert2ndUint8SubZpToFp32_4x4); \\\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ - uniConvert3rdUint8SubZpToFp32_4x4); \\\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ - uniConvert4thUint8SubZpToFp32_4x4); \\\n\ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ norm = tmpData0 * alpha + bias_val; \\\n\ tmpVal0 = convert_int4_rte(norm); \\\n\ norm = tmpData1 * alpha + bias_val; \\\n\ tmpVal1 = convert_int4_rte(norm); \\\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ norm = tmpData2 * alpha + bias_val; \\\n\ tmpVal0 = convert_int4_rte(norm); \\\n\ norm = tmpData3 * alpha + bias_val; \\\n\ tmpVal1 = convert_int4_rte(norm); \\\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ } \\\n\ }\n\ -INSTANCENORM_8BITS_F32_2D(U8, vxc_uchar16)\n\ -INSTANCENORM_8BITS_F32_2D(I8, vxc_char16)\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F32toI16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int2 coord_para = (int2)(gidz, 0);\n\ - vxc_short8 src0, src2;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ - Image img3 = create_image_from_image2d(meanVari, 4);\n\ - __global float* bias_ptr = (__global float*)img1.ptr;\n\ - __global float* scal_ptr = (__global float*)img2.ptr;\n\ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx);\n\ - __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ -\n\ - float bval = bias_ptr[gidz];\n\ - float sval = scal_ptr[gidz];\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += vari_ptr[i];\n\ - }\n\ -\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = sval * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bval - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - int8 
input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toInt16_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, src2, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F32toI16_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale,\n\ - image2d_t meanVari, image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int2 coord = (int2)(get_global_id(0), gidy);\n\ - int2 coord_para = (int2)(gidz, 0);\n\ - int endH = gidy + height;\n\ - vxc_short8 src0, src2;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ - Image img3 = create_image_from_image2d(meanVari, 4);\n\ - __global float* bias_ptr = (__global float*)img1.ptr;\n\ - __global float* scal_ptr = (__global float*)img2.ptr;\n\ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx);\n\ - __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ -\n\ - float bval = bias_ptr[gidz];\n\ - float sval = scal_ptr[gidz];\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += vari_ptr[i];\n\ - }\n\ -\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = sval * mean_vari.s1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - float alpha = inOut_fl_scale * scale_vari;\n\ - bias_val = (bval - scale_vari * mean_vari.s0) * output_fl_scale;\n\ -\n\ - for(; coord.y < endH; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Fst_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertInt16Fp32Secd_4x4);\n\ - vxc_float4 norm;\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toInt16_2x8);\n\ - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of instance_normalization_scale_f32_vx*/ +INSTANCE_NORM_8BITS_IMPL_2D(U8_F32toU8, vxc_uchar16, vxc_uchar16)\n\ +INSTANCE_NORM_8BITS_IMPL_2D(I8_F32toI8, vxc_char16, vxc_char16)"; /* end of instance_normalization_0_vx*/ -static const char instance_normalization_scale_f32_bf16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char instance_normalization_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ _viv_uniform int height;\n\ -_viv_uniform float dimRatio;\n\ +_viv_uniform float inv_multiplier;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float input_zp;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4;\n\ +\n\ +#define INSTANCE_NORM_8_TO_F16_IMPL(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int4 coord_para = (int4)(0, gidz, 0, 0); \\\n\ + src_type src0; \\\n\ + vxc_short8 outval; \\\n\ + vxc_half8 dst; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + \\\n\ + scale_f = read_imagef(scale, coord_para.xy); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_para.xy); \\\n\ + coord_para.x += 4; \\\n\ + } \\\n\ + mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + float alpha = scale_vari * input_scale; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + \\\n\ + coord_para = coord; \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_para.z, baseAddr); \\\n\ + for(coord.y = 0; coord.y < height;) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_para.xy = coord.xy; \\\n\ + coord.y++; \\\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = 
alpha * tmpData1 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord_para.x += 8; \\\n\ + norm = alpha * tmpData2 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData3 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +INSTANCE_NORM_8_TO_F16_IMPL(U8_F32toF16, vxc_uchar16)\n\ +INSTANCE_NORM_8_TO_F16_IMPL(I8_F32toF16, vxc_char16)\n\ +\n\ +#define INSTANCE_NORM_8_TO_F16_IMPL_2D(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int gidy = gidz * height; \\\n\ + int4 coord = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + int4 coord_para = (int4)(0, gidz, 0, 0); \\\n\ + int endH = gidy + height; \\\n\ + src_type src0; \\\n\ + vxc_short8 outval; \\\n\ + vxc_half8 dst; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + \\\n\ + scale_f = read_imagef(scale, coord_para.xy); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_para.xy); \\\n\ + coord_para.x += 4; \\\n\ + } \\\n\ + mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + float alpha = scale_vari * input_scale; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + for(; coord.y < endH;) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_para = coord; \\\n\ + coord.y++; \\\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData1 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_para.x += 8; \\\n\ + norm = alpha * tmpData2 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); 
\\\n\ + norm = alpha * tmpData3 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +INSTANCE_NORM_8_TO_F16_IMPL_2D(U8_F32toF16, vxc_uchar16)\n\ +INSTANCE_NORM_8_TO_F16_IMPL_2D(I8_F32toF16, vxc_char16)\n\ +"; /* end of instance_normalization_1_vx*/ + +static const char instance_normalization_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform float inv_multiplier;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniSum_X_X2_8x2;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float input_scale2;\n\ +_viv_uniform float input_zp;\n\ +_viv_uniform float sum_x_tail;\n\ +_viv_uniform float sum_x2_tail0;\n\ +_viv_uniform float sum_x2_tail1;\n\ +\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +#define INSTANCE_NORM_SUMS_16BITS_IMPL(name, src_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 3; \\\n\ + int lidx = get_local_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(gidx, 0, gidz, gidz); \\\n\ + vxc_short8 src0; \\\n\ + src_type in_h; \\\n\ + float4 sumsqr; \\\n\ + float4 tmpSumSqr = (float4)(0); \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + if(gidx < width) \\\n\ + { \\\n\ + for(coord.y = 0; coord.y < height;) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), \\\n\ + uniSum_X_X2_8x2); \\\n\ + tmpSumSqr += sumsqr; \\\n\ + } \\\n\ + tmpSumSqr.y = tmpSumSqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * tmpSumSqr.x; \\\n\ + tmpSumSqr.x = tmpSumSqr.x * input_scale + sum_x_tail; \\\n\ + } \\\n\ + \\\n\ + lcl_sum[lidx] = tmpSumSqr.x; \\\n\ + lcl_sqr[lidx] = tmpSumSqr.y; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + \\\n\ + float sum = 0; \\\n\ + float sqr = 0; \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + \\\n\ + float4 data = (float4)(sum, sqr, 0, 0); \\\n\ + write_imagef(output, coord_out, data); \\\n\ + } \\\n\ +}\n\ +INSTANCE_NORM_SUMS_16BITS_IMPL(F16, vxc_half8)\n\ +INSTANCE_NORM_SUMS_16BITS_IMPL(I16, vxc_short8)\n\ +\n\ +#define INSTANCE_NORM_SUMS_16BITS_IMPL_2D(name, src_type) \\\n\ +__kernel 
__attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ +{ \\\n\ + int gidx = get_global_id(0) << 3; \\\n\ + int lidx = get_local_id(0); \\\n\ + int gidz = get_global_id(1); \\\n\ + int gidy = gidz * height; \\\n\ + \\\n\ + int2 coord = (int2)(gidx, gidy); \\\n\ + vxc_short8 src0; \\\n\ + src_type in_h; \\\n\ + float4 sumsqr; \\\n\ + float4 tmpSumSqr = (float4)(0); \\\n\ + \\\n\ + __local float lcl_sum[16]; \\\n\ + __local float lcl_sqr[16]; \\\n\ + \\\n\ + int endH = gidy + height; \\\n\ + if(gidx < width) \\\n\ + { \\\n\ + for(; coord.y < endH;) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), \\\n\ + uniSum_X_X2_8x2); \\\n\ + tmpSumSqr += sumsqr; \\\n\ + } \\\n\ + tmpSumSqr.y = tmpSumSqr.y * input_scale2 + sum_x2_tail0 + sum_x2_tail1 * tmpSumSqr.x; \\\n\ + tmpSumSqr.x = tmpSumSqr.x * input_scale + sum_x_tail; \\\n\ + } \\\n\ + \\\n\ + lcl_sum[lidx] = tmpSumSqr.x; \\\n\ + lcl_sqr[lidx] = tmpSumSqr.y; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + \\\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); \\\n\ + if(lidx == 0) \\\n\ + { \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum; \\\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr; \\\n\ + \\\n\ + float sum = 0; \\\n\ + float sqr = 0; \\\n\ + for(int i = 0; i < 4; i++) \\\n\ + { \\\n\ + sum += dot(tmp_sum[i], one); \\\n\ + sqr += dot(tmp_sqr[i], one); \\\n\ + } \\\n\ + \\\n\ + float4 data = (float4)(sum, sqr, 0, 0); \\\n\ + write_imagef(output, coord_out, data); \\\n\ + } \\\n\ +}\n\ +INSTANCE_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8)\n\ +INSTANCE_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8)\n\ +\n\ +#define INSTANCE_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int4 coord_para = (int4)(0, gidz, 0, 0); \\\n\ + vxc_short8 src0; \\\n\ + src_type in_h; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + \\\n\ + scale_f = read_imagef(scale, coord_para.xy); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_para.xy); \\\n\ + coord_para.x += 4; \\\n\ + } \\\n\ + mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + float4 tmpData0, tmpData1; \\\n\ + copy_type outval; \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + 
dst_type dst; \\\n\ + \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr); \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + \\\n\ + coord_in.y ++; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + \\\n\ + float4 norm; \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ + norm = alpha * tmpData1 + bias_val; \\\n\ + _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +INSTANCE_NORM_16BITS_IMPL(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +INSTANCE_NORM_16BITS_IMPL(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +INSTANCE_NORM_16BITS_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +INSTANCE_NORM_16BITS_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +INSTANCE_NORM_16BITS_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +INSTANCE_NORM_16BITS_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +\n\ +#define INSTANCE_NORM_16BITS_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __read_only image2d_t meanVari, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, int rs_flag) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int gidy = gidz * height; \\\n\ + int4 coord = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + int4 coord_para = (int4)(0, gidz, 0, 0); \\\n\ + int endH = gidy + height; \\\n\ + vxc_short8 src0; \\\n\ + src_type in_h; \\\n\ + float scale_vari, bias_val; \\\n\ + float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + \\\n\ + scale_f = read_imagef(scale, coord_para.xy); \\\n\ + bias_f = read_imagef(bias, coord_para.xy); \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_para.xy); \\\n\ + coord_para.x += 4; \\\n\ + } \\\n\ + mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + float alpha = input_scale * output_scale * scale_vari; \\\n\ + float4 tmpData0, tmpData1; \\\n\ + copy_type outval; \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ + bias_val = bias_val - input_zp * alpha; \\\n\ + dst_type dst; \\\n\ + \\\n\ + for(; coord.y < endH; coord.y++) \\\n\ + { \\\n\ + 
VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ + float4 norm; \\\n\ + norm = alpha * tmpData0 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + norm = alpha * tmpData1 + bias_val; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +INSTANCE_NORM_16BITS_IMPL_2D(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +INSTANCE_NORM_16BITS_IMPL_2D(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +INSTANCE_NORM_16BITS_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +INSTANCE_NORM_16BITS_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +INSTANCE_NORM_16BITS_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +INSTANCE_NORM_16BITS_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +"; /* end of instance_normalization_2_vx*/ + +static const char instance_normalization_3_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform float inv_multiplier;\n\ _viv_uniform int group_num;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ -constant vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ -constant float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_BF16(\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_BF16(\n\ image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidx = get_global_id(0) << 3;\n\ @@ -13178,8 +12166,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int4 coord = (int4)(gidx, 0, gidz, gidz);\n\ vxc_short8 src0, src1, src2;\n\ float4 srcA, srcB;\n\ - vxc_float sum = 0, sqr = 0;\n\ -\n\ + float sum = 0, sqr = 0;\n\ + float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ \n\ @@ -13229,7 +12218,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean }\n\ }\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_BF16_2D(\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_BF16_2D(\n\ image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidx = get_global_id(0) << 3;\n\ @@ -13240,7 +12229,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int2 coord = (int2)(gidx, gidy);\n\ vxc_short8 src0, src1, src2;\n\ float4 srcA, srcB;\n\ - vxc_float sum = 0, sqr = 0;\n\ + float sum = 0, sqr = 0;\n\ + float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ @@ -13287,7 +12278,7 @@ __kernel 
__attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean }\n\ }\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16F32toBF16(\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16_F32toBF16(\n\ image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ image2d_array_t output, float eps, int rsFlg)\n\ {\n\ @@ -13296,30 +12287,26 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ vxc_short8 src0, src1, src2;\n\ float scale_vari, bias_val;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ + float4 mean_vari = (float4)(0);\n\ \n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ Image img3 = create_image_from_image2d(meanVari, 4);\n\ - __global float* bias_ptr = (__global float*)img1.ptr;\n\ - __global float* scal_ptr = (__global float*)img2.ptr;\n\ __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz));\n\ __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ \n\ - float bval = bias_ptr[gidz];\n\ - float sval = scal_ptr[gidz];\n\ + float sval = read_imagef(scale, coord.yz).x;\n\ + float bval = read_imagef(bias, coord.yz).x;\n\ \n\ for(int i = 0; i < group_num; i++)\n\ {\n\ mean_vari += vari_ptr[i];\n\ }\n\ \n\ - mean_vari *= dimRatio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = sval * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ + float4 tmpData0, tmpData1;\n\ bias_val = (bval - scale_vari * mean_vari.s0);\n\ \n\ int8 input_desc, output_desc;\n\ @@ -13343,7 +12330,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 _viv_asm(COPY, tmpData0, src1, 16);\n\ _viv_asm(COPY, tmpData1, src2, 16);\n\ \n\ - vxc_float4 norm;\n\ + float4 norm;\n\ norm = scale_vari * tmpData0 + bias_val;\n\ _viv_asm(COPY, src0, norm, 16);\n\ norm = scale_vari * tmpData1 + bias_val;\n\ @@ -13365,30 +12352,26 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 int endH = gidy + height;\n\ vxc_short8 src0, src1, src2;\n\ float scale_vari, bias_val;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ + float4 mean_vari = (float4)(0);\n\ \n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ Image img3 = create_image_from_image2d(meanVari, 4);\n\ - __global float* bias_ptr = (__global float*)img1.ptr;\n\ - __global float* scal_ptr = (__global float*)img2.ptr;\n\ __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx);\n\ __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ \n\ - float bval = bias_ptr[gidz];\n\ - float sval = scal_ptr[gidz];\n\ + float sval = read_imagef(scale, coord_para.yx).x;\n\ + float bval = read_imagef(bias, coord_para.yx).x;\n\ \n\ for(int i = 0; i < group_num; i++)\n\ {\n\ mean_vari += vari_ptr[i];\n\ }\n\ \n\ - mean_vari *= dimRatio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = sval * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ + float4 tmpData0, tmpData1;\n\ bias_val = (bval - scale_vari * mean_vari.s0);\n\ \n\ for(; coord.y < endH; coord.y++)\n\ @@ -13402,7 +12385,7 @@ __kernel 
__attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 _viv_asm(COPY, tmpData0, src1, 16);\n\ _viv_asm(COPY, tmpData1, src2, 16);\n\ \n\ - vxc_float4 norm;\n\ + float4 norm;\n\ norm = scale_vari * tmpData0 + bias_val;\n\ _viv_asm(COPY, src0, norm, 16);\n\ norm = scale_vari * tmpData1 + bias_val;\n\ @@ -13410,558 +12393,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ -}"; /* end of instance_normalization_scale_f32_bf16_vx*/ - -static const char instance_normalization_scale_f32_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int height;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F32toF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ - Image img3 = create_image_from_image2d(meanVari, 4);\n\ - __global float* bias_ptr = (__global float*)img1.ptr;\n\ - __global float* scal_ptr = (__global float*)img2.ptr;\n\ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz));\n\ - __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ -\n\ - float bval = bias_ptr[gidz];\n\ - float sval = scal_ptr[gidz];\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += vari_ptr[i];\n\ - }\n\ -\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = sval * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - bias_val = (bval - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ -\n\ - coord_in.y ++;\n\ -\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ -\n\ - vxc_float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_vari * 
tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F32toF16_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int2 coord = (int2)(get_global_id(0), gidy);\n\ - int2 coord_para = (int2)(gidz, 0);\n\ - int endH = gidy + height;\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ - Image img3 = create_image_from_image2d(meanVari, 4);\n\ - __global float* bias_ptr = (__global float*)img1.ptr;\n\ - __global float* scal_ptr = (__global float*)img2.ptr;\n\ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx);\n\ - __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ -\n\ - float bval = bias_ptr[gidz];\n\ - float sval = scal_ptr[gidz];\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += vari_ptr[i];\n\ - }\n\ -\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = sval * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - bias_val = (bval - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - for(; coord.y < endH; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ -\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt16Fp32_4x4);\n\ - vxc_float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_vari * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of instance_normalization_scale_f32_f16_vx*/ - -static const char instance_normalization_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform int 
sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform float rowSumScale;\n\ -_viv_uniform float scale_inOut;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform int output_ZP;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8(\n\ - image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, gidz);\n\ - vxc_uchar16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, 0, \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);\n\ - }\n\ - sqr += (tmpSqr * e2InScale + rowSumScale);\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8_2D(\n\ - image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ -\n\ - int2 coord = (int2)(gidx, gidy);\n\ - vxc_uchar16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ - int endH = gidy + height;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ - if(gidx < width)\n\ - {\n\ - for(; coord.y < endH;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, 0,\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);\n\ - }\n\ - sqr += (tmpSqr * e2InScale + rowSumScale);\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ - sum = 0; sqr = 0;\n\ - for(int 
i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - vxc_uchar16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, 0,\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = scale_inOut * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, 0, \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8_2D(\n\ - image2d_array_t input, image2d_array_t 
bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int2 coord = (int2)(get_global_id(0), gidy);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - int endH = gidy + height;\n\ - vxc_uchar16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, 0,\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - float alpha = scale_inOut * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ -\n\ - for(; coord.y < endH; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = tmpData0 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData1 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of instance_normalization_u8_vx*/ - -static const char instance_normalization_u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = 
get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - vxc_uchar16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ - bias_f = read_imagef(bias, coord_para);\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - coord_para = coord;\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_para.z, baseAddr);\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_para.xy = coord.xy;\n\ - coord.y++;\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - coord_para.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16_2D(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int4 coord = (int4)(get_global_id(0), gidy, 0, 0);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - 
int endH = gidy + height;\n\ - vxc_uchar16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ - bias_f = read_imagef(bias, coord_para);\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para.yx);\n\ - coord_para.y += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - for(; coord.y < endH;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_para = coord;\n\ - coord.y++;\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_para.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -"; /* end of instance_normalization_u8_f16_vx*/ +}"; /* end of instance_normalization_3_vx*/ static const char l2normalizescale_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -14059,6 +12491,7 @@ _viv_uniform float zpSqrt16x;\n\ _viv_uniform VXC_512Bits uniSumAll_16x1;\n\ _viv_uniform int inputZP;\n\ \n\ +\n\ #define L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \\\n\ vxc_float4 rsqrt0;\\\n\ Image dst_img = create_image_from_image2d(output, 1); \\\n\ @@ -14108,31 +12541,31 @@ _viv_uniform int inputZP;\n\ dst_ptr[0] = dst.s0; \\\n\ break; \\\n\ case 2: \\\n\ - VXC_Vstore2(dst_ptr, 0, dst); \\\n\ + VXC_Vstore2(dst_ptr, 0, dst.s01); \\\n\ break; \\\n\ case 3: \\\n\ - VXC_Vstore3(dst_ptr, 0, dst); \\\n\ + VXC_Vstore3(dst_ptr, 0, dst.s012); \\\n\ break; \\\n\ case 4: \\\n\ - VXC_Vstore4(dst_ptr, 0, dst); \\\n\ + VXC_Vstore4(dst_ptr, 0, dst.s0123); \\\n\ break; \\\n\ case 5: \\\n\ - VXC_Vstore2(dst_ptr, 0, dst); \\\n\ + VXC_Vstore2(dst_ptr, 0, 
dst.s01); \\\n\ dst.s012 = dst.s234; \\\n\ dst_ptr += 2; \\\n\ - VXC_Vstore3(dst_ptr, 0, dst); \\\n\ + VXC_Vstore3(dst_ptr, 0, dst.s012); \\\n\ break; \\\n\ case 6: \\\n\ - VXC_Vstore3(dst_ptr, 0, dst); \\\n\ + VXC_Vstore3(dst_ptr, 0, dst.s012); \\\n\ dst.s012 = dst.s345; \\\n\ dst_ptr += 3; \\\n\ - VXC_Vstore3(dst_ptr, 0, dst); \\\n\ + VXC_Vstore3(dst_ptr, 0, dst.s012); \\\n\ break; \\\n\ case 7: \\\n\ - VXC_Vstore4(dst_ptr, 0, dst); \\\n\ + VXC_Vstore4(dst_ptr, 0, dst.s0123); \\\n\ dst.s012 = dst.s456; \\\n\ dst_ptr += 4; \\\n\ - VXC_Vstore3(dst_ptr, 0, dst); \\\n\ + VXC_Vstore3(dst_ptr, 0, dst.s012); \\\n\ break; \\\n\ default: \\\n\ VXC_Vstore8(dst_ptr, 0, dst); \\\n\ @@ -14142,16 +12575,13 @@ _viv_uniform int inputZP;\n\ } \\\n\ \n\ \n\ -#define L2NORMSCALE_AXIS0_2D(in0_name, in1_name, out_name, read_type, read_type2, src_type, INPUTSCALE, \\\n\ +#define L2NORMSCALE_AXIS0(in0_name, in1_name, out_name, read_type, read_type2, src_type, INPUTSCALE, \\\n\ dst_type, convert_type, output_type, copy_type) \\\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\ - void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \\\n\ + void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name \\\n\ (\\\n\ - __read_only image2d_t input,\\\n\ - __read_only image2d_t scale,\\\n\ - __write_only image2d_t output,\\\n\ - int axis\\\n\ - )\\\n\ + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output,\\\n\ + int axis )\\\n\ { \\\n\ int lidx = get_local_id(0); \\\n\ int offset = get_global_id(0); \\\n\ @@ -14201,19 +12631,15 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\ L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \\\n\ }\n\ \n\ -L2NORMSCALE_AXIS0_2D(F16, F16, F16, ushort, vxc_ushort8, vxc_half8, 1, \\\n\ +L2NORMSCALE_AXIS0(F16, F16, F16, ushort, vxc_ushort8, vxc_half8, 1, \\\n\ ushort, half4, vxc_half8, vxc_ushort8)\n\ \n\ -#define L2NORMSCALE_AXIS0_QNT_2D(in0_name, in1_name, out_name,\\\n\ +#define L2NORMSCALE_AXIS0_QNT(in0_name, in1_name, out_name,\\\n\ src_type, src_scalar_type, dst_type, convert_type, output_type, copy_type) \\\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\ -void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \\\n\ +void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name \\\n\ (\\\n\ - __read_only image2d_t input,\\\n\ - __read_only image2d_t scale,\\\n\ - __write_only image2d_t output,\\\n\ - int axis\\\n\ - )\\\n\ + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output, int axis)\\\n\ { \\\n\ int lidx = get_local_id(0); \\\n\ int offset = get_global_id(0); \\\n\ @@ -14267,14 +12693,223 @@ void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \\\n\ L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \\\n\ }\n\ \n\ -L2NORMSCALE_AXIS0_QNT_2D(U8, F16, F16, vxc_uchar8, uchar, ushort, half4, vxc_half8, vxc_ushort8)\n\ -L2NORMSCALE_AXIS0_QNT_2D(U8, F16, U8, vxc_uchar8, uchar, uchar, int4, vxc_uchar8, vxc_uchar8)\n\ -L2NORMSCALE_AXIS0_QNT_2D(I8, F16, F16, vxc_char8, char, ushort, half4, vxc_half8, vxc_ushort8)\n\ -L2NORMSCALE_AXIS0_QNT_2D(I8, F16, I8, vxc_char8, char, char, int4, vxc_char8, vxc_char8)\n\ -L2NORMSCALE_AXIS0_QNT_2D(I16, F16, F16, vxc_short8, short, ushort, half4, vxc_half8, vxc_ushort8)\n\ -L2NORMSCALE_AXIS0_QNT_2D(I16, F16, I16, vxc_short8, short, short, int4, vxc_short8, vxc_short8)\n\ +L2NORMSCALE_AXIS0_QNT(U8, F16, F16, vxc_uchar8, 
uchar, ushort, half4, vxc_half8, vxc_ushort8)\n\ +L2NORMSCALE_AXIS0_QNT(U8, F16, U8, vxc_uchar8, uchar, uchar, int4, vxc_uchar8, vxc_uchar8)\n\ +L2NORMSCALE_AXIS0_QNT(I8, F16, F16, vxc_char8, char, ushort, half4, vxc_half8, vxc_ushort8)\n\ +L2NORMSCALE_AXIS0_QNT(I8, F16, I8, vxc_char8, char, char, int4, vxc_char8, vxc_char8)\n\ +L2NORMSCALE_AXIS0_QNT(I16, F16, F16, vxc_short8, short, ushort, half4, vxc_half8, vxc_ushort8)\n\ +L2NORMSCALE_AXIS0_QNT(I16, F16, I16, vxc_short8, short, short, int4, vxc_short8, vxc_short8)\n\ "; /* end of l2normalizescale_axis0_vx*/ +static const char l2normalizescale_axis0_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int inputWidth;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float zP2x;\n\ +_viv_uniform int inputZP;\n\ +\n\ +_viv_uniform float inOutScale;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform float zpSqr8x;\n\ +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1)))\n\ + void l2normalizescale_axis0_F16_F16toF16_2D(\n\ + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output, int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int lidx = get_local_id(0);\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_half8 in_h, scale_h, tmpDst;\n\ + float sum = 0;\n\ + vxc_float4 scale_f0, scale_f1, sumsqr, tmpData0, tmpData1;\n\ + __local float lcl_sum[16];\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + for(; coord.x < inputWidth;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 128;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), \\\n\ + uniFp16SumSqr_dp8x2);\n\ + sum += sumsqr.y;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0];\n\ + float4 data0;\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3];\n\ + sum = dot(data0, one);\n\ + float alpha = rsqrt(sum);\n\ +\n\ + for(coord.x = get_global_id(0); coord.x < inputWidth; coord.x += 128)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, scale, coord.xz, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4);\n\ +\n\ + half4 tmpVal0, tmpVal1;\n\ + tmpData0 *= scale_f0 * alpha;\n\ + tmpData1 *= scale_f1 * alpha;\n\ + _viv_asm(CONV, tmpVal0, tmpData0);\n\ + 
_viv_asm(CONV, tmpVal1, tmpData1);\n\ + VXC_DP2x8(tmpDst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, dst, tmpDst, 16);\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +\n\ +#define L2NORMSCALE_QINTF16TOQINT_AXIS0_2D(in0_name, in1_name, out_name, read_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\ + void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D( \\\n\ + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output, int axis)\\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + int lidx = get_local_id(0); \\\n\ + read_type src0, dst; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float sum = 0; \\\n\ + vxc_float4 scale_f0, scale_f1, sumsqr; \\\n\ + __local float lcl_sum[16]; \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + for(; coord.x < inputWidth;) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 128; \\\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), \\\n\ + uniInt16SumSqr_dp8x2); \\\n\ + sum += sumsqr.y - zP2x * sumsqr.x + zpSqr8x; \\\n\ + } \\\n\ + sum *= e2InScale; \\\n\ + lcl_sum[lidx] = sum; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \\\n\ + float4 data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \\\n\ + sum = dot(data0, one); \\\n\ + float alpha = rsqrt(sum) * inOutScale; \\\n\ + short zp = inputZP; \\\n\ + vxc_float4 tmpData0, tmpData1; \\\n\ + for(coord.x = get_global_id(0); coord.x < inputWidth; coord.x += 128) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, scale, coord.xz, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4); \\\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4); \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4); \\\n\ + \\\n\ + int4 tmpVal0 = convert_int4_rte(tmpData0 * scale_f0 * alpha + output_ZP); \\\n\ + int4 tmpVal1 = convert_int4_rte(tmpData1 * scale_f1 * alpha + output_ZP); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +\n\ +L2NORMSCALE_QINTF16TOQINT_AXIS0_2D(U8, F16, U8, vxc_uchar8)\n\ +L2NORMSCALE_QINTF16TOQINT_AXIS0_2D(I8, F16, I8, vxc_char8)\n\ +L2NORMSCALE_QINTF16TOQINT_AXIS0_2D(I16, F16, I16, vxc_short8)\n\ +\n\ +#define L2NORMSCALE_QINTF16TOF16_AXIS0_2D(in0_name, in1_name, out_name, read_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\ + void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D( \\\n\ + __read_only image2d_t input, __read_only image2d_t scale, __write_only image2d_t output, 
int axis)\\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + int lidx = get_local_id(0); \\\n\ + read_type src0; \\\n\ + vxc_short8 src1, dst; \\\n\ + vxc_half8 scale_h, tmpDst; \\\n\ + float sum = 0; \\\n\ + vxc_float4 scale_f0, scale_f1, sumsqr, tmpData0, tmpData1; \\\n\ + __local float lcl_sum[16]; \\\n\ + float4 one = (float4)(1, 1, 1, 1); \\\n\ + for(; coord.x < inputWidth;) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 128; \\\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), \\\n\ + uniInt16SumSqr_dp8x2); \\\n\ + sum += sumsqr.y - zP2x * sumsqr.x + zpSqr8x; \\\n\ + } \\\n\ + sum *= e2InScale; \\\n\ + lcl_sum[lidx] = sum; \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + float4 *pLocalPtr = (float4 *)&lcl_sum[0]; \\\n\ + float4 data0; \\\n\ + data0 = pLocalPtr[0] + pLocalPtr[1] + pLocalPtr[2] + pLocalPtr[3]; \\\n\ + sum = dot(data0, one); \\\n\ + float alpha = rsqrt(sum) * inOutScale; \\\n\ + short zp = inputZP; \\\n\ + for(coord.x = get_global_id(0); coord.x < inputWidth; coord.x += 128) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, scale, coord.xz, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4); \\\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertSecFp16Fp32_4x4); \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4); \\\n\ + \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + tmpData0 *= scale_f0 * alpha; \\\n\ + tmpData1 *= scale_f1 * alpha; \\\n\ + _viv_asm(CONV, tmpVal0, tmpData0); \\\n\ + _viv_asm(CONV, tmpVal1, tmpData1); \\\n\ + VXC_DP2x8(tmpDst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8); \\\n\ + _viv_asm(COPY, dst, tmpDst, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +\n\ +L2NORMSCALE_QINTF16TOF16_AXIS0_2D(U8, F16, F16, vxc_uchar8)\n\ +L2NORMSCALE_QINTF16TOF16_AXIS0_2D(I8, F16, F16, vxc_char8)\n\ +L2NORMSCALE_QINTF16TOF16_AXIS0_2D(I16, F16, F16, vxc_short8)\n\ +"; /* end of l2normalizescale_axis0_2d_vx*/ + static const char l2normalizescale_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ /********************************************L2NormalizeScale*****************************************/\n\ @@ -14444,1208 +13079,1131 @@ L2NORMSCALE_AXIS1_QNT_2D(I16, F16, F16, vxc_short8, vxc_short8, vxc_half8, h L2NORMSCALE_AXIS1_QNT_2D(I16, F16, I16, vxc_short8, vxc_short8, vxc_short8, int4, vxc_short8)\n\ "; /* end of l2normalizescale_axis1_vx*/ -static const char layer_normalization_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char layer_normalization_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -/**************************layernorm float16***********************************/\n\ +_viv_uniform VXC_512Bits uniSumX_16x1;\n\ +_viv_uniform VXC_512Bits uniSumX2_16x1;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform 
VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform int width;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;\n\ -\n\ -__kernel void layer_norm_F16toF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_out = coord;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - vxc_short8 src0, src1;\n\ - vxc_float sum = 0, sqr = 0;\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.z, baseAddr);\n\ -\n\ - for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ - {\n\ - vxc_half8 val0_h;\n\ - _viv_asm(COPY, val0_h, src0, 16);\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr += sumsqr.y;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio;\n\ - vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 bias_f;\n\ - for(coord.x = 0; coord.x < width; coord.x += 4)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord.xw);\n\ - vxc_half8 in_h, scale_h;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - vxc_float4 in_f, scale_f;\n\ - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - vxc_float4 sub, norm;\n\ - sub = in_f - mean;\n\ - norm = scale_f * vari * sub + bias_f;\n\ - half4 norm_h;\n\ - _viv_asm(CONV, norm_h, norm);\n\ - vxc_half8 dst;\n\ - VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniExtractHalf4_dp4x4);\n\ - vxc_short8 dstval;\n\ - _viv_asm(COPY, dstval, dst, 16);\n\ - coord_out.x = coord.x;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out, dstval, \\\n\ - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -/*****************************layernorm uint8 to uint8****************************/\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ -_viv_uniform float 
input_scale;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform float outputScale;\n\ +_viv_uniform float inv_multiplier;\n\ +_viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ -_viv_uniform int sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform int tmpZp2;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ \n\ -__kernel void layer_norm_U8toU8(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_out = coord;\n\ +#define CONV2F32(dst, src, section) \\\n\ + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataToFP32_##section##_4x4);\n\ \n\ - vxc_uchar16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float sum = 0, sqr = 0;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - vxc_int4 tmpSum1;\n\ - vxc_int4 tmpSqr1;\n\ - short zp = inputZP;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.z, baseAddr);\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1.x);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ - }\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ -\n\ - float mean, vari;\n\ - mean = sum * dimRatio;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - int2 coord_bias = (int2)(0, 0);\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.x = coord.x;\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ -\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - tmpData0 *= input_scale;\n\ - tmpData1 *= input_scale;\n\ - tmpData2 *= input_scale;\n\ - tmpData3 *= input_scale;\n\ -\n\ - vxc_float4 norm;\n\ - tmpData0 -= mean;\n\ - norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - coord_bias.x += 4;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - tmpData1 -= mean;\n\ - norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ -\n\ - tmpData2 -= mean;\n\ - norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - tmpData3 -= mean;\n\ - norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - coord_out.x = coord.x;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \\\n\ - VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ +#define LAYER_NORM_8BITS_IMPL(name, src_type) \\\n\ +__kernel void layer_norm_axis0_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type src0, dst; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float sum = 0, sqr = 0; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + uint2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.z, baseAddr); \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 16; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = convert_float2(_sums) * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + int4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \\\n\ + int2 coord_bias = (int2)(0, 0); \\\n\ + \\\n\ + for(coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, 
coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_bias.x = coord.x; \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + CONV2F32(scale_f1, scale_h, 1); \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + \\\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + CONV2F32(tmpData2, src0, 2); \\\n\ + CONV2F32(tmpData3, src0, 3); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + coord_bias.x += 4; \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + CONV2F32(scale_f1, scale_h, 1); \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + \\\n\ + tmpData2 = tmpData2 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + \\\n\ + tmpData3 = tmpData3 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + coord_out.x = coord.x; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ }\n\ -/***************************layernorm float16 to uint8**************************/\n\ -__kernel void layer_norm_F16toU8(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_out = coord;\n\ +LAYER_NORM_8BITS_IMPL(U8_F16toU8, vxc_uchar16)\n\ +LAYER_NORM_8BITS_IMPL(I8_F16toI8, vxc_char16)\n\ \n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ +#define LAYER_NORM_SUMS_2D() \\\n\ + uint2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 16; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = convert_float2(_sums) * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y);\n\ \n\ - vxc_short8 src0, src1;\n\ - vxc_float sum = 0, sqr = 0;\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 
0, VXC_RM_TowardZero, 0));\n\ +#define LAYER_NORM_8BITS_IMPL_2D(name, src_type) \\\n\ +__kernel void layer_norm_axis0_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \\\n\ + \\\n\ + src_type src0, dst; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float sum = 0, sqr = 0; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + \\\n\ + LAYER_NORM_SUMS_2D(); \\\n\ + \\\n\ + int4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \\\n\ + int2 coord_bias = (int2)(0, 0); \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_bias.x = coord.x; \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + CONV2F32(scale_f1, scale_h, 1); \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + \\\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + CONV2F32(tmpData2, src0, 2); \\\n\ + CONV2F32(tmpData3, src0, 3); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + coord_bias.x += 4; \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + CONV2F32(scale_f1, scale_h, 1); \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + \\\n\ + tmpData2 = tmpData2 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + \\\n\ + tmpData3 = tmpData3 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_8BITS_IMPL_2D(U8_F16toU8, vxc_uchar16)\n\ +LAYER_NORM_8BITS_IMPL_2D(I8_F16toI8, vxc_char16)\n\ \n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.z, baseAddr);\n\ -\n\ - for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ - {\n\ - vxc_half8 val0_h;\n\ - _viv_asm(COPY, val0_h, src0, 16);\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - 
uniFp16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr += sumsqr.y;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio;\n\ - vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 bias_f;\n\ - for(coord.x = 0; coord.x < width; coord.x += 4)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord.xw);\n\ - vxc_half8 in_h, scale_h;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - vxc_float4 in_f, scale_f;\n\ - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - vxc_float4 sub, norm;\n\ - sub = in_f - mean;\n\ - norm = scale_f * vari * sub + bias_f;\n\ - norm = norm * outputScale + output_zp;\n\ - int4 output_int4;\n\ - output_int4 = convert_int4_rte(norm);\n\ - vxc_uchar8 dst;\n\ - VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\n\ - uniConvertInt32toUint8_2x8);\n\ - coord_out.x = coord.x;\n\ +#define LAYER_NORM_8TOF16_IMPL(name, src_type) \\\n\ +__kernel void layer_norm_axis0_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type src0; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float sum = 0, sqr = 0; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + uint2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.z, baseAddr); \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 16; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = convert_float2(_sums) * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \\\n\ + int2 coord_bias = (int2)(0, 0); \\\n\ + \\\n\ + vxc_short8 dst; \\\n\ + vxc_half8 result; \\\n\ + for(coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_bias.x = coord.x; \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + 
CONV2F32(scale_f1, scale_h, 1); \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + \\\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + CONV2F32(tmpData2, src0, 2); \\\n\ + CONV2F32(tmpData3, src0, 3); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + coord_bias.x += 4; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + CONV2F32(scale_f1, scale_h, 1); \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + coord_out.x = coord.x; \\\n\ VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, \\\n\ - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of layer_normalization_vx*/ - -static const char layer_normalization_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpData2 = tmpData2 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + \\\n\ + tmpData3 = tmpData3 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + coord_out.x += 8; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_8TOF16_IMPL(U8_F16toF16, vxc_uchar16)\n\ +LAYER_NORM_8TOF16_IMPL(I8_F16toF16, vxc_char16)\n\ \n\ -/**************************layernorm float16***********************************/\n\ +#define LAYER_NORM_8TOF16_IMPL_2D(name, src_type) \\\n\ +__kernel void layer_norm_axis0_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \\\n\ + \\\n\ + src_type src0; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float sum = 0, sqr = 0; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + \\\n\ + LAYER_NORM_SUMS_2D(); \\\n\ + \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \\\n\ + int2 coord_bias = (int2)(0, 0); \\\n\ + \\\n\ + vxc_short8 dst; \\\n\ + vxc_half8 result; \\\n\ + for (coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_bias.x = coord.x; \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + CONV2F32(scale_f1, scale_h, 1); \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, 
coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + \\\n\ + VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + CONV2F32(tmpData2, src0, 2); \\\n\ + CONV2F32(tmpData3, src0, 3); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + coord_bias.x += 4; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + CONV2F32(scale_f1, scale_h, 1); \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + \\\n\ + tmpData2 = tmpData2 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + \\\n\ + tmpData3 = tmpData3 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x -= 8; \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_8TOF16_IMPL_2D(U8_F16toF16, vxc_uchar16)\n\ +LAYER_NORM_8TOF16_IMPL_2D(I8_F16toF16, vxc_char16)\n\ +"; /* end of layer_normalization_0_vx*/ + +static const char layer_normalization_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniSum_X_X2_8x2;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform int width;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;\n\ -\n\ -__kernel void layer_norm_F16toF16_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale,\n\ - image2d_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_short8 src0, src1;\n\ - vxc_float sum = 0, sqr = 0;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ - {\n\ - vxc_half8 val0_h;\n\ - _viv_asm(COPY, val0_h, src0, 16);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr += sumsqr.y;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio;\n\ - vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 bias_f;\n\ - for(coord.x = 0; coord.x < width; coord.x += 4)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - 
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, coord.xw);\n\ - vxc_half8 in_h, scale_h;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - vxc_float4 in_f, scale_f;\n\ - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - vxc_float4 sub, norm;\n\ - sub = in_f - mean;\n\ - norm = scale_f * vari * sub + bias_f;\n\ - half4 norm_h;\n\ - _viv_asm(CONV, norm_h, norm);\n\ - vxc_half8 dst;\n\ - VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniExtractHalf4_dp4x4);\n\ - vxc_short8 dstval;\n\ - _viv_asm(COPY, dstval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, dstval, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -/*****************************layernorm uint8 to uint8****************************/\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform float outputScale;\n\ +_viv_uniform float inv_multiplier;\n\ +_viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ -_viv_uniform int sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform int tmpZp2;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ \n\ -__kernel void layer_norm_U8toU8_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale,\n\ - image2d_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_uchar16 src0, src2;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float sum = 0, sqr = 0;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - vxc_int4 tmpSum1;\n\ - vxc_int4 tmpSqr1;\n\ - short zp = inputZP;\n\ +#define CONV2F32(dst, src, section) \\\n\ + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataToFP32_##section##_4x4);\n\ \n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1.x);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ - }\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ -\n\ - float mean, vari;\n\ - mean = sum * dimRatio;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - int2 coord_bias = (int2)(0, 0);\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.x = coord.x;\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ -\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - tmpData0 = tmpData0 * input_scale - mean;\n\ - tmpData1 = tmpData1 * input_scale - mean;\n\ - tmpData2 = tmpData2 * input_scale - mean;\n\ - tmpData3 = tmpData3 * input_scale - mean;\n\ -\n\ - vxc_float4 norm;\n\ - norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - coord_bias.x += 4;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ -\n\ - norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ +#define LAYER_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \\\n\ +__kernel void layer_norm_axis0_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + vxc_short8 in0; \\\n\ + src_type src0; \\\n\ + copy_type dst; \\\n\ + vxc_short8 src1; \\\n\ + dst_type result; \\\n\ + vxc_half8 scale_h; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + float2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = 
(int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.z, baseAddr); \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, in0, 16); \\\n\ + coord.x += 8; \\\n\ + VXC_DP8x2(sum_x_x2, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = _sums * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1; \\\n\ + int2 coord_bias = (int2)(0, 0); \\\n\ + \\\n\ + for(coord.x = 0; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, in0, 16); \\\n\ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_bias.x = coord.x; \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + CONV2F32(scale_f1, scale_h, 1); \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + \\\n\ + coord_out.x = coord.x; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ }\n\ -/***************************layernorm float16 to uint8**************************/\n\ -__kernel void layer_norm_F16toU8_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale,\n\ - image2d_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_short8 src0, src1;\n\ - vxc_float sum = 0, sqr = 0;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ - {\n\ - vxc_half8 val0_h;\n\ - _viv_asm(COPY, val0_h, src0, 16);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr += sumsqr.y;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio;\n\ - vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 bias_f;\n\ - for(coord.x = 0; coord.x < width; coord.x += 4)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = read_imagef(bias, 
coord.xw);\n\ - vxc_half8 in_h, scale_h;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - vxc_float4 in_f, scale_f;\n\ - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - vxc_float4 sub, norm;\n\ - sub = in_f - mean;\n\ - norm = scale_f * vari * sub + bias_f;\n\ - norm = norm * outputScale + output_zp;\n\ - int4 output_int4;\n\ - output_int4 = convert_int4_rte(norm);\n\ - vxc_uchar8 dst;\n\ - VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ +LAYER_NORM_16BITS_IMPL(F16_F16toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +LAYER_NORM_16BITS_IMPL(F16_F16toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +LAYER_NORM_16BITS_IMPL(F16_F16toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +LAYER_NORM_16BITS_IMPL(F16_F16toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +LAYER_NORM_16BITS_IMPL(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +LAYER_NORM_16BITS_IMPL(I16_F16toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +\n\ +#define LAYER_NORM_16BITS_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \\\n\ +__kernel void layer_norm_axis0_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \\\n\ + \\\n\ + vxc_short8 in0; \\\n\ + src_type src0; \\\n\ + copy_type dst; \\\n\ + dst_type result; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + float2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_ReadImage(in0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, in0, 16); \\\n\ + coord.x += 8; \\\n\ + VXC_DP8x2(sum_x_x2, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = _sums * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1; \\\n\ + int2 coord_bias = (int2)(0, 0); \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_ReadImage(in0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, in0, 16); \\\n\ + VXC_ReadImage(src1, scale, coord.xw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_bias.x = coord.x; \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + CONV2F32(scale_f0, scale_h, 0); \\\n\ + CONV2F32(scale_f1, scale_h, 1); \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + coord_bias.x += 4; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; 
\\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ }\n\ -"; /* end of layer_normalization_2d_vx*/ +LAYER_NORM_16BITS_IMPL_2D(F16_F16toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +LAYER_NORM_16BITS_IMPL_2D(F16_F16toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +LAYER_NORM_16BITS_IMPL_2D(F16_F16toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +LAYER_NORM_16BITS_IMPL_2D(F16_F16toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +LAYER_NORM_16BITS_IMPL_2D(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +LAYER_NORM_16BITS_IMPL_2D(I16_F16toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +\n\ +#define LAYER_NORM_16_32_IMPL(name, src_type, dst_type, copy_type, conv_type) \\\n\ +__kernel void layer_norm_axis0_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + vxc_short8 in0; \\\n\ + src_type src0; \\\n\ + copy_type dst; \\\n\ + dst_type result; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + float2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.z, baseAddr); \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, in0, 16); \\\n\ + coord.x += 8; \\\n\ + VXC_DP8x2(sum_x_x2, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = _sums * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1; \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(bias, 4); \\\n\ + Image img2 = create_image_from_image2d(scale, 4); \\\n\ + __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ + __global float* scale_ptr = (__global float*)img2.ptr; \\\n\ + for(coord.x = 0; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, in0, 16); \\\n\ + bias_f0 = vload4(0, bias_ptr); \\\n\ + bias_f1 = vload4(1, bias_ptr); \\\n\ + scale_f0 = vload4(0, scale_ptr); \\\n\ + scale_f1 = vload4(1, scale_ptr); \\\n\ + bias_ptr += 8; \\\n\ + scale_ptr += 8; \\\n\ + \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + 
_viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + \\\n\ + coord_out.x = coord.x; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_16_32_IMPL(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +LAYER_NORM_16_32_IMPL(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +LAYER_NORM_16_32_IMPL(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +LAYER_NORM_16_32_IMPL(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +LAYER_NORM_16_32_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +LAYER_NORM_16_32_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +\n\ +#define LAYER_NORM_16_32_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \\\n\ +__kernel void layer_norm_axis0_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \\\n\ + \\\n\ + vxc_short8 in0; \\\n\ + src_type src0; \\\n\ + copy_type dst; \\\n\ + dst_type result; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + float2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_ReadImage(in0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, in0, 16); \\\n\ + coord.x += 8; \\\n\ + VXC_DP8x2(sum_x_x2, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniSum_X_X2_8x2); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = _sums * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1; \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(bias, 4); \\\n\ + Image img2 = create_image_from_image2d(scale, 4); \\\n\ + __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ + __global float* scale_ptr = (__global float*)img2.ptr; \\\n\ + for (coord.x = 0; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_ReadImage(in0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, in0, 16); \\\n\ + bias_f0 = vload4(0, bias_ptr); \\\n\ + bias_f1 = vload4(1, bias_ptr); \\\n\ + scale_f0 = vload4(0, scale_ptr); \\\n\ + scale_f1 = vload4(1, scale_ptr); \\\n\ + bias_ptr += 8; \\\n\ + scale_ptr += 8; \\\n\ + \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.xy, dst, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_16_32_IMPL_2D(F16_F32toF16, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +LAYER_NORM_16_32_IMPL_2D(F16_F32toI16, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +LAYER_NORM_16_32_IMPL_2D(F16_F32toI8, vxc_half8, vxc_char8, vxc_char8, int4)\n\ +LAYER_NORM_16_32_IMPL_2D(F16_F32toU8, vxc_half8, vxc_uchar8, vxc_uchar8, int4)\n\ +LAYER_NORM_16_32_IMPL_2D(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +LAYER_NORM_16_32_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +"; /* end of layer_normalization_1_vx*/ -static const char layer_normalization_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char layer_normalization_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -/**************************layernorm float16***********************************/\n\ +_viv_uniform VXC_512Bits uniSumX_16x1;\n\ +_viv_uniform VXC_512Bits uniSumX2_16x1;\n\ +_viv_uniform VXC_512Bits uniSum_X_X2_8x2;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_2_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_3_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform int width;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform float dimRatio_scale;\n\ -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform float outputScale;\n\ +_viv_uniform float inv_multiplier;\n\ +_viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ \n\ -__kernel void layer_norm_I16toI16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_in = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ +#define CONV2F32(dst, src, section) \\\n\ + VXC_DP4x4(dst, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataToFP32_##section##_4x4);\n\ \n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_float sum = 0, sqr = 0;\n\ - for(; coord_in.x < width;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x += 8;\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr = sqr + sumsqr.y * e2InScale;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio_scale;\n\ - vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, 
scale_f1;\n\ - vxc_half8 scale_h;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - int2 coord_bias = (int2)(0, 0);\n\ -\n\ - for(coord_in.x = 0; coord_in.x < width; coord_in.x += 8, coord.x += 8)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.x = coord_in.x;\n\ - VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - vxc_float4 sub, norm;\n\ - sub = tmpData0 * input_scale - mean;\n\ - norm = scale_f0 * vari * sub + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - sub = tmpData1 * input_scale - mean;\n\ - norm = scale_f1 * vari * sub + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, dst, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ +#define LAYER_NORM_8_32_IMPL(name, src_type) \\\n\ +__kernel void layer_norm_axis0_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type src0, dst; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float sum = 0, sqr = 0; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + uint2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.z, baseAddr); \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 16; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = convert_float2(_sums) * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + int4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(bias, 4); \\\n\ + Image img2 = create_image_from_image2d(scale, 4); 
\\\n\ + __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ + __global float* scale_ptr = (__global float*)img2.ptr; \\\n\ + for(coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f0 = vload4(0, bias_ptr); \\\n\ + bias_f1 = vload4(1, bias_ptr); \\\n\ + scale_f0 = vload4(0, scale_ptr); \\\n\ + scale_f1 = vload4(1, scale_ptr); \\\n\ + \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + CONV2F32(tmpData2, src0, 2); \\\n\ + CONV2F32(tmpData3, src0, 3); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + bias_f0 = vload4(2, bias_ptr); \\\n\ + scale_f0 = vload4(2, scale_ptr); \\\n\ + \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + bias_f1 = vload4(3, bias_ptr); \\\n\ + scale_f1 = vload4(3, scale_ptr); \\\n\ + bias_ptr += 16; \\\n\ + scale_ptr += 16; \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + \\\n\ + tmpData2 = tmpData2 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + \\\n\ + tmpData3 = tmpData3 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + coord_out.x = coord.x; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ }\n\ +LAYER_NORM_8_32_IMPL(U8_F32toU8, vxc_uchar16)\n\ +LAYER_NORM_8_32_IMPL(I8_F32toI8, vxc_char16)\n\ \n\ -__kernel void layer_norm_I16toI16_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale,\n\ - image2d_t output, float eps)\n\ -{\n\ - int2 coord = (int2)(0, get_global_id(1));\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_float sum = 0, sqr = 0;\n\ - for(; coord.x < width;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr = sqr + sumsqr.y * e2InScale;\n\ - }\n\ - vxc_float mean, vari;\n\ - mean = sum * dimRatio_scale;\n\ - vari = sqr * dimRatio - mean * mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_half8 scale_h;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - int2 coord_bias = (int2)(0, 0);\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 8)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.x = coord.x;\n\ - VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ 
- uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - vxc_float4 sub, norm;\n\ - sub = tmpData0 * input_scale - mean;\n\ - norm = scale_f0 * vari * sub + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - sub = tmpData1 * input_scale - mean;\n\ - norm = scale_f1 * vari * sub + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ +#define LAYER_NORM_8_32_IMPL_2D(name, src_type) \\\n\ +__kernel void layer_norm_axis0_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \\\n\ + \\\n\ + src_type src0, dst; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float sum = 0, sqr = 0; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + uint2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 16; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = convert_float2(_sums) * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + int4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(bias, 4); \\\n\ + Image img2 = create_image_from_image2d(scale, 4); \\\n\ + __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ + __global float* scale_ptr = (__global float*)img2.ptr; \\\n\ + for (coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f0 = vload4(0, bias_ptr); \\\n\ + bias_f1 = vload4(1, bias_ptr); \\\n\ + scale_f0 = vload4(0, scale_ptr); \\\n\ + scale_f1 = vload4(1, scale_ptr); \\\n\ + \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + CONV2F32(tmpData2, src0, 2); \\\n\ + CONV2F32(tmpData3, src0, 3); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + bias_f0 = vload4(2, bias_ptr); \\\n\ + scale_f0 = vload4(2, scale_ptr); \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + \\\n\ + bias_f1 = vload4(3, bias_ptr); \\\n\ + scale_f1 = vload4(3, scale_ptr); \\\n\ + bias_ptr += 16; \\\n\ + scale_ptr += 16; \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + 
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + \\\n\ + tmpData2 = tmpData2 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + \\\n\ + tmpData3 = tmpData3 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ }\n\ -"; /* end of layer_normalization_i16_vx*/ +LAYER_NORM_8_32_IMPL_2D(U8_F32toU8, vxc_uchar16)\n\ +LAYER_NORM_8_32_IMPL_2D(I8_F32toI8, vxc_char16)\n\ +\n\ +#define LAYER_NORM_8_32TOF16_IMPL(name, src_type) \\\n\ +__kernel void layer_norm_axis0_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type src0; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float sum = 0, sqr = 0; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + uint2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.z, baseAddr); \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 16; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = convert_float2(_sums) * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \\\n\ + \\\n\ + vxc_short8 dst; \\\n\ + vxc_half8 result; \\\n\ + Image img1 = create_image_from_image2d(bias, 4); \\\n\ + Image img2 = create_image_from_image2d(scale, 4); \\\n\ + __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ + __global float* scale_ptr = (__global float*)img2.ptr; \\\n\ + for(coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f0 = vload4(0, bias_ptr); \\\n\ + bias_f1 = vload4(1, bias_ptr); \\\n\ + scale_f0 = vload4(0, scale_ptr); \\\n\ + scale_f1 = vload4(1, scale_ptr); \\\n\ + \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + CONV2F32(tmpData2, src0, 2); \\\n\ + CONV2F32(tmpData3, src0, 3); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + bias_f0 = vload4(2, bias_ptr); \\\n\ + scale_f0 = vload4(2, scale_ptr); \\\n\ + _viv_asm(CONV, 
tmpVal0, norm); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + \\\n\ + bias_f1 = vload4(3, bias_ptr); \\\n\ + scale_f1 = vload4(3, scale_ptr); \\\n\ + bias_ptr += 16; \\\n\ + scale_ptr += 16; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + coord_out.x = coord.x; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpData2 = tmpData2 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + \\\n\ + tmpData3 = tmpData3 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + coord_out.x += 8; \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_8_32TOF16_IMPL(U8_F32toF16, vxc_uchar16)\n\ +LAYER_NORM_8_32TOF16_IMPL(I8_F32toF16, vxc_char16)\n\ +\n\ +#define LAYER_NORM_8_32TOF16_IMPL_2D(name, src_type) \\\n\ +__kernel void layer_norm_axis0_##name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t bias, \\\n\ + __read_only image2d_t scale, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0); \\\n\ + \\\n\ + src_type src0; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h; \\\n\ + float sum = 0, sqr = 0; \\\n\ + float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + uint2 _sums = 0, sum_x_x2; \\\n\ + \\\n\ + for (coord.x = 0; coord.x < width; ) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 16; \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); \\\n\ + VXC_DP16x1(sum_x_x2, src0, src0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); \\\n\ + _sums = _sums + sum_x_x2; \\\n\ + } \\\n\ + \\\n\ + float2 sums = convert_float2(_sums) * inv_multiplier; \\\n\ + \\\n\ + sums.y = sums.y - sums.x * sums.x + eps; \\\n\ + sums.y = rsqrt(sums.y); \\\n\ + half4 tmpVal0, tmpVal1; \\\n\ + float4 tmpData0, tmpData1, tmpData2, tmpData3; \\\n\ + \\\n\ + vxc_short8 dst; \\\n\ + vxc_half8 result; \\\n\ + Image img1 = create_image_from_image2d(bias, 4); \\\n\ + Image img2 = create_image_from_image2d(scale, 4); \\\n\ + __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ + __global float* scale_ptr = (__global float*)img2.ptr; \\\n\ + for (coord.x = 0; coord.x < width; coord.x += 16) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f0 = vload4(0, bias_ptr); \\\n\ + bias_f1 = vload4(1, bias_ptr); \\\n\ + scale_f0 = vload4(0, scale_ptr); \\\n\ + scale_f1 = vload4(1, scale_ptr); \\\n\ + bias_ptr += 8; \\\n\ + scale_ptr += 8; \\\n\ + \\\n\ + CONV2F32(tmpData0, src0, 0); \\\n\ + CONV2F32(tmpData1, src0, 1); \\\n\ + CONV2F32(tmpData2, src0, 2); \\\n\ + CONV2F32(tmpData3, src0, 3); \\\n\ + \\\n\ + float4 norm; \\\n\ + tmpData0 = tmpData0 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData0 + bias_f0; \\\n\ + bias_f0 = vload4(2, bias_ptr); \\\n\ + scale_f0 = vload4(2, 
scale_ptr); \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + \\\n\ + tmpData1 = tmpData1 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData1 + bias_f1; \\\n\ + bias_f1 = vload4(3, bias_ptr); \\\n\ + scale_f1 = vload4(3, scale_ptr); \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + \\\n\ + tmpData2 = tmpData2 - sums.x; \\\n\ + norm = scale_f0 * sums.y * tmpData2 + bias_f0; \\\n\ + _viv_asm(CONV, tmpVal0, norm); \\\n\ + \\\n\ + tmpData3 = tmpData3 - sums.x; \\\n\ + norm = scale_f1 * sums.y * tmpData3 + bias_f1; \\\n\ + _viv_asm(CONV, tmpVal1, norm); \\\n\ + VXC_DP2x8(result, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x -= 8; \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_8_32TOF16_IMPL_2D(U8_F32toF16, vxc_uchar16)\n\ +LAYER_NORM_8_32TOF16_IMPL_2D(I8_F32toF16, vxc_char16)\n\ +"; /* end of layer_normalization_2_vx*/ -static const char layer_normalization_scale_f32_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -/**************************layernorm float16***********************************/\n\ -_viv_uniform int width;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;\n\ -\n\ -__kernel void layer_norm_F16F32toF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_out = coord;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - vxc_short8 src0;\n\ - vxc_float sum = 0, sqr = 0;\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.z, baseAddr);\n\ -\n\ - for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ - {\n\ - vxc_half8 val0_h;\n\ - _viv_asm(COPY, val0_h, src0, 16);\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr += sumsqr.y;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio;\n\ - vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 bias_f, scale_f, in_f;\n\ - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0));\n\ - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0));\n\ - for(coord.x = 0; coord.x < width; coord.x += 4)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, 
input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = vload4(0, bias_ptr + coord.x);\n\ - scale_f = vload4(0, scale_ptr + coord.x);\n\ - vxc_half8 in_h;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - vxc_float4 sub, norm;\n\ - sub = in_f - mean;\n\ - norm = scale_f * vari * sub + bias_f;\n\ - half4 norm_h;\n\ - _viv_asm(CONV, norm_h, norm);\n\ - vxc_half8 dst;\n\ - VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniExtractHalf4_dp4x4);\n\ - vxc_short8 dstval;\n\ - _viv_asm(COPY, dstval, dst, 16);\n\ - coord_out.x = coord.x;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out, dstval, \\\n\ - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -/*****************************layernorm uint8 to uint8****************************/\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform float output_zp;\n\ -_viv_uniform int sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform int tmpZp2;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ -_viv_uniform float dimRatio_scale;\n\ -\n\ -__kernel void layer_norm_U8F32toU8(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_out = coord;\n\ -\n\ - vxc_uchar16 src0, src2;\n\ - float sum = 0, sqr = 0;\n\ - vxc_float4 bias_f0, bias_f1, bias_f2, bias_f3, scale_f0, scale_f1, scale_f2, scale_f3;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - vxc_int4 tmpSum1;\n\ - vxc_int4 tmpSqr1;\n\ - short zp = inputZP;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.z, baseAddr);\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1.x);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ - }\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ -\n\ - float mean, vari;\n\ - mean = sum * dimRatio;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ -\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ - __global float* bias_ptr = 
(__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0));\n\ - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0));\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = vload4(0, bias_ptr);\n\ - bias_f1 = vload4(1, bias_ptr);\n\ - bias_f2 = vload4(2, bias_ptr);\n\ - bias_f3 = vload4(3, bias_ptr);\n\ - scale_f0 = vload4(0, scale_ptr);\n\ - scale_f1 = vload4(1, scale_ptr);\n\ - scale_f2 = vload4(2, scale_ptr);\n\ - scale_f3 = vload4(3, scale_ptr);\n\ - bias_ptr += 16;\n\ - scale_ptr += 16;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - tmpData0 *= input_scale;\n\ - tmpData1 *= input_scale;\n\ - tmpData2 *= input_scale;\n\ - tmpData3 *= input_scale;\n\ -\n\ - vxc_float4 norm;\n\ - tmpData0 -= mean;\n\ - norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - tmpData1 -= mean;\n\ - norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ -\n\ - tmpData2 -= mean;\n\ - norm = scale_f2 * vari * tmpData2 + bias_f2;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - tmpData3 -= mean;\n\ - norm = scale_f3 * vari * tmpData3 + bias_f3;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - coord_out.x = coord.x;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \\\n\ - VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel void layer_norm_I16F32toI16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_in = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - vxc_short8 src0, dst;\n\ - vxc_float sum = 0, sqr = 0;\n\ - for(; coord_in.x < width;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x += 8;\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr = sqr + sumsqr.y * e2InScale;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio_scale;\n\ - vxc_float vari;\n\ 
- vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - int2 coord_bias = (int2)(0, 0);\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord_bias);\n\ - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord_bias);\n\ - for(coord_in.x = 0; coord_in.x < width; coord_in.x += 8, coord.x += 8)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = vload4(0, bias_ptr);\n\ - bias_f1 = vload4(1, bias_ptr);\n\ - scale_f0 = vload4(0, scale_ptr);\n\ - scale_f1 = vload4(1, scale_ptr);\n\ - bias_ptr += 8;\n\ - scale_ptr += 8;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ -\n\ - vxc_float4 sub, norm;\n\ - sub = tmpData0 * input_scale - mean;\n\ - norm = scale_f0 * vari * sub + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - sub = tmpData1 * input_scale - mean;\n\ - norm = scale_f1 * vari * sub + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, dst, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of layer_normalization_scale_f32_vx*/ - -static const char layer_normalization_scale_f32_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -/**************************layernorm float16***********************************/\n\ -_viv_uniform int width;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;\n\ -\n\ -__kernel void layer_norm_F16F32toF16_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale,\n\ - image2d_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_short8 src0, src1;\n\ - vxc_float sum = 0, sqr = 0;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ -\n\ - for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ - {\n\ - vxc_half8 val0_h;\n\ - _viv_asm(COPY, val0_h, src0, 16);\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr += sumsqr.y;\n\ - }\n\ - vxc_float mean;\n\ - mean = sum * dimRatio;\n\ - vxc_float vari;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 bias_f, scale_f, in_f;\n\ - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw);\n\ - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw);\n\ - 
for(coord.x = 0; coord.x < width; coord.x += 4)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - bias_f = vload4(0, bias_ptr + coord.x);\n\ - scale_f = vload4(0, scale_ptr + coord.x);\n\ -\n\ - vxc_half8 in_h;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - vxc_float4 sub, norm;\n\ - sub = in_f - mean;\n\ - norm = scale_f * vari * sub + bias_f;\n\ - half4 norm_h;\n\ - _viv_asm(CONV, norm_h, norm);\n\ - vxc_half8 dst;\n\ - VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniExtractHalf4_dp4x4);\n\ - vxc_short8 dstval;\n\ - _viv_asm(COPY, dstval, dst, 16);\n\ - VXC_WriteImage(output, coord.xy, dstval, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -/*****************************layernorm uint8 to uint8****************************/\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform float output_zp;\n\ -_viv_uniform int sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform int tmpZp2;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ -_viv_uniform float dimRatio_scale;\n\ -\n\ -__kernel void layer_norm_U8F32toU8_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale,\n\ - image2d_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_uchar16 src0, src2;\n\ - float sum = 0, sqr = 0;\n\ - vxc_float4 bias_f0, bias_f1, bias_f2, bias_f3, scale_f0, scale_f1, scale_f2, scale_f3;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - vxc_int4 tmpSum1;\n\ - vxc_int4 tmpSqr1;\n\ - short zp = inputZP;\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1.x);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ - }\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ -\n\ - float mean, vari;\n\ - mean = sum * dimRatio;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ -\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw);\n\ - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw);\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = vload4(0, bias_ptr);\n\ - bias_f1 = vload4(1, bias_ptr);\n\ - bias_f2 = vload4(2, bias_ptr);\n\ - 
bias_f3 = vload4(3, bias_ptr);\n\ - scale_f0 = vload4(0, scale_ptr);\n\ - scale_f1 = vload4(1, scale_ptr);\n\ - scale_f2 = vload4(2, scale_ptr);\n\ - scale_f3 = vload4(3, scale_ptr);\n\ - bias_ptr += 16;\n\ - scale_ptr += 16;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - tmpData0 = tmpData0 * input_scale - mean;\n\ - tmpData1 = tmpData1 * input_scale - mean;\n\ - tmpData2 = tmpData2 * input_scale - mean;\n\ - tmpData3 = tmpData3 * input_scale - mean;\n\ -\n\ - vxc_float4 norm;\n\ - norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ -\n\ - norm = scale_f2 * vari * tmpData2 + bias_f2;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - norm = scale_f3 * vari * tmpData3 + bias_f3;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel void layer_norm_I16F32toI16_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale,\n\ - image2d_t output, float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_float sum = 0, sqr = 0;\n\ - for(; coord.x < width;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.x += 8;\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - sum += sumsqr.x;\n\ - sqr = sqr + sumsqr.y * e2InScale;\n\ - }\n\ - vxc_float mean, vari;\n\ - mean = sum * dimRatio_scale;\n\ - vari = sqr * dimRatio - mean * mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_half8 scale_h;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - Image img1 = create_image_from_image2d(bias, 4);\n\ - Image img2 = create_image_from_image2d(scale, 4);\n\ -\n\ - __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw);\n\ - __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw);\n\ - for(coord.x = 0; coord.x < width; coord.x += 8)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = vload4(0, bias_ptr);\n\ - bias_f1 = vload4(1, bias_ptr);\n\ - scale_f0 = vload4(0, scale_ptr);\n\ - scale_f1 = vload4(1, scale_ptr);\n\ - bias_ptr += 8;\n\ - scale_ptr += 8;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, 
src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ -\n\ - vxc_float4 sub, norm;\n\ - sub = tmpData0 * input_scale - mean;\n\ - norm = scale_f0 * vari * sub + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - sub = tmpData1 * input_scale - mean;\n\ - norm = scale_f1 * vari * sub + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of layer_normalization_scale_f32_2d_vx*/ - -static const char layer_normalization_scale_f32_bf16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char layer_normalization_3_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ /**************************layernorm float16***********************************/\n\ _viv_uniform int width;\n\ @@ -15803,1372 +14361,7 @@ __kernel void layer_norm_BF16F32toBF16_2D(\n\ VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ -}"; /* end of layer_normalization_scale_f32_bf16_vx*/ - -static const char layer_normalization_u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -/*****************************layernorm uint8 to fp16****************************/\n\ -_viv_uniform int width;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int inputZP;\n\ -_viv_uniform int sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform int tmpZp2;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform VXC_512Bits UniPackFP16even_2x8;\n\ -\n\ -__kernel void layer_norm_U8toF16(\n\ - image2d_array_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_array_t output,\n\ - float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_out = coord;\n\ - vxc_uchar16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - vxc_int4 tmpSum1;\n\ - vxc_int4 tmpSqr1;\n\ -\n\ - int8 input_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.z, baseAddr);\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1.x);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ - }\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ -\n\ - float mean, vari;\n\ - 
mean = sum * dimRatio;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - int2 coord_bias = (int2)(0, 0);\n\ - vxc_half8 scale_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_short8 src1, outval;\n\ - short zp = inputZP;\n\ - half4 tmpVal0, tmpVal1;\n\ - vxc_half8 dst;\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.x = coord.x;\n\ -\n\ - scale_f0 = read_imagef(scale, coord_bias);\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - scale_f1 = read_imagef(scale, coord_bias);\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - tmpData0 *= input_scale;\n\ - tmpData1 *= input_scale;\n\ - tmpData2 *= input_scale;\n\ - tmpData3 *= input_scale;\n\ -\n\ - vxc_float4 norm;\n\ - tmpData0 -= mean;\n\ - norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ -\n\ - scale_f0 = read_imagef(scale, coord_bias);\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ -\n\ - tmpData1 -= mean;\n\ - norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ -\n\ - scale_f1 = read_imagef(scale, coord_bias);\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ -\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - UniPackFP16even_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - coord_out.x = coord.x;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ -\n\ - tmpData2 -= mean;\n\ - norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ -\n\ - tmpData3 -= mean;\n\ - norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - UniPackFP16even_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - coord_out.x += 8;\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel void layer_norm_U8toF16_2D(\n\ - image2d_t input,\n\ - image2d_t bias,\n\ - image2d_t scale,\n\ - image2d_t output,\n\ - float eps)\n\ -{\n\ - int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ - vxc_uchar16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - vxc_int4 tmpSum1;\n\ - vxc_int4 tmpSqr1;\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1.x);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1.x + tmpZp1 * 
tmpSum1.x);\n\ - }\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ -\n\ - float mean, vari;\n\ - mean = sum * dimRatio;\n\ - vari = sqr*dimRatio - mean*mean;\n\ - vari += eps;\n\ - vari = rsqrt(vari);\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - int2 coord_bias = (int2)(0, 0);\n\ - vxc_half8 scale_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_short8 src1, outval;\n\ - short zp = inputZP;\n\ - half4 tmpVal0, tmpVal1;\n\ - vxc_half8 dst;\n\ -\n\ - int2 coord_out = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - for(coord.x = 0; coord.x < width; coord.x += 16)\n\ - {\n\ - coord_bias.x = coord.x;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - scale_f0 = read_imagef(scale, coord_bias);\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - scale_f1 = read_imagef(scale, coord_bias);\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - tmpData0 *= input_scale;\n\ - tmpData1 *= input_scale;\n\ - tmpData2 *= input_scale;\n\ - tmpData3 *= input_scale;\n\ -\n\ - vxc_float4 norm;\n\ - tmpData0 -= mean;\n\ - norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ -\n\ - scale_f0 = read_imagef(scale, coord_bias);\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ -\n\ - coord_bias.x += 4;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ -\n\ - tmpData1 -= mean;\n\ - norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ -\n\ - scale_f1 = read_imagef(scale, coord_bias);\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ -\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - UniPackFP16even_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - coord_out.x = coord.x;\n\ - VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - tmpData2 -= mean;\n\ - norm = scale_f0 * vari * tmpData2 + bias_f0;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ -\n\ - tmpData3 -= mean;\n\ - norm = scale_f1 * vari * tmpData3 + bias_f1;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - UniPackFP16even_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - coord_out.x += 8;\n\ - VXC_WriteImage(output, coord_out, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -"; /* end of layer_normalization_u8_f16_vx*/ - -static const char layer_normalization_wh_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ -_viv_uniform int width;\n\ -\n\ -_viv_uniform int height;\n\ -\n\ -_viv_uniform int height_depth;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform 
float output_zp;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32(\n\ - image2d_array_t input, image2d_t output)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - vxc_float4 sumsqr;\n\ - vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr_a);\n\ -\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - tmpSumSqr += sumsqr;\n\ - }\n\ - }\n\ -\n\ - lcl_sum[lidx] = tmpSumSqr.x;\n\ - lcl_sqr[lidx] = tmpSumSqr.y;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - float sum = 0;\n\ - float sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_F16toF32_2D(\n\ - image2d_array_t input, image2d_t output)\n\ -{\n\ - int gidx = get_global_id(0) << 3;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ -\n\ - int2 coord = (int2)(gidx, gidy);\n\ - vxc_short8 src0;\n\ - vxc_half8 in_h;\n\ - vxc_float4 sumsqr;\n\ - vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int endH = gidy + height;\n\ - if(gidx < width)\n\ - {\n\ - for(; coord.y < endH;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniFp16SumSqr_dp8x2);\n\ - tmpSumSqr += sumsqr;\n\ - }\n\ - }\n\ -\n\ - lcl_sum[lidx] = tmpSumSqr.x;\n\ - lcl_sqr[lidx] = tmpSumSqr.y;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - float sum = 0;\n\ - float sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 
coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int2 coord_sum = (int2)(0, gidz);\n\ - int4 coord_para = coord;\n\ - coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h, in_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_sum);\n\ - coord_sum.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - int4 coord_bias = coord_para;\n\ -\n\ - int8 input_desc, scale_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ - _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - vxc_half8 dst;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - coord_para.y = coord.y;\n\ - coord_bias.y = coord.y;\n\ - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - vxc_float4 sub, norm;\n\ - sub = tmpData0 - mean_vari.s0;\n\ - norm = scale_f0 * mean_vari.s1 * sub + bias_f0;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - sub = tmpData1 - mean_vari.s0;\n\ - norm = scale_f1 * mean_vari.s1 * sub + bias_f1;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toF16_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_t output, float eps)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), 0);\n\ - int2 coord_bias = (int2)(0, 0);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h, in_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = 
(vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_bias);\n\ - coord_bias.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - coord_bias = coord;\n\ -\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - vxc_half8 dst;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.y = coord.y;\n\ - VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - vxc_float4 sub, norm;\n\ - sub = tmpData0 - mean_vari.s0;\n\ - norm = scale_f0 * mean_vari.s1 * sub + bias_f0;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - sub = tmpData1 - mean_vari.s0;\n\ - norm = scale_f1 * mean_vari.s1 * sub + bias_f1;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int2 coord_sum = (int2)(0, gidz);\n\ - int4 coord_para = coord;\n\ - coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h, in_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_sum);\n\ - coord_sum.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - int4 coord_bias = coord_para;\n\ -\n\ - int8 input_desc, scale_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ - _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)coord.z * output_desc.s4 + 
output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_uchar16 outval;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - coord_para.y = coord.y;\n\ - coord_bias.y = coord.y;\n\ - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - vxc_float4 sub, norm;\n\ - sub = tmpData0 - mean_vari.s0;\n\ - norm = scale_f0 * mean_vari.s1 * sub + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - sub = tmpData1 - mean_vari.s0;\n\ - norm = scale_f1 * mean_vari.s1 * sub + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16toU8_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_t output, float eps)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), 0);\n\ - int2 coord_bias = (int2)(0, 0);\n\ - vxc_short8 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h, in_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_bias);\n\ - coord_bias.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - coord_bias = coord;\n\ -\n\ - vxc_float4 tmpData0, tmpData1;\n\ - vxc_uchar16 outval;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.y = coord.y;\n\ - VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, in_h, src0, 16);\n\ - VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ -\n\ - vxc_float4 sub, norm;\n\ - sub = tmpData0 - mean_vari.s0;\n\ - norm = scale_f0 * mean_vari.s1 * sub + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - sub = tmpData1 - mean_vari.s0;\n\ - norm = scale_f1 * mean_vari.s1 * sub + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of layer_normalization_wh_f16_vx*/ - -static const char layer_normalization_wh_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform int width;\n\ -\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int height;\n\ -\n\ -_viv_uniform int height_depth;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform float output_zp;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform int inputZP;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32(\n\ - image2d_array_t input, image2d_t output)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ - vxc_short8 src0;\n\ - float4 tmpSumSqr = (float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr_a);\n\ -\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - tmpSumSqr += sumsqr;\n\ - }\n\ - tmpSumSqr.x *= input_scale;\n\ - tmpSumSqr.y *= e2InScale;\n\ - }\n\ - lcl_sum[lidx] = tmpSumSqr.x;\n\ - lcl_sqr[lidx] = tmpSumSqr.y;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ - float4 data = (float4)(0);\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - data.x += dot(tmp_sum[i], one);\n\ - data.y += dot(tmp_sqr[i], one);\n\ - }\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_I16toF32_2D(\n\ - image2d_t input, image2d_t output)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int gidy 
= gidz * height;\n\ -\n\ - int2 coord = (int2)(gidx, gidy);\n\ - vxc_short8 src0;\n\ - float4 tmpSumSqr = (float4)(0);\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int endH = gidy + height;\n\ - if(gidx < width)\n\ - {\n\ - for(; coord.y < endH;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - vxc_float4 sumsqr;\n\ - VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ - uniInt16SumSqr_dp8x2);\n\ - tmpSumSqr += sumsqr;\n\ - }\n\ - tmpSumSqr.x *= input_scale;\n\ - tmpSumSqr.y *= e2InScale;\n\ - }\n\ - lcl_sum[lidx] = tmpSumSqr.x;\n\ - lcl_sqr[lidx] = tmpSumSqr.y;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ - float4 data = (float4)(0);\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - data.x += dot(tmp_sum[i], one);\n\ - data.y += dot(tmp_sqr[i], one);\n\ - }\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int2 coord_sum = (int2)(0, gidz);\n\ - int4 coord_para = coord;\n\ - coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ - vxc_short8 src0, src1, outval;\n\ - vxc_half8 scale_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_sum);\n\ - coord_sum.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - int4 coord_bias = coord_para;\n\ -\n\ - int8 input_desc, scale_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ - _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, norm;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - coord_para.y = coord.y;\n\ - coord_bias.y = coord.y;\n\ - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ - tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ -\n\ - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16toI16_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_t output, float eps)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), 0);\n\ - int2 coord_bias = (int2)(0, 0);\n\ - vxc_short8 src0, src1, outval;\n\ - vxc_half8 scale_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_bias);\n\ - coord_bias.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - coord_bias = coord;\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, norm;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.y = coord.y;\n\ - VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ - tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ -\n\ - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of layer_normalization_wh_i16_vx*/ - -static const 
char layer_normalization_wh_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ -_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ -_viv_uniform int sumInZp;\n\ -_viv_uniform int tmpZp1;\n\ -_viv_uniform float e2InScale;\n\ -_viv_uniform float rowSumScale;\n\ -_viv_uniform int width;\n\ -\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform int height;\n\ -\n\ -_viv_uniform int height_depth;\n\ -_viv_uniform float dimRatio;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform float outputScale;\n\ -_viv_uniform float output_zp;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ -_viv_uniform int inputZP;\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32(\n\ - image2d_array_t input, image2d_t output)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ - vxc_uchar16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord.w, baseAddr_a);\n\ -\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord.xywz, 0,\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);\n\ - }\n\ - sqr += (tmpSqr * e2InScale + rowSumScale);\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSqr_U8toF32_2D(\n\ - image2d_t input, image2d_t output)\n\ -{\n\ - int gidx = get_global_id(0) << 4;\n\ - int lidx = get_local_id(0);\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ -\n\ - int2 coord = (int2)(gidx, gidy);\n\ - vxc_uchar16 src0;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int endH = gidy + height;\n\ - if(gidx < width)\n\ - {\n\ - for(; coord.y < endH;)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, 0,\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 
0), uniSumU8_16x1);\n\ - tmpSum += (tmpSum1);\n\ - VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ - tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);\n\ - }\n\ - sqr += (tmpSqr * e2InScale + rowSumScale);\n\ - sum = (tmpSum + sumInZp) * input_scale;\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ - float4 data = (float4)(sum, sqr, 0, 0);\n\ - write_imagef(output, coord_out, data);\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF16(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int2 coord_sum = (int2)(0, gidz);\n\ - int4 coord_para = coord;\n\ - coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ - vxc_uchar16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_sum);\n\ - coord_sum.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - int4 coord_bias = coord_para;\n\ -\n\ - int8 input_desc, scale_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ - _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - coord_para.y = coord.y; coord_bias.y = coord.y;\n\ - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0,\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - 
uniConvert2ndUint8SubZpToFp32_4x4);\n\ - tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ - tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ -\n\ - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF16_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_t output, float eps)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), 0);\n\ - int2 coord_bias = (int2)(0, 0);\n\ - vxc_uchar16 src0;\n\ - vxc_short8 src1, outval;\n\ - vxc_half8 scale_h, dst;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_bias);\n\ - coord_bias.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - coord_bias = coord;\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, norm;\n\ - half4 tmpVal0, tmpVal1;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, 0,\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.y = coord.y;\n\ - VXC_ReadImage(src1, scale, coord, 0,\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ - tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ -\n\ - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8(\n\ - image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ - int2 coord_sum = (int2)(0, gidz);\n\ - int4 coord_para = coord;\n\ - coord_para.z = (ushort)gidz / (ushort)(height_depth);\n\ - 
vxc_uchar16 src0 , outval;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_sum);\n\ - coord_sum.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - int4 coord_bias = coord_para;\n\ -\n\ - int8 input_desc, scale_desc, output_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ -\n\ - _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ - int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ - _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ -\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord.z, baseAddr);\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, norm;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.y ++;\n\ - coord_para.y = coord.y;\n\ - coord_bias.y = coord.y;\n\ - VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0,\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ - tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ -\n\ - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU8_2D(\n\ - image2d_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_t output, float eps)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), 0);\n\ - int2 coord_bias = (int2)(0, 0);\n\ - vxc_uchar16 src0, outval;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ - vxc_float4 mean_vari = (vxc_float4)(0);\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_bias);\n\ - coord_bias.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = 
rsqrt(mean_vari.s1);\n\ -\n\ - coord_bias = coord;\n\ -\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, norm;\n\ - vxc_int4 tmpVal0, tmpVal1;\n\ -\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ - {\n\ - VXC_ReadImage(src0, input, coord, 0,\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_bias.y = coord.y;\n\ - VXC_ReadImage(src1, scale, coord, 0,\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - bias_f0 = read_imagef(bias, coord_bias);\n\ - coord_bias.x += 4;\n\ - bias_f1 = read_imagef(bias, coord_bias);\n\ - coord_bias.x = coord.x;\n\ -\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - tmpData0 = tmpData0 * input_scale - mean_vari.s0;\n\ - tmpData1 = tmpData1 * input_scale - mean_vari.s0;\n\ -\n\ - norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ - tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ - norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ - tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ -\n\ - VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}"; /* end of layer_normalization_wh_u8_vx*/ +}"; /* end of layer_normalization_3_vx*/ static const char log_softmax_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ _viv_uniform float rlogE;\n\ @@ -22546,6 +19739,7 @@ __kernel void gemm_BF16BF16toBF16(image2d_array_t inputA,\n\ sum3 += (tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3);\n\ }\n\ coord_b.y = gidy;\n\ + coord_b.z = get_global_id(2);\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_b.w, baseAddr);\n\ @@ -22623,6 +19817,7 @@ __kernel void gemm_transa_BF16BF16toBF16(\n\ sum3 = (sum3 + tempA0.w * tempB0);\n\ }\n\ coord_b.y = gidy;\n\ + coord_b.z = get_global_id(2);\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_b.w, baseAddr);\n\ @@ -22657,7 +19852,7 @@ __kernel void gemm_transb_BF16BF16toBF16(image2d_array_t inputA,\n\ int adjointB,\n\ uint M, uint K, uint N)\n\ {\n\ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);\n\ int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 
0 : get_global_id(2)), 0);\n\ \n\ @@ -22826,6 +20021,7 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA,\n\ sum3 += (tempA3);\n\ }\n\ coord_b.y = gidy;\n\ + coord_b.z = get_global_id(2);\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_b.w, baseAddr);\n\ @@ -22914,6 +20110,7 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA,\n\ sum3 += (tempA3 + tempB3);\n\ }\n\ coord_b.y = gidy;\n\ + coord_b.z = get_global_id(2);\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_b.w, baseAddr);\n\ @@ -22988,7 +20185,7 @@ __kernel void gemm_F32F32toF32(\n\ sum2 += (tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3);\n\ sum3 += (tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3);\n\ }\n\ - coord_b = (int4)(gidx, gidy, get_global_id(2), 0);\n\ + coord_b = (int4)(gidx, gidy, get_global_id(2), get_global_id(2));\n\ write_imagef(output, coord_b, sum0);\n\ coord_b.y++;\n\ write_imagef(output, coord_b, sum1);\n\ @@ -23083,6 +20280,7 @@ __kernel void gemm_F16F16to##dst_type_name( \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ write_type outC; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -23172,6 +20370,7 @@ __kernel void gemm_F16F16to##dst_type_name( \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ write_type outC; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -23282,6 +20481,7 @@ __kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \\\n\ } \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -23370,6 +20570,7 @@ __kernel void gemm_F16##src1_type_name##toI16(image2d_array_t inputA, \\\n\ } \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -23475,6 +20676,7 @@ __kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \\\n\ sum2 *= input1Scale; \\\n\ sum3 *= input1Scale; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -23571,6 +20773,7 @@ __kernel void gemm_F16##src1_type_name##toF16(image2d_array_t inputA, \\\n\ sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \\\n\ } \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, 
coord_b.w, baseAddr); \\\n\ @@ -23669,6 +20872,7 @@ __kernel void gemm_F16##src1_type_name##to##src1_type_name(image2d_array_t input } \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -23777,6 +20981,7 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \\\n\ } \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -23868,6 +21073,7 @@ __kernel void gemm_transa_##src0_type_name##src1_type_name##to##dst_type_name( \ } \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -23953,6 +21159,7 @@ __kernel void gemm_transa_##src0_type_name##F16to##src0_type_name( \\\n\ } \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -24035,6 +21242,7 @@ __kernel void gemm_transa_F16F16toF16(\n\ sum3 = (sum3 + tempA0.w * tempB0);\n\ }\n\ coord_b.y = gidy;\n\ + coord_b.z = get_global_id(2);\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_b.w, baseAddr);\n\ @@ -24060,7 +21268,8 @@ __kernel void gemm_transa_F16F16toF16(\n\ _viv_asm(COPY, outC, valC, 16);\n\ VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0246, \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ -}"; /* end of matrixmul_transA_vx*/ +}\n\ +"; /* end of matrixmul_transA_vx*/ static const char matrixmul_transB_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -24079,7 +21288,7 @@ __kernel void gemm_transb_F16F16toF16(image2d_array_t inputA,\n\ int adjointB,\n\ uint M, uint K, uint N)\n\ {\n\ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);\n\ int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0);\n\ \n\ @@ -24336,7 +21545,7 @@ __kernel void gemm_transb_F16U8toU8(image2d_array_t inputA,\n\ int adjointB,\n\ uint M, uint K, uint N)\n\ {\n\ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);\n\ int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 
0 : get_global_id(2)), 0);\n\ \n\ @@ -24470,7 +21679,7 @@ __kernel void gemm_transb_U8U8toF16(image2d_array_t inputA,\n\ int adjointB,\n\ uint M, uint K, uint N)\n\ {\n\ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);\n\ int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0);\n\ \n\ @@ -24605,7 +21814,7 @@ __kernel void gemm_transb_U8U8toU8(image2d_array_t inputA,\n\ int adjointB,\n\ uint M, uint K, uint N)\n\ {\n\ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);\n\ int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0);\n\ \n\ @@ -24799,6 +22008,7 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \\\n\ } \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ coord_b.y = get_global_id(1); \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -24911,6 +22121,7 @@ __kernel void gemm_##src0_type_name##F16toF16( \\\n\ sum2 *= input0Scale; \\\n\ sum3 *= input0Scale; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)gidz * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -25016,6 +22227,7 @@ __kernel void gemm_##src0_type_name##F16toF16( \\\n\ sum2 *= input0Scale; \\\n\ sum3 *= input0Scale; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)gidz * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -25127,6 +22339,7 @@ __kernel void gemm_##src0_type_name##F16to##src0_type_name( \\\n\ } \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -25225,6 +22438,7 @@ __kernel void gemm_##src0_type_name##F16to##src0_type_name( \\\n\ } \\\n\ vxc_int4 tmpOut0, tmpOut1; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -25267,6 +22481,7 @@ _viv_uniform int bc2zero;\n\ _viv_uniform VXC_512Bits uniGemmU8U8toFp32Block4_4x4;\n\ _viv_uniform VXC_512Bits uniGemmU8U8MulZptoFp32_8x4;\n\ _viv_uniform float input01Scale;\n\ +_viv_uniform float mulKIn0In1Zp;\n\ \n\ #define GEMM_QINT_TO_F16(src0_type_name, read_type) \\\n\ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \\\n\ @@ -25280,10 +22495,8 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \\\n\ \\\n\ int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0); \\\n\ int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 
0 : get_global_id(2)), 0); \\\n\ - vxc_float4 sum0 = (vxc_float4)(0); \\\n\ - vxc_float4 sum1 = (vxc_float4)(0); \\\n\ - vxc_float4 sum2 = (vxc_float4)(0); \\\n\ - vxc_float4 sum3 = (vxc_float4)(0); \\\n\ + vxc_float4 sum0 = (vxc_float4)(mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp, mulKIn0In1Zp); \\\n\ + vxc_float4 sum1 = sum0, sum2 = sum0, sum3 = sum0; \\\n\ \\\n\ int8 inputA_desc, inputB_desc, output_desc; \\\n\ _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); \\\n\ @@ -25341,6 +22554,7 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \\\n\ sum2 *= input01Scale; \\\n\ sum3 *= input01Scale; \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -25442,6 +22656,7 @@ __kernel void gemm_##src0_type_name##src0_type_name##toF16( \\\n\ sum3 = (sum3 + tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3); \\\n\ } \\\n\ coord_b.y = gidy; \\\n\ + coord_b.z = get_global_id(2); \\\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; \\\n\ _viv_asm(MOV, coord_b.w, baseAddr); \\\n\ @@ -26283,6 +23498,192 @@ MINIMUM_QUANTTOF16_2D_IMPL(U8U8toF16, vxc_uchar16)\n\ MINIMUM_QUANTTOF16_2D_IMPL(I8I8toF16, vxc_char16)\n\ MINIMUM_QUANTTOF16_2D_IMPL(I16I16toF16, vxc_short8)"; /* end of minimum_1_vx*/ +static const char mod_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertFstToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecToFp32_4x4;\n\ +\n\ +_viv_uniform float in_scale0;\n\ +_viv_uniform float in_scale1;\n\ +_viv_uniform float out_scale;\n\ +_viv_uniform float in0Tail;\n\ +_viv_uniform float in1Tail;\n\ +_viv_uniform float out_zp;\n\ +\n\ +#define MOD_PROCESS(dst_type, save_type, read_type, copy_type, conv_mode, IN0_SCALE, IN0_TAIL,\\\n\ + IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET, read_fun, write_fun) \\\n\ + save_type data; \\\n\ + read_type read_data0, read_data1; \\\n\ + copy_type tmpData0, tmpData1; \\\n\ + vxc_float4 in0Val1, in0Val2, in1Val1, in1Val2; \\\n\ + vxc_float4 tmpVal1, tmpVal2; \\\n\ + dst_type tmpOut1, tmpOut2; \\\n\ + read_fun(read_data0, input0, coord, VXC_5BITOFFSET_XY(0,0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmpData0, read_data0, 16); \\\n\ + read_fun(read_data1, input1, coord, VXC_5BITOFFSET_XY(0,0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmpData1, read_data1, 16); \\\n\ + VXC_DP4x4(in0Val1, tmpData0, tmpData0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \\\n\ + VXC_DP4x4(in0Val2, tmpData0, tmpData0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \\\n\ + VXC_DP4x4(in1Val1, tmpData1, tmpData1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \\\n\ + VXC_DP4x4(in1Val2, tmpData1, tmpData1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \\\n\ + in0Val1 = in0Val1 * IN0_SCALE + IN0_TAIL; \\\n\ + in0Val2 = in0Val2 * IN0_SCALE + IN0_TAIL; \\\n\ + in1Val1 = in1Val1 * IN1_SCALE + IN1_TAIL; \\\n\ + in1Val2 = in1Val2 * IN1_SCALE + IN1_TAIL; \\\n\ + if (isfmod) \\\n\ + { \\\n\ + tmpVal1 = fmod(in0Val1, in1Val1) * OUT_SCALE + OUT_OFFSET; \\\n\ + tmpVal2 = fmod(in0Val2, in1Val2) * OUT_SCALE + OUT_OFFSET; \\\n\ + 
} \\\n\ + else \\\n\ + { \\\n\ + tmpVal1 = (in0Val1 - in1Val1 * floor(in0Val1 / in1Val1)) * OUT_SCALE + OUT_OFFSET; \\\n\ + tmpVal2 = (in0Val2 - in1Val2 * floor(in0Val2 / in1Val2)) * OUT_SCALE + OUT_OFFSET; \\\n\ + } \\\n\ + _viv_asm(conv_mode, tmpOut1, tmpVal1); \\\n\ + _viv_asm(conv_mode, tmpOut2, tmpVal2); \\\n\ + VXC_DP2x8(data, tmpOut1, tmpOut2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \\\n\ + write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +\n\ +#define TENSOR_MOD(src0_name, src1_name, dst_name, dst_type, save_type, read_type, copy_type, \\\n\ + conv_mode, IN0_SCALE, IN0_TAIL, IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET) \\\n\ +__kernel void mod_##src0_name##src1_name##to##dst_name \\\n\ + ( \\\n\ + image2d_array_t input0, \\\n\ + image2d_array_t input1, \\\n\ + image2d_array_t output, \\\n\ + int isfmod \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + MOD_PROCESS(dst_type, save_type, read_type, copy_type, conv_mode, IN0_SCALE, IN0_TAIL,\\\n\ + IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \\\n\ +}\n\ +\n\ +\n\ +TENSOR_MOD(F16, F16, F16, half4, vxc_short8, vxc_short8,\\\n\ + vxc_half8, CONV, 1, 0, 1, 0, 1, 0)\n\ +TENSOR_MOD(F16, F16, I16, short4, vxc_short8, vxc_short8,\\\n\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)\n\ +TENSOR_MOD(F16, F16, I8, char4, vxc_char8, vxc_short8,\\\n\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)\n\ +TENSOR_MOD(F16, F16, U8, uchar4, vxc_uchar8, vxc_short8,\\\n\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)\n\ +\n\ +TENSOR_MOD(I16, I16, I16, short4, vxc_short8, vxc_short8,\\\n\ + vxc_short8, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\ +TENSOR_MOD(I16, I16, F16, half4, vxc_short8, vxc_short8,\\\n\ + vxc_short8, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)\n\ +\n\ +TENSOR_MOD(I8, I8, I8, char4, vxc_char8, vxc_char16,\\\n\ + vxc_char16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\ +TENSOR_MOD(I8, I8, F16, half4, vxc_short8, vxc_char16,\\\n\ + vxc_char16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)\n\ +\n\ +TENSOR_MOD(U8, U8, U8, uchar4, vxc_uchar8, vxc_uchar16,\\\n\ + vxc_uchar16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\ +TENSOR_MOD(U8, U8, F16, half4, vxc_short8, vxc_uchar16,\\\n\ + vxc_uchar16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)\n\ +\n\ +\n\ +#define TENSOR_MOD_2D(src0_name, src1_name, dst_name, dst_type, save_type, read_type, copy_type, \\\n\ + conv_mode, IN0_SCALE, IN0_TAIL, IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET) \\\n\ +__kernel void mod_##src0_name##src1_name##to##dst_name##_2D \\\n\ + ( \\\n\ + image2d_array_t input0, \\\n\ + image2d_array_t input1, \\\n\ + image2d_array_t output, \\\n\ + int isfmod \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + MOD_PROCESS(dst_type, save_type, read_type, copy_type, conv_mode, IN0_SCALE, IN0_TAIL,\\\n\ + IN1_SCALE, IN1_TAIL, OUT_SCALE, OUT_OFFSET, VXC_ReadImage, VXC_WriteImage); \\\n\ +}\n\ +\n\ +\n\ +TENSOR_MOD_2D(F16, F16, F16, half4, vxc_short8, vxc_short8,\\\n\ + vxc_half8, CONV, 1, 0, 1, 0, 1, 0)\n\ +TENSOR_MOD_2D(F16, F16, I16, short4, vxc_short8, vxc_short8,\\\n\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)\n\ +TENSOR_MOD_2D(F16, F16, I8, char4, vxc_char8, vxc_short8,\\\n\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, 
out_scale, out_zp)\n\ +TENSOR_MOD_2D(F16, F16, U8, uchar4, vxc_uchar8, vxc_short8,\\\n\ + vxc_half8, CONV_SAT_RTE, 1, 0, 1, 0, out_scale, out_zp)\n\ +\n\ +TENSOR_MOD_2D(I16, I16, I16, short4, vxc_short8, vxc_short8,\\\n\ + vxc_short8, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\ +TENSOR_MOD_2D(I16, I16, F16, half4, vxc_short8, vxc_short8,\\\n\ + vxc_short8, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)\n\ +\n\ +TENSOR_MOD_2D(I8, I8, I8, char4, vxc_char8, vxc_char16,\\\n\ + vxc_char16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\ +TENSOR_MOD_2D(I8, I8, F16, half4, vxc_short8, vxc_char16,\\\n\ + vxc_char16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)\n\ +\n\ +TENSOR_MOD_2D(U8, U8, U8, uchar4, vxc_uchar8, vxc_uchar16,\\\n\ + vxc_uchar16, CONV_SAT_RTE, in_scale0, in0Tail, in_scale1, in1Tail, out_scale, out_zp)\n\ +TENSOR_MOD_2D(U8, U8, F16, half4, vxc_short8, vxc_uchar16,\\\n\ + vxc_uchar16, CONV, in_scale0, in0Tail, in_scale1, in1Tail, 1, 0)\n\ +\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +#define MOD_BF16_PROCESS(read_fun, write_fun) \\\n\ + vxc_short8 read_data0, read_data1, vec0; \\\n\ + vxc_float4 in0Val1, in0Val2, in1Val1, in1Val2; \\\n\ + vxc_float4 tmpVal1, tmpVal2; \\\n\ + vxc_ushort8 dst0, dst1; \\\n\ + vxc_ushort8 vect; \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + read_fun(read_data0, input0, coord, VXC_5BITOFFSET_XY(0,0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(vec0, read_data0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, in0Val1, vec0, 16); \\\n\ + VXC_DP2x8(vec0, read_data0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, in0Val2, vec0, 16); \\\n\ + read_fun(read_data1, input1, coord, VXC_5BITOFFSET_XY(0,0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(vec0, read_data1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, in1Val1, vec0, 16); \\\n\ + VXC_DP2x8(vec0, read_data1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, in1Val2, vec0, 16); \\\n\ + tmpVal1 = fmod(in0Val1, in1Val1); \\\n\ + tmpVal2 = fmod(in0Val2, in1Val2); \\\n\ + _viv_asm(COPY, dst0, tmpVal1, 16); \\\n\ + _viv_asm(COPY, dst1, tmpVal2, 16); \\\n\ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + write_fun(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void mod_BF16BF16toBF16\n\ + (\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output,\n\ + int isfmod\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + MOD_BF16_PROCESS(VXC_ReadImage2DArray, VXC_WriteImage2DArray);\n\ +}\n\ +\n\ +__kernel void mod_BF16BF16toBF16_2D\n\ + (\n\ + image2d_array_t input0,\n\ + image2d_array_t input1,\n\ + image2d_array_t output,\n\ + int isfmod\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + MOD_BF16_PROCESS(VXC_ReadImage, VXC_WriteImage);\n\ +}"; /* end of mod_vx*/ + static const char moments_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ @@ -29451,588 +26852,172 @@ __kernel void 
poolwithargmax_U8to_F16_I16_2D\n\ \n\ "; /* end of poolwithargmax_U8_vx*/ -static const char pow_fp16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char pow_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4;\n\ _viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform float input0_scale;\n\ +_viv_uniform float input1_scale;\n\ +_viv_uniform float input0_tail;\n\ +_viv_uniform float input1_tail;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ \n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertSecUint8SubZpToFp32_4x4_2;\n\ -\n\ -_viv_uniform int input_ZP1;\n\ -\n\ -_viv_uniform float output_ZP;\n\ -_viv_uniform float outputScale;\n\ -\n\ -__kernel void pow_F16F16toF16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, src1;\n\ - vxc_short8 dst;\n\ - vxc_half8 data0, data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +#define POW_SH_IMPL(name, src0_type, copy0_type, src1_type, copy1_type, dst_type, copy2_type, conv_type) \\\n\ +__kernel void pow_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); \\\n\ + \\\n\ + src0_type src0; \\\n\ + copy0_type data0; \\\n\ + src0_type src1; \\\n\ + copy0_type data1; \\\n\ + VXC_ReadImage2DArray(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data0, src0, 16); \\\n\ + VXC_ReadImage2DArray(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data1, src1, 16); \\\n\ + float4 x0, x1; \\\n\ + float4 y0, y1; \\\n\ + VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); \\\n\ + VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); \\\n\ + VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); \\\n\ + VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); \\\n\ + x0 = x0 * input0_scale + input0_tail; \\\n\ + x1 = x1 * input0_scale + input0_tail; \\\n\ + y0 = y0 * input1_scale + input1_tail; \\\n\ + y1 = y1 * input1_scale + input1_tail; \\\n\ + float4 s0 = sign(x0); \\\n\ + float4 s1 = sign(x1); \\\n\ + int4 t0 = convert_int4(y0) & 1; \\\n\ + int4 t1 = convert_int4(y1) & 1; \\\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; \\\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1; \\\n\ + x0 = s0 * exp2(y0 * log2(fabs(x0))); \\\n\ + x1 = s1 * exp2(y1 * log2(fabs(x1))); \\\n\ + x0 = x0 * output_scale + output_zp; \\\n\ + x1 = x1 * output_scale + output_zp; \\\n\ + \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + _viv_asm(CONV_RTE, tmpVal0, x0); \\\n\ + _viv_asm(CONV_RTE, tmpVal1, x1); \\\n\ + dst_type dst0; \\\n\ + \\\n\ + copy2_type dst; \\\n\ + VXC_DP2x8(dst0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ +POW_SH_IMPL(F16_F16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(F16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +POW_SH_IMPL(F16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, int4)\n\ +POW_SH_IMPL(F16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, int4)\n\ +POW_SH_IMPL(F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(F16_I16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +POW_SH_IMPL(I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +POW_SH_IMPL(I16_I16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +POW_SH_IMPL(F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(F16_I8toI8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_char16, vxc_char16, int4)\n\ +POW_SH_IMPL(I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(I8_F16toI8, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, int4)\n\ +POW_SH_IMPL(I8_I8toF16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(I8_I8toI8, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, int4)\n\ +POW_SH_IMPL(F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(F16_U8toU8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4)\n\ +POW_SH_IMPL(U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(U8_F16toU8, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, int4)\n\ +POW_SH_IMPL(U8_U8toF16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL(U8_U8toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4)\n\ \n\ -__kernel void pow_F16F16toF16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0, src1;\n\ - vxc_short8 dst;\n\ - vxc_half8 data0, data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - 
_viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16F16toU8(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, src1;\n\ - vxc_uchar8 dst;\n\ - vxc_half8 data0, data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16F16toU8_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0, src1;\n\ - vxc_uchar8 dst;\n\ - vxc_half8 data0, data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16U8toF16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0;\n\ - vxc_uchar8 src1;\n\ - vxc_short8 dst;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in1_zp;\n\ - _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(data0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, dst, data0, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16U8toF16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0;\n\ - vxc_uchar8 src1;\n\ - vxc_short8 dst;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in1_zp;\n\ - _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(data0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, dst, data0, 16);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16U8toU8(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0;\n\ - vxc_uchar8 src1, dst;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in1_zp;\n\ - _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16U8toU8_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0;\n\ - vxc_uchar8 src1, dst;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in1_zp;\n\ - _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pow_fp16_vx*/ - -static const char pow_fp16_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2;\n\ -\n\ -_viv_uniform float outScale_fl;\n\ -\n\ -__kernel void pow_F16F16toI16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_half8 data0, data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16F16toI16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_half8 data0, data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16I16toF16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16I16toF16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16I16toI16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\
- tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\
- tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\
-\n\
- int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\
- int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\
- VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\
- uniConvertInt32toUint8_2x8);\n\
- VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
-}\n\
-\n\
-__kernel void pow_F16I16toI16_2D(\n\
- image2d_array_t input0,\n\
- image2d_array_t input1,\n\
- image2d_array_t output)\n\
-{\n\
- int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\
-\n\
- vxc_short8 src0, src1, dst;\n\
- vxc_half8 data0;\n\
- VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- _viv_asm(COPY, data0, src0, 16);\n\
- VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\
- VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
- float4 x0, x1;\n\
- float4 y0, y1;\n\
- float4 tmpDst0, tmpDst1;\n\
- VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\
- VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\
- VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\
- VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\
- float4 s0 = sign(x0);\n\
- float4 s1 = sign(x1);\n\
- int4 t0 = convert_int4(y0) & 1;\n\
- int4 t1 = convert_int4(y1) & 1;\n\
- s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\
- s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\
- tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\
- tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\
-\n\
- int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\
- int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\
- VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\
- uniConvertInt32toUint8_2x8);\n\
- VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\
+#define POW_SH_IMPL_2D(name, src0_type, copy0_type, src1_type, copy1_type, dst_type, copy2_type, conv_type) \\\n\
+__kernel void pow_##name##_2D \\\n\
+ ( \\\n\
+ __read_only image2d_array_t input0, \\\n\
+ __read_only image2d_array_t input1, \\\n\
+ __write_only image2d_array_t output \\\n\
+ ) \\\n\
+{ \\\n\
+ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\
+ \\\n\
+ src0_type src0; \\\n\
+ copy0_type data0; \\\n\
+ src1_type src1; \\\n\
+ copy1_type data1; \\\n\
+ VXC_ReadImage(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, data0, src0, 16); \\\n\
+ VXC_ReadImage(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\
+ _viv_asm(COPY, data1, src1, 16); \\\n\
+ float4 x0, x1; \\\n\
+ float4 y0, y1; \\\n\
+ VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); \\\n\
+ VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); \\\n\
+ VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4); \\\n\
+ VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4); \\\n\
+ x0 = x0 * input0_scale + input0_tail; \\\n\
+ x1 = x1 * input0_scale + input0_tail; \\\n\
+ y0 = y0 * input1_scale + 
input1_tail; \\\n\ + y1 = y1 * input1_scale + input1_tail; \\\n\ + float4 s0 = sign(x0); \\\n\ + float4 s1 = sign(x1); \\\n\ + int4 t0 = convert_int4(y0) & 1; \\\n\ + int4 t1 = convert_int4(y1) & 1; \\\n\ + s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0; \\\n\ + s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1; \\\n\ + x0 = s0 * exp2(y0 * log2(fabs(x0))); \\\n\ + x1 = s1 * exp2(y1 * log2(fabs(x1))); \\\n\ + x0 = x0 * output_scale + output_zp; \\\n\ + x1 = x1 * output_scale + output_zp; \\\n\ + \\\n\ + conv_type tmpVal0, tmpVal1; \\\n\ + _viv_asm(CONV_RTE, tmpVal0, x0); \\\n\ + _viv_asm(CONV_RTE, tmpVal1, x1); \\\n\ + dst_type dst0; \\\n\ + \\\n\ + copy2_type dst; \\\n\ + VXC_DP2x8(dst0, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, 16); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ +POW_SH_IMPL_2D(F16_F16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(F16_F16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +POW_SH_IMPL_2D(F16_F16toI8, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, int4)\n\ +POW_SH_IMPL_2D(F16_F16toU8, vxc_short8, vxc_half8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, int4)\n\ +POW_SH_IMPL_2D(F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(F16_I16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +POW_SH_IMPL_2D(I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, vxc_short8, int4)\n\ +POW_SH_IMPL_2D(I16_I16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(I16_I16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, int4)\n\ +POW_SH_IMPL_2D(F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(F16_I8toI8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_char16, vxc_char16, int4)\n\ +POW_SH_IMPL_2D(I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(I8_F16toI8, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, int4)\n\ +POW_SH_IMPL_2D(I8_I8toF16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(I8_I8toI8, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, vxc_char16, int4)\n\ +POW_SH_IMPL_2D(F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(F16_U8toU8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4)\n\ +POW_SH_IMPL_2D(U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(U8_F16toU8, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, int4)\n\ +POW_SH_IMPL_2D(U8_U8toF16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8, half4)\n\ +POW_SH_IMPL_2D(U8_U8toU8, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, vxc_uchar16, int4)\n\ \n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ -__kernel 
void pow_BF16BF16toBF16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ +__kernel void pow_BF16_BF16toBF16\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ \n\ @@ -30071,10 +27056,12 @@ __kernel void pow_BF16BF16toBF16(\n\ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void pow_BF16BF16toBF16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ +__kernel void pow_BF16_BF16toBF16_2D\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output\n\ + )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ \n\ @@ -30111,1057 +27098,7 @@ __kernel void pow_BF16BF16toBF16_2D(\n\ _viv_asm(COPY, src1, tmpDst1, 16);\n\ VXC_DP2x8(dst, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pow_fp16_i16_vx*/ - -static const char pow_fp16_i8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform float outScale_fl;\n\ -\n\ -__kernel void pow_F16F16toI8(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, src1;\n\ - vxc_char8 dst;\n\ - vxc_half8 data0, data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16F16toI8_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0, src1;\n\ - vxc_char8 dst;\n\ - vxc_half8 data0, data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16I8toF16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, dst;\n\ - vxc_char8 src1;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? 
convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16I8toF16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0, dst;\n\ - vxc_char8 src1;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16I8toI8(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0;\n\ - vxc_char8 src1, dst;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_F16I8toI8_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0;\n\ - vxc_char8 src1, dst;\n\ - vxc_half8 data0;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data0, src0, 16);\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pow_fp16_i8_vx*/ - -static const char pow_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform float outScale_fl;\n\ -\n\ -__kernel void pow_I16F16toF16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I16F16toF16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I16F16toI16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I16F16toI16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I16I16toI16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I16I16toI16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_short8 src0, src1, dst;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pow_i16_vx*/ - -static const char pow_i8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform float outScale_fl;\n\ -\n\ -__kernel void pow_I8F16toF16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_char8 src0;\n\ - vxc_short8 src1, dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I8F16toF16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_char8 src0;\n\ - vxc_short8 src1, dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I8F16toI8(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_char8 src0, dst;\n\ - vxc_short8 src1;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I8F16toI8_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_char8 src0, dst;\n\ - vxc_short8 src1;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I8I8toI8(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_char8 src0, src1, dst;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_I8I8toI8_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_char8 src0, src1, dst;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - VXC_DP4x4(x0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outScale_fl);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outScale_fl);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pow_i8_vx*/ - -static const char pow_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertSecUint8SubZpToFp32_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertFstDataToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertSecDataToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertUint8SubZpToFp32_4x4_2;\n\ -_viv_uniform VXC_512Bits uniConvertSecUint8SubZpToFp32_4x4_2;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ -\n\ -_viv_uniform int input_ZP0;\n\ -_viv_uniform int input_ZP1;\n\ -_viv_uniform float output_ZP;\n\ -_viv_uniform float outputScale;\n\ -\n\ -__kernel void pow_U8F16toF16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_uchar8 src0;\n\ - vxc_short8 src1;\n\ - vxc_short8 dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in0_zp;\n\ - _viv_asm(COPY, in0_zp, 
input_ZP0, 4);\n\ - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? -1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_U8F16toF16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_uchar8 src0;\n\ - vxc_short8 src1;\n\ - vxc_short8 dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in0_zp;\n\ - _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_U8F16toU8(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_uchar8 src0;\n\ - vxc_short8 src1;\n\ - vxc_uchar8 dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in0_zp;\n\ - _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_U8F16toU8_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_uchar8 src0;\n\ - vxc_short8 src1;\n\ - vxc_uchar8 dst;\n\ - vxc_half8 data1;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, data1, src1, 16);\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in0_zp;\n\ - _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y0, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstDataToFp32_4x4_2);\n\ - VXC_DP4x4(y1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecDataToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ -\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_U8U8toU8(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_uchar8 src0, src1, dst;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in0_zp, in1_zp;\n\ - _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ - _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_U8U8toU8_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_uchar8 src0, src1, dst;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in0_zp, in1_zp;\n\ - _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ - _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4_2);\n\ - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4_2);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - int4 tmpVal0 = convert_int4_rte(tmpDst0 * outputScale + output_ZP);\n\ - int4 tmpVal1 = convert_int4_rte(tmpDst1 * outputScale + output_ZP);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_U8U8toF16(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - vxc_uchar8 src0;\n\ - vxc_uchar8 src1;\n\ - vxc_short8 dst;\n\ - VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in0_zp, in1_zp;\n\ - _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ - _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - vxc_half8 tmpVal;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(tmpVal, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpVal, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pow_U8U8toF16_2D(\n\ - image2d_array_t input0,\n\ - image2d_array_t input1,\n\ - image2d_array_t output)\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - vxc_uchar8 src0;\n\ - vxc_uchar8 src1;\n\ - vxc_short8 dst;\n\ - VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - float4 x0, x1;\n\ - float4 y0, y1;\n\ - float4 tmpDst0, tmpDst1;\n\ - short in0_zp, in1_zp;\n\ - _viv_asm(COPY, in0_zp, input_ZP0, 4);\n\ - _viv_asm(COPY, in1_zp, input_ZP1, 4);\n\ - VXC_DP4x4(x0, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(x1, src0, in0_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y0, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(y1, src1, in1_zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecUint8SubZpToFp32_4x4);\n\ -\n\ - float4 s0 = sign(x0);\n\ - float4 s1 = sign(x1);\n\ - int4 t0 = convert_int4(y0) & 1;\n\ - int4 t1 = convert_int4(y1) & 1;\n\ - s0 = s0 == -1 ? convert_float4(t0) == 1.0f ? -1.0f : 1.0f : s0;\n\ - s1 = s1 == -1 ? convert_float4(t1) == 1.0f ? 
-1.0f : 1.0f : s1;\n\ - tmpDst0 = s0 * exp2(y0 * log2(fabs(x0)));\n\ - tmpDst1 = s1 * exp2(y1 * log2(fabs(x1)));\n\ -\n\ - half4 tmpVal0, tmpVal1;\n\ - vxc_half8 tmpVal;\n\ - _viv_asm(CONV, tmpVal0, tmpDst0);\n\ - _viv_asm(CONV, tmpVal1, tmpDst1);\n\ - VXC_DP2x8(tmpVal, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpVal, 16);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pow_u8_vx*/ +}"; /* end of pow_vx*/ static const char pre_process_bgra_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -32586,6 +28523,659 @@ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ \n\ +#define RESIZE_BILINEAR_4X1(mean, output) \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.w; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + coord_in.x = coord.x; \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + tmp_dst = tmp_dst * output_scale - mean * output_scale + output_zp; \\\n\ + _viv_asm(CONV, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage(output, coord_out, dst, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \\\n\ +__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output0, \\\n\ + __write_only image2d_array_t output1, \\\n\ + __write_only image2d_array_t output2, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int 
*yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float f32Var \\\n\ + ) \\\n\ +{ \\\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ + \\\n\ + int4 xPos = get_global_id(0); \\\n\ + int yPos = get_global_id(1); \\\n\ + \\\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\ + xPos += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\ + int4 sx = fx0 & 0xffff8000; \\\n\ + fx0 -= sx; \\\n\ + sx = sx >> 15; \\\n\ + \\\n\ + vxc_short4 fx; \\\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAddRShift); \\\n\ + \\\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\ + int sy = fy & 0xffff8000; \\\n\ + \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + \\\n\ + vxc_uchar16 line0Y; \\\n\ + vxc_uchar16 line1Y; \\\n\ + int4 coord; \\\n\ + int4 coord_in = (int4)(0, 0, 0, 0); \\\n\ + sx = sx + *xOffset; \\\n\ + coord = sx.xyzw; \\\n\ + coord_in.y = sy + *yOffset; \\\n\ + coord_in.x = coord.x; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.w; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + coord_in.x = coord.x; \\\n\ + \\\n\ + int4 test01, temp1; \\\n\ + int4 test02, temp2; \\\n\ + int4 tt; \\\n\ + vxc_uchar4 val; \\\n\ + int2 coord_out = (int2)(xPos.x, yPos); \\\n\ + \\\n\ + vxc_uchar8 line1, line2; \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + \\\n\ + vxc_float4 tmp_dst; \\\n\ + vxc_uchar4 u8_dst; \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + conv_type dst0; \\\n\ + dst_type dst1; \\\n\ + copy_type dst; \\\n\ + tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \\\n\ + _viv_asm(CONV, dst0, tmp_dst); \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst1, 
8); \\\n\ + VXC_WriteImage(output0, coord_out, dst, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + RESIZE_BILINEAR_4X1(gMean, output1) \\\n\ + RESIZE_BILINEAR_4X1(bMean, output2) \\\n\ +}\n\ +PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8)\n\ +PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8)\n\ +\n\ +#define PRE_PROCESS_RGB888_PLANAR_8BITS(dst_name, write_type) \\\n\ +__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output0, \\\n\ + __write_only image2d_array_t output1, \\\n\ + __write_only image2d_array_t output2, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float f32Var \\\n\ + ) \\\n\ +{ \\\n\ + int2 ratioXY = (int2)(*xRatio, *yRatio); \\\n\ + int4 xPos = get_global_id(0); \\\n\ + int yPos = get_global_id(1); \\\n\ + \\\n\ + int2 ratioSufXY = (ratioXY >> 1) - (1 << 14); \\\n\ + xPos += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + int4 fx0 = xPos * ratioXY.x + ratioSufXY.x; \\\n\ + int4 sx = fx0 & 0xffff8000; \\\n\ + fx0 -= sx; \\\n\ + sx = sx >> 15; \\\n\ + \\\n\ + vxc_short4 fx; \\\n\ + VXC_DP4x4(fx, fx0, 1 << 4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAddRShift); \\\n\ + \\\n\ + int fy = yPos * ratioXY.y + ratioSufXY.y; \\\n\ + int sy = fy & 0xffff8000; \\\n\ + \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + \\\n\ + vxc_uchar16 line0Y; \\\n\ + vxc_uchar16 line1Y; \\\n\ + int4 coord; \\\n\ + sx = sx + *xOffset; \\\n\ + coord.xyz = sx.xyz; \\\n\ + coord.w = sy + *yOffset; \\\n\ + int2 coord1 = (int2)(sx.w, coord.w); \\\n\ + int4 coord_in = (int4)(coord.xw, 0, 0); \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord1.x; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int4 test01, temp1; \\\n\ + int4 test02, temp2; \\\n\ + int2 coord_out = (int2)(xPos.x, yPos); \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - 
temp1) + (temp1 << 10); \\\n\ + \\\n\ + vxc_float4 tmp_dst; \\\n\ + vxc_uchar4 u8_dst; \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + \\\n\ + int4 dst0; \\\n\ + write_type dst; \\\n\ + tmp_dst = tmp_dst * output_scale - rMean * output_scale + output_zp; \\\n\ + dst0 = convert_int4_rte(tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output0, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + coord_in.x = coord.x; \\\n\ + coord_in.z = 1; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord1.x; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + tmp_dst = tmp_dst * output_scale - gMean * output_scale + output_zp; \\\n\ + dst0 = convert_int4_rte(tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output1, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + coord_in.x = coord.x; \\\n\ + coord_in.z = 2; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord.y; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 
0)); \\\n\ + coord_in.x = coord.z; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = coord1.x; \\\n\ + VXC_ReadImage2DArray(line0Y, input, coord_in, 0, \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(line1Y, input, coord_in, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(test01, line0Y, line0Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp1, line0Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp1 = temp1 + test01; \\\n\ + \\\n\ + VXC_DP4x4(test02, line1Y, line1Y, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniVecShift10); \\\n\ + VXC_DP4x4(temp2, line1Y, fx, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniGetTempVal); \\\n\ + temp2 = temp2 + test02; \\\n\ + temp2 = fy * (temp2 - temp1) + (temp1 << 10); \\\n\ + VXC_DP4x4(u8_dst, temp2, 1 << 19, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniExtractBytes); \\\n\ + VXC_DP4x4(tmp_dst, u8_dst, u8_dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertIntergetoF32_4x4); \\\n\ + tmp_dst = tmp_dst * output_scale - bMean * output_scale + output_zp; \\\n\ + dst0 = convert_int4_rte(tmp_dst); \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16)\n\ +PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16)"; /* end of pre_process_rgb888_planar_0_vx*/ + +static const char pre_process_rgb888_planar_1_vx[] = "\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\ +_viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\ +\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +#define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\ +__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output0, \\\n\ + __write_only image2d_array_t output1, \\\n\ + __write_only image2d_array_t output2, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float f32Var \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + coord.xy += (int2)(*xOffset, *yOffset); \\\n\ + vxc_uchar16 src0, src1, src2; \\\n\ + dst_type dst0, dst1; \\\n\ + \\\n\ + int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + coord.x = coord.z + 8; \\\n\ + float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, \\\n\ + rMean * output_scale - output_zp, output_scale); \\\n\ + \\\n\ + half4 
paramData_f16; \\\n\ + copy_type tmp_dst; \\\n\ + _viv_asm(CONV, paramData_f16, paramData0); \\\n\ + VXC_DP2x8(dst0, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst1, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ + VXC_WriteImage(output0, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ + VXC_WriteImage(output0, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \\\n\ + gMean * output_scale - output_zp, output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData1); \\\n\ + VXC_DP2x8(dst0, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst1, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ + VXC_WriteImage(output1, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ + VXC_WriteImage(output1, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \\\n\ + bMean * output_scale - output_zp, output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData2); \\\n\ + VXC_DP2x8(dst0, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst1, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + _viv_asm(COPY, tmp_dst, dst0, 16); \\\n\ + VXC_WriteImage(output2, coord.zw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ + VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\ +PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\ +\n\ +#define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \\\n\ +__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output0, \\\n\ + __write_only image2d_array_t output1, \\\n\ + __write_only image2d_array_t output2, \\\n\ + global int *xRatio, \\\n\ + global int *yRatio, \\\n\ + global int *xOffset, \\\n\ + global int *yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float f32Var \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + coord.xy += (int2) (*xOffset, *yOffset); \\\n\ + vxc_uchar16 src0, src1, src2; \\\n\ + write_type dst; \\\n\ + \\\n\ + int4 coord_in = (int4)(coord.xy, 0, 0); \\\n\ + VXC_ReadImage2DArray(src0, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src1, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.z ++; \\\n\ + VXC_ReadImage2DArray(src2, input, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 paramData0 = (float4)(rMean * output_scale - output_zp, rMean * output_scale - output_zp, 
\\\n\ + rMean * output_scale - output_zp, output_scale); \\\n\ + \\\n\ + half4 paramData_f16; \\\n\ + _viv_asm(CONV, paramData_f16, paramData0); \\\n\ + \\\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst, src0, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + VXC_WriteImage(output0, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 paramData1 = (float4)(gMean * output_scale - output_zp, gMean * output_scale - output_zp, \\\n\ + gMean * output_scale - output_zp, output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData1); \\\n\ + \\\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst, src1, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + VXC_WriteImage(output1, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 paramData2 = (float4)(bMean * output_scale - output_zp, bMean * output_scale - output_zp, \\\n\ + bMean * output_scale - output_zp, output_scale); \\\n\ + _viv_asm(CONV, paramData_f16, paramData2); \\\n\ + \\\n\ + VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevLo_2x8); \\\n\ + VXC_DP2x8(dst, src2, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniDataMeanStddevHi_2x8); \\\n\ + VXC_WriteImage(output2, coord.zw, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)\n\ +PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)\n\ +"; /* end of pre_process_rgb888_planar_1_vx*/ + +static const char pre_process_rgb888_planar_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4;\n\ +\n\ +__kernel void pre_process_rgb888_planar_4over3_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output0,\n\ + __write_only image2d_array_t output1,\n\ + __write_only image2d_array_t output2,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_out;\n\ +\n\ + vxc_uchar16 src0, src1, src2, src3;\n\ + vxc_uchar16 dst0, dst1, dst2;\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ + coord_out.xy = (coord_in.xy >> 2) * 3;\n\ + coord_out.zw = coord_in.yy + (int2)(1, 2);\n\ +\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ + VXC_DP2x8(dst0, src0, src0, 
VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ +\n\ + VXC_WriteImage(output0, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output0, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output0, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ +\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ +\n\ + VXC_WriteImage(output1, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output1, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output1, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 
1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ +\n\ + VXC_WriteImage(output2, coord_out.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output2, coord_out.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pre_process_rgb888_planar_half_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output0,\n\ + __write_only image2d_array_t output1,\n\ + __write_only image2d_array_t output2,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + vxc_uchar16 src0, src1, src2;\n\ +\n\ + VXC_ReadImage2DArray(src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ + VXC_ReadImage2DArray(src1, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.z ++;\n\ + VXC_ReadImage2DArray(src2, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int2 coord = coord_in.xy >> 1;\n\ +\n\ + VXC_WriteImage(output0, coord, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output1, coord, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output2, coord, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of pre_process_rgb888_planar_2_vx*/ + +static const char pre_process_rgb888_planar_sep_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniVecShift10;\n\ +_viv_uniform VXC_512Bits uniAddRShift;\n\ +_viv_uniform VXC_512Bits uniGetTempVal;\n\ +_viv_uniform VXC_512Bits uniExtractBytes;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertIntergetoF32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ #define RESIZE_BILINEAR_4X1(input, mean, output) \\\n\ VXC_ReadImage(line0Y, input, coord.xw, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_ReadImage(line0Y, input, coord.yw, 0, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ @@ -32624,8 +29214,8 @@ _viv_uniform float output_zp;\n\ VXC_WriteImage(output, coord_out, dst, \\\n\ VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ \n\ -#define PRE_PROCESS_RGB888_PLANAR_16BITS(dst_name, dst_type, conv_type, copy_type) \\\n\ -__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ +#define RGB888_PLANAR_SEP_16BITS(dst_name, dst_type, conv_type, copy_type) \\\n\ +__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ @@ -32729,11 +29319,11 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ 
RESIZE_BILINEAR_4X1(input1, gMean, output1) \\\n\ RESIZE_BILINEAR_4X1(input2, bMean, output2) \\\n\ }\n\ -PRE_PROCESS_RGB888_PLANAR_16BITS(F16, vxc_half8, half4, vxc_short8)\n\ -PRE_PROCESS_RGB888_PLANAR_16BITS(I16, vxc_short8, int4, vxc_short8)\n\ +RGB888_PLANAR_SEP_16BITS(F16, vxc_half8, half4, vxc_short8)\n\ +RGB888_PLANAR_SEP_16BITS(I16, vxc_short8, int4, vxc_short8)\n\ \n\ -#define PRE_PROCESS_RGB888_PLANAR_8BITS(dst_name, write_type) \\\n\ -__kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ +#define RGB888_PLANAR_SEP_8BITS(dst_name, write_type) \\\n\ +__kernel void pre_process_rgb888_planar_sep_scale_U8to##dst_name \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ @@ -32901,19 +29491,10 @@ __kernel void pre_process_rgb888_planar_scale_U8to##dst_name \\\n\ \\\n\ VXC_WriteImage(output2, coord_out, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -PRE_PROCESS_RGB888_PLANAR_8BITS(U8, vxc_uchar16)\n\ -PRE_PROCESS_RGB888_PLANAR_8BITS(I8, vxc_char16)"; /* end of pre_process_rgb888_planar_0_vx*/ +RGB888_PLANAR_SEP_8BITS(U8, vxc_uchar16)\n\ +RGB888_PLANAR_SEP_8BITS(I8, vxc_char16)"; /* end of pre_process_rgb888_planar_sep_0_vx*/ -static const char pre_process_rgb888_planar_1_vx[] = "/*\n\ - ============================================================================\n\ - Name : GrayScale.vx\n\ - Author : Sam\n\ - Version :\n\ - Copyright : Your copyright notice\n\ - Description :\n\ - ============================================================================\n\ - */\n\ -#include \"cl_viv_vx_ext.h\"\n\ +static const char pre_process_rgb888_planar_sep_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniDataMeanStddevLo_2x8;\n\ _viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\ @@ -32921,8 +29502,8 @@ _viv_uniform VXC_512Bits uniDataMeanStddevHi_2x8;\n\ _viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ \n\ -#define PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\ -__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ +#define RGB888_PLANAR_SEP_COPY_16BITS(dst_name, dst_type, copy_type) \\\n\ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ @@ -32990,11 +29571,11 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ _viv_asm(COPY, tmp_dst, dst1, 16); \\\n\ VXC_WriteImage(output2, coord.xw, tmp_dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ -PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\ -PRE_PROCESS_RGB888_PLANAR_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\ +RGB888_PLANAR_SEP_COPY_16BITS(F16, vxc_half8, vxc_short8)\n\ +RGB888_PLANAR_SEP_COPY_16BITS(I16, vxc_short8, vxc_short8)\n\ \n\ #define PRE_PROCESS_GRAY_COPY_8BITS(dst_name, write_type) \\\n\ -__kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ +__kernel void pre_process_rgb888_planar_sep_copy_U8to##dst_name \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ @@ -33056,9 +29637,9 @@ __kernel void pre_process_rgb888_planar_copy_U8to##dst_name \\\n\ }\n\ PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)\n\ PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)\n\ -"; /* end of pre_process_rgb888_planar_1_vx*/ +"; /* end of pre_process_rgb888_planar_sep_1_vx*/ -static const char pre_process_rgb888_planar_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char 
pre_process_rgb888_planar_sep_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8;\n\ _viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8;\n\ @@ -33066,7 +29647,7 @@ _viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4;\n\ _viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4;\n\ _viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4;\n\ \n\ -__kernel void pre_process_rgb888_planar_4over3_U8toU8\n\ +__kernel void pre_process_rgb888_planar_sep_4over3_U8toU8\n\ (\n\ __read_only image2d_array_t input0,\n\ __read_only image2d_array_t input1,\n\ @@ -33148,7 +29729,7 @@ __kernel void pre_process_rgb888_planar_4over3_U8toU8\n\ VXC_WriteImage(output2, coord_out.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void pre_process_rgb888_planar_half_U8toU8\n\ +__kernel void pre_process_rgb888_planar_sep_half_U8toU8\n\ (\n\ __read_only image2d_array_t input0,\n\ __read_only image2d_array_t input1,\n\ @@ -33180,7 +29761,7 @@ __kernel void pre_process_rgb888_planar_half_U8toU8\n\ VXC_WriteImage(output1, coord_in.zw, src1.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage(output2, coord_in.zw, src2.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ }\n\ -"; /* end of pre_process_rgb888_planar_2_vx*/ +"; /* end of pre_process_rgb888_planar_sep_2_vx*/ static const char pre_process_rgb_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -41765,6 +38346,102 @@ __kernel void resize_bilinear_U8toU8_UP_opt\n\ \n\ #endif"; /* end of resize_bilinear_U8_opt_vx*/ +static const char resize_bilinear_align_corners_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinear_8x_l10_4x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_8x_l11_4x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_8x_l20_4x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_8x_l21_4x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_8x_l30_4x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_8x_l31_4x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_8x_l40_4x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_8x_l41_4x8;\n\ +__kernel void resize_bilinear_U8toU8_SAME_8x_upsample_align_corners\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ +\n\ +\n\ + vxc_uchar16 in0, in1, dst;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ + VXC_OP4(img_load_3d, in0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.xy = coord.xy << 3;\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ +\n\ +\n\ + VXC_DP4x8(dst, in0, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l10_4x8);\n\ + VXC_DP4x8(dst, in0, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l11_4x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ +\n\ + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), 
uniBilinear_8x_l10_4x8);\n\ + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l11_4x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ +\n\ + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l20_4x8);\n\ + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l21_4x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ +\n\ + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l30_4x8);\n\ + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l31_4x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ +\n\ + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l40_4x8);\n\ + VXC_DP4x8(dst, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l41_4x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ +\n\ + VXC_DP4x8(dst, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l30_4x8);\n\ + VXC_DP4x8(dst, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l31_4x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ +\n\ + VXC_DP4x8(dst, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l20_4x8);\n\ + VXC_DP4x8(dst, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l21_4x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord.y ++;\n\ +\n\ + VXC_DP4x8(dst, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l10_4x8);\n\ + VXC_DP4x8(dst, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniBilinear_8x_l11_4x8);\n\ +\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xywz, dst,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of resize_bilinear_align_corners_vx*/ + static const char resize_bilinear_nhwc_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniResize_x2_nhwc2_0_4x8;\n\ @@ -41970,6 +38647,161 @@ __kernel void resize_bilinear_nhwc_U8toU8_4x_upsample_half_pixel_centers\n\ VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ }"; /* end of resize_bilinear_nhwc_vx*/ +static const char resize_bilinear_nhwc_bound_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniResize_x2_nhwc2_0_4x8;\n\ +_viv_uniform int2 x_coord;\n\ +\n\ +__kernel void resize_bilinear_nhwc_bound_U8toU8_2x\n\ + (\n\ + __read_only image2d_array_t input,\n\ + image2d_array_t output,\n\ + __write_only image2d_array_t output1\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(1, get_global_id(0), get_global_id(0), get_global_id(0));\n\ + int2 coord_in = (int2)(1, get_global_id(0));\n\ + coord_in.y = ((coord_out.y * 2 - 1) >> 2);\n\ + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y;\n\ +\n\ + vxc_uchar16 in0, in1, in2, in3, result;\n\ +\n\ + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.z = coord_out.y + 1;\n\ +\n\ + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8);\n\ + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ + VXC_DP4x8(result, in2, in1, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8);\n\ + VXC_WriteImage(output, coord_out.xz, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.x = x_coord.x;\n\ +\n\ + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.x = x_coord.y;\n\ +\n\ + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8);\n\ + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ + VXC_DP4x8(result, in2, in1, VXC_MODIFIER(0, 0, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8);\n\ + VXC_WriteImage(output, coord_out.xz, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l10_4x4;\n\ +__kernel void resize_bilinear_nhwc_bound_U8toU8_3x\n\ + (\n\ + __read_only image2d_array_t input,\n\ + image2d_array_t output,\n\ + __write_only image2d_array_t output1\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(1, get_global_id(0), get_global_id(0), get_global_id(0));\n\ + int2 coord_in = (int2)(1, get_global_id(0));\n\ + coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6;\n\ + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y;\n\ +\n\ + vxc_uchar16 in0, in1, in2, in3, result;\n\ +\n\ + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.zw = coord_out.yy + (int2)(1, 2);\n\ +\n\ + VXC_DP4x4(result, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4);\n\ + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xz, in1, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(result, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4);\n\ + VXC_WriteImage(output, coord_out.xw, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.x = x_coord.x;\n\ +\n\ + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.x = x_coord.y;\n\ +\n\ + VXC_DP4x4(result, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4);\n\ + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xz, in1, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(result, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4);\n\ + VXC_WriteImage(output, coord_out.xw, result, VXC_MODIFIER(0, 0, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l00_4x8;\n\ +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l10_4x8;\n\ +__kernel void resize_bilinear_nhwc_bound_U8toU8_4x\n\ + (\n\ + __read_only image2d_array_t input,\n\ + image2d_array_t output,\n\ + __write_only image2d_array_t output1\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(1, get_global_id(0), get_global_id(0), get_global_id(0));\n\ + int2 coord_in = (int2)(1, get_global_id(0));\n\ + coord_in.y = (coord_out.y * 2 - 3) >> 3;\n\ + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y;\n\ +\n\ + vxc_uchar16 in0, in1, in2, in3, dst0, dst1;\n\ +\n\ + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.z = coord_out.y + 1;\n\ +\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8);\n\ + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x += 2;\n\ + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x -= 2;\n\ +\n\ + coord_out.zw = coord_out.zz + (int2)(1, 2);\n\ + VXC_DP4x8(dst0, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8);\n\ + VXC_DP4x8(dst1, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8);\n\ + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x += 2;\n\ + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.x = x_coord.x;\n\ +\n\ + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.x = x_coord.y;\n\ + coord_out.z = coord_out.y + 1;\n\ +\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8);\n\ + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x -= 2;\n\ + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x += 2;\n\ +\n\ + coord_out.zw = coord_out.zz + (int2)(1, 2);\n\ + VXC_DP4x8(dst0, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8);\n\ + VXC_DP4x8(dst1, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8);\n\ + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x -= 2;\n\ + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ 
+}\n\ +"; /* end of resize_bilinear_nhwc_bound_vx*/ + static const char resize_nearest_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniMultiplyAndPostShift_2x8;\n\ @@ -43161,24 +39993,25 @@ __kernel void scatter_nd_update_F16F16toU8_big(\n\ static const char select_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniConvConditiontoDst_2x8;\n\ -_viv_uniform VXC_512Bits uniConvIntIn0toDst_2x8;\n\ -_viv_uniform VXC_512Bits uniConvIntIn1toDst_2x8;\n\ -_viv_uniform VXC_512Bits uniU8SubZP_MulM_PStoF16In0_2x8;\n\ -_viv_uniform VXC_512Bits uniU8SubZP_MulM_PStoF16In1_2x8;\n\ -_viv_uniform int input0Zp;\n\ -_viv_uniform int input1Zp;\n\ -_viv_uniform int outputZP;\n\ -_viv_uniform VXC_512Bits uniU8AddZP_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ \n\ #define SELECT_INT(type_name, read_fun, write_fun) \\\n\ - type_name tmp, src0, src1, dst, value; \\\n\ + type_name src0, src1, dst, value; \\\n\ vxc_char8 value_tmp; \\\n\ - read_fun(tmp, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + vxc_ushort8 mp0, mp1; \\\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_DP2x8(src0, tmp, tmp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvIntIn0toDst_2x8); \\\n\ - read_fun(tmp, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_DP2x8(src1, tmp, tmp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvIntIn1toDst_2x8); \\\n\ + VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8MulAndPostShift0_Lo_2x8); \\\n\ + VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8MulAndPostShift1_Lo_2x8); \\\n\ read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP2x8(value, value_tmp, value_tmp,\\\n\ @@ -43198,6 +40031,7 @@ __kernel void select_##cond_name##_##src_name##_##src_name##to##dst_name( \\\n\ }\n\ \n\ SELECT_INT_FUN(I8, I8, I8, vxc_char8)\n\ +SELECT_INT_FUN(I8, U8, U8, vxc_uchar8)\n\ SELECT_INT_FUN(I8, I16, I16, vxc_short8)\n\ \n\ #define SELECT_INT_FUN_2D(cond_name, src_name, dst_name, type_name) \\\n\ @@ -43212,6 +40046,7 @@ __kernel void select_##cond_name##_##src_name##_##src_name##to##dst_name##_2D( \ }\n\ \n\ SELECT_INT_FUN_2D(I8, I8, I8, vxc_char8)\n\ +SELECT_INT_FUN_2D(I8, U8, U8, vxc_uchar8)\n\ SELECT_INT_FUN_2D(I8, I16, I16, vxc_short8)\n\ \n\ #define SELECT_HALF(read_fun, write_fun) \\\n\ @@ -43248,47 +40083,111 @@ __kernel void select_I8_F16_F16toF16_2D(\n\ SELECT_HALF(VXC_ReadImage, VXC_WriteImage)\n\ }\n\ \n\ -#define SELECT_U8(read_fun, write_fun) \\\n\ - vxc_uchar8 tmp, src0, src1, dst; \\\n\ - vxc_char8 value; \\\n\ - vxc_half8 tmp1; \\\n\ - vxc_uchar16 input0_ZP, input1_ZP, output_ZP; \\\n\ - _viv_asm(COPY, input0_ZP, input0Zp, 4); \\\n\ - _viv_asm(COPY, input1_ZP, input1Zp, 4); \\\n\ - _viv_asm(COPY, output_ZP, outputZP, 4); \\\n\ - read_fun(tmp, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ +#define SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, read_fun, write_fun) \\\n\ + vxc_short8 src0, 
src1, dst, value; \\\n\ + vxc_half8 value0, value1; \\\n\ + src0_type r0; \\\n\ + src1_type r1; \\\n\ + copy0_type v0; \\\n\ + copy1_type v1; \\\n\ + vxc_char8 value_tmp; \\\n\ + vxc_ushort8 mp0, mp1; \\\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_DP2x8(tmp1, tmp, input0_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniU8SubZP_MulM_PStoF16In0_2x8); \\\n\ - VXC_DP2x8(src0, tmp1, output_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8AddZP_2x8); \\\n\ - read_fun(tmp, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + _viv_asm(COPY, v0, src0, 16); \\\n\ + read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_DP2x8(tmp1, tmp, input1_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniU8SubZP_MulM_PStoF16In1_2x8); \\\n\ - VXC_DP2x8(src1, tmp1, output_ZP, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniU8AddZP_2x8); \\\n\ - read_fun(value, condition, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + _viv_asm(COPY, v1, src1, 16); \\\n\ + VXC_DP2x8(value0, v0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8MulAndPostShift0_Lo_2x8); \\\n\ + _viv_asm(COPY, src0, value0, 16); \\\n\ + VXC_DP2x8(value1, v1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniU8MulAndPostShift1_Lo_2x8); \\\n\ + _viv_asm(COPY, src1, value1, 16); \\\n\ + read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(value, value_tmp, value_tmp,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \\\n\ dst = (value != 0 ? 
src0 : src1); \\\n\ write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ -__kernel void select_I8_U8_U8toU8(\n\ +#define SELECT_HYBRID_TOF16_FUN(name, src0_type, copy0_type, src1_type, copy1_type) \\\n\ +__kernel void select_##name( \\\n\ + __read_only image2d_array_t condition, \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, \\\n\ + VXC_ReadImage2DArray, VXC_WriteImage2DArray) \\\n\ +}\n\ +SELECT_HYBRID_TOF16_FUN(I8_F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16)\n\ +SELECT_HYBRID_TOF16_FUN(I8_U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8)\n\ +SELECT_HYBRID_TOF16_FUN(I8_F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16)\n\ +SELECT_HYBRID_TOF16_FUN(I8_I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8)\n\ +SELECT_HYBRID_TOF16_FUN(I8_F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +SELECT_HYBRID_TOF16_FUN(I8_I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8)\n\ +\n\ +#define SELECT_HYBRID_TOF16_FUN_2D(name, src0_type, copy0_type, src1_type, copy1_type) \\\n\ +__kernel void select_##name( \\\n\ + __read_only image2d_array_t condition, \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_array_t input1, \\\n\ + __write_only image2d_array_t output) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, \\\n\ + VXC_ReadImage, VXC_WriteImage) \\\n\ +}\n\ +SELECT_HYBRID_TOF16_FUN_2D(I8_F16_U8toF16_2D, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16)\n\ +SELECT_HYBRID_TOF16_FUN_2D(I8_U8_F16toF16_2D, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8)\n\ +SELECT_HYBRID_TOF16_FUN_2D(I8_F16_I8toF16_2D, vxc_short8, vxc_half8, vxc_char16, vxc_char16)\n\ +SELECT_HYBRID_TOF16_FUN_2D(I8_I8_F16toF16_2D, vxc_char16, vxc_char16, vxc_short8, vxc_half8)\n\ +SELECT_HYBRID_TOF16_FUN_2D(I8_F16_I16toF16_2D, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +SELECT_HYBRID_TOF16_FUN_2D(I8_I16_F16toF16_2D, vxc_short8, vxc_short8, vxc_short8, vxc_half8)\n\ +\n\ +#define SELECT_HALF_TO_QINT(read_fun, write_fun, dst_type) \\\n\ + vxc_short8 src0, src1, tmp_dst, value; \\\n\ + vxc_half8 data; \\\n\ + dst_type dst; \\\n\ + vxc_char8 value_tmp; \\\n\ + read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(value, value_tmp, value_tmp,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \\\n\ + tmp_dst = (value != 0 ? 
src0 : src1); \\\n\ + _viv_asm(COPY, data, tmp_dst, 16); \\\n\ + vxc_ushort8 mp0; \\\n\ + _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(dst, data, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift0_Lo_2x8); \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void select_I8_F16_F16toU8(\n\ __read_only image2d_array_t condition,\n\ __read_only image2d_array_t input0,\n\ __read_only image2d_array_t input1,\n\ __write_only image2d_array_t output)\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - SELECT_U8(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ + SELECT_HALF_TO_QINT(VXC_ReadImage2DArray, VXC_WriteImage2DArray, vxc_uchar16)\n\ }\n\ \n\ -__kernel void select_I8_U8_U8toU8_2D(\n\ +__kernel void select_I8_F16_F16toU8_2D(\n\ __read_only image2d_array_t condition,\n\ __read_only image2d_array_t input0,\n\ __read_only image2d_array_t input1,\n\ __write_only image2d_array_t output)\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ - SELECT_U8(VXC_ReadImage, VXC_WriteImage)\n\ + SELECT_HALF_TO_QINT(VXC_ReadImage, VXC_WriteImage, vxc_uchar16)\n\ }\n\ "; /* end of select_vx*/ @@ -43667,7 +40566,7 @@ __kernel void slice_##name0##_I32to##name1##_2D \\\n\ SLICE_8BITSTO8BITS_2D(I8, I8, vxc_char16, vxc_char16)\n\ SLICE_8BITSTO8BITS_2D(U8, U8, vxc_uchar16, vxc_uchar16)\n\ \n\ -#define SLICE_16BITS_TO(name0, name1, src_type, copy_type, dst_type) \\\n\ +#define SLICE_16BITS_TO(name0, name1, src_type, copy_type, dst_type, save_type) \\\n\ __kernel void slice_##name0##_I32to##name1 \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ @@ -43679,7 +40578,7 @@ __kernel void slice_##name0##_I32to##name1 \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ src_type src; \\\n\ copy_type src0; \\\n\ - dst_type dst; \\\n\ + dst_type result; \\\n\ int4 coord_in; \\\n\ Image begin_img = create_image_from_image2d(input1, 4); \\\n\ uchar* begin_ptr = begin_img.ptr; \\\n\ @@ -43691,15 +40590,19 @@ __kernel void slice_##name0##_I32to##name1 \\\n\ \\\n\ vxc_ushort8 multiplier; \\\n\ _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ - VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(result, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_Lo_2x8); \\\n\ + save_type dst; \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ -SLICE_16BITS_TO(F16, I8, vxc_half8, vxc_short8, vxc_char16)\n\ -SLICE_16BITS_TO(F16, U8, vxc_half8, vxc_short8, vxc_uchar16)\n\ -SLICE_16BITS_TO(F16, I16, vxc_half8, vxc_short8, vxc_short8)\n\ +SLICE_16BITS_TO(F16, I8, vxc_half8, vxc_short8, vxc_char16, vxc_char16)\n\ +SLICE_16BITS_TO(F16, U8, vxc_half8, vxc_short8, vxc_uchar16, vxc_uchar16)\n\ +SLICE_16BITS_TO(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8)\n\ +SLICE_16BITS_TO(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)\n\ +SLICE_16BITS_TO(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_short8)\n\ \n\ -#define SLICE_16BITS_TO_2D(name0, name1, src_type, copy_type, dst_type) \\\n\ +#define SLICE_16BITS_TO_2D(name0, name1, src_type, copy_type, dst_type, save_type) \\\n\ __kernel void slice_##name0##_I32to##name1##_2D \\\n\ ( \\\n\ __read_only image2d_array_t input0, \\\n\ @@ -43711,7 +40614,7 @@ __kernel void slice_##name0##_I32to##name1##_2D \\\n\ int2 coord = 
(int2)(get_global_id(0), get_global_id(1)); \\\n\ src_type src; \\\n\ copy_type src0; \\\n\ - dst_type dst; \\\n\ + dst_type result; \\\n\ int2 coord_in; \\\n\ Image begin_img = create_image_from_image2d(input1, 4); \\\n\ uchar* begin_ptr = begin_img.ptr; \\\n\ @@ -43723,13 +40626,18 @@ __kernel void slice_##name0##_I32to##name1##_2D \\\n\ \\\n\ vxc_ushort8 multiplier; \\\n\ _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ - VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + VXC_DP2x8(result, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_Lo_2x8); \\\n\ + save_type dst; \\\n\ + _viv_asm(COPY, dst, result, 16); \\\n\ VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ -SLICE_16BITS_TO_2D(F16, I8, vxc_half8, vxc_short8, vxc_char16)\n\ -SLICE_16BITS_TO_2D(F16, U8, vxc_half8, vxc_short8, vxc_uchar16)\n\ -SLICE_16BITS_TO_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8)"; /* end of slice_vx*/ +SLICE_16BITS_TO_2D(F16, I8, vxc_half8, vxc_short8, vxc_char16, vxc_char16)\n\ +SLICE_16BITS_TO_2D(F16, U8, vxc_half8, vxc_short8, vxc_uchar16, vxc_uchar16)\n\ +SLICE_16BITS_TO_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8, vxc_short8)\n\ +SLICE_16BITS_TO_2D(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)\n\ +SLICE_16BITS_TO_2D(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_short8)\n\ +"; /* end of slice_vx*/ static const char space2depth_internal_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -47586,6 +44494,802 @@ __kernel void clip_U8toF32_2D(\n\ }\n\ "; /* end of clip_U8_cl*/ +static const char cumsum_cl[] = "__kernel void cumsum_F32toF32_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int channel,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord_out.z = channel - 1;\n\ + write_imagef(output, coord_out, sum);\n\ +\n\ + for(coord.z = channel - 1; coord.z > 0; coord.z--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.z--;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord_out.z = 0;\n\ + write_imagef(output, coord_out, sum);\n\ + for(coord.z = 0; coord.z < channel - 1; coord.z++)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.z++;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_U8toU8_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int channel,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 
get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + uint4 sum = (uint4)(0);\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + float cnt = 0.0f;\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord_out.z = channel - 1;\n\ + write_imageui(output, coord_out, dst);\n\ + for(coord.z = channel - 1; coord.z > 0; coord.z--)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + coord_out.z--;\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord_out, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord_out.z = 0;\n\ + write_imageui(output, coord_out, dst);\n\ + for(coord.z = 0; coord.z < channel - 1; coord.z++)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + coord_out.z++;\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord_out, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_F32toF32_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int channel,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord_out.y = height - 1;\n\ + write_imagef(output, coord_out, sum);\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.y--;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord_out.y = 0;\n\ + write_imagef(output, coord_out, sum);\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.y++;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_U8toU8_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t 
output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int channel,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + uint4 sum = (uint4)(0);\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + float cnt = 0;\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord_out.y = height - 1;\n\ + write_imageui(output, coord_out, dst);\n\ +\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + cnt += 1.0f;\n\ + coord_out.y--;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord_out, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord_out.y = 0;\n\ + write_imageui(output, coord_out, dst);\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + cnt += 1.0f;\n\ + coord_out.y++;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord_out, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_F32toF32_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int channel,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord_out.x = width - 1;\n\ + write_imagef(output, coord_out, sum);\n\ + for(coord.x = width - 1; coord.x > 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.x--;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord_out.x = 0;\n\ + write_imagef(output, coord_out, sum);\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.x++;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ + 
{\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_U8toU8_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int channel,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + uint4 sum = (uint4)(0);\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + float cnt = 0;\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord_out.x = width - 1;\n\ + write_imageui(output, coord_out, dst);\n\ + for(coord.x = width - 1; coord.x > 0; coord.x--)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + coord_out.x--;\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord_out, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord_out.x = 0;\n\ + write_imageui(output, coord_out, dst);\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + coord_out.x++;\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord_out, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ + {\n\ + uint4 data = read_imageui(input, coord);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord, dst);\n\ + }\n\ + }\n\ +}\n\ +"; /* end of cumsum_cl*/ + +static const char cumsum_2d_cl[] = "\n\ +__kernel void cumsum_F32toF32_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.w = height - 1;\n\ + write_imagef(output, coord.zw, sum);\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + coord.w--;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.zw, sum);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + write_imagef(output, coord.zw, sum);\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + coord.w++;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.zw, sum);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + 
for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.xy, sum);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.xy, sum);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_U8toU8_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + uint4 sum = (uint4)(0);\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + float cnt = 0;\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.w = height - 1;\n\ + write_imageui(output, coord.zw, sum);\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.w--;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + write_imageui(output, coord.zw, sum);\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.w++;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_F32toF32_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.x = width - 1;\n\ + coord.z = coord.x;\n\ + write_imagef(output, coord.zw, sum);\n\ + for(; coord.x > 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + coord.z--;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.zw, sum);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord.z = 0;\n\ + write_imagef(output, coord.zw, sum);\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + 
{\n\ + float4 data = read_imagef(input, coord.xy);\n\ + coord.z++;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.zw, sum);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.xy, sum);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.xy, sum);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_U8toU8_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + uint4 sum = (uint4)(0);\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + float cnt = 0.0f;\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.x = width - 1;\n\ + coord.z = coord.x;\n\ + write_imageui(output, coord.zw, sum);\n\ + for(; coord.x > 0; coord.x--)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + coord.z--;\n\ + cnt += 1.0;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord.z = 0;\n\ + write_imageui(output, coord.zw, sum);\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.z++;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ +}\n\ +"; /* end of cumsum_2d_cl*/ + static const char depth2space_crd_cl[] = "\n\ __kernel void depth2space_crd_F32toF32(\n\ image2d_array_t input, image2d_array_t output, int block_size)\n\ @@ -47949,6 +45653,21 @@ float eltwise_unary_celu(float val, float alpha, float rcp_alpha)\n\ return val < 0 ? 
x : val;\n\ }\n\ \n\ +float eltwise_unary_rcp(float val, float alpha, float rcp_alpha)\n\ +{\n\ + return 1.0f / val;\n\ +}\n\ +\n\ +float eltwise_unary_sign(float val, float alpha, float rcp_alpha)\n\ +{\n\ + return sign(val);\n\ +}\n\ +\n\ +float eltwise_unary_softsign(float val, float alpha, float rcp_alpha)\n\ +{\n\ + return val / (1.0f + fabs(val));\n\ +}\n\ +\n\ #define ELTWISE_UNARY_F32_2D(func_name) \\\n\ __kernel void func_name##_F32toF32_2D \\\n\ ( \\\n\ @@ -47983,6 +45702,9 @@ ELTWISE_UNARY_F32_2D(gelu)\n\ ELTWISE_UNARY_F32_2D(hard_gelu)\n\ ELTWISE_UNARY_F32_2D(selu)\n\ ELTWISE_UNARY_F32_2D(celu)\n\ +ELTWISE_UNARY_F32_2D(rcp)\n\ +ELTWISE_UNARY_F32_2D(sign)\n\ +ELTWISE_UNARY_F32_2D(softsign)\n\ \n\ #define ELTWISE_UNARY_U8_2D(func_name) \\\n\ __kernel void func_name##_U8toU8_2D \\\n\ @@ -48019,6 +45741,9 @@ ELTWISE_UNARY_U8_2D(gelu)\n\ ELTWISE_UNARY_U8_2D(hard_gelu)\n\ ELTWISE_UNARY_U8_2D(selu)\n\ ELTWISE_UNARY_U8_2D(celu)\n\ +ELTWISE_UNARY_U8_2D(rcp)\n\ +ELTWISE_UNARY_U8_2D(sign)\n\ +ELTWISE_UNARY_U8_2D(softsign)\n\ \n\ __kernel void neg_I32toI32_2D\n\ (\n\ @@ -48179,6 +45904,21 @@ float eltwise_unary_celu(float val, float alpha, float rcp_alpha)\n\ return val < 0 ? x : val;\n\ }\n\ \n\ +float eltwise_unary_rcp(float val, float alpha, float rcp_alpha)\n\ +{\n\ + return 1.0f / val;\n\ +}\n\ +\n\ +float eltwise_unary_sign(float val, float alpha, float rcp_alpha)\n\ +{\n\ + return sign(val);\n\ +}\n\ +\n\ +float eltwise_unary_softsign(float val, float alpha, float rcp_alpha)\n\ +{\n\ + return val / (1.0f + fabs(val));\n\ +}\n\ +\n\ #define ELTWISE_UNARY_F32(func_name) \\\n\ __kernel void func_name##_F32toF32 \\\n\ ( \\\n\ @@ -48213,6 +45953,9 @@ ELTWISE_UNARY_F32(gelu)\n\ ELTWISE_UNARY_F32(hard_gelu)\n\ ELTWISE_UNARY_F32(selu)\n\ ELTWISE_UNARY_F32(celu)\n\ +ELTWISE_UNARY_F32(rcp)\n\ +ELTWISE_UNARY_F32(sign)\n\ +ELTWISE_UNARY_F32(softsign)\n\ \n\ #define ELTWISE_UNARY_U8(func_name) \\\n\ __kernel void func_name##_U8toU8 \\\n\ @@ -48249,6 +45992,9 @@ ELTWISE_UNARY_U8(gelu)\n\ ELTWISE_UNARY_U8(hard_gelu)\n\ ELTWISE_UNARY_U8(selu)\n\ ELTWISE_UNARY_U8(celu)\n\ +ELTWISE_UNARY_U8(rcp)\n\ +ELTWISE_UNARY_U8(sign)\n\ +ELTWISE_UNARY_U8(softsign)\n\ \n\ __kernel void neg_I32toI32\n\ (\n\ @@ -50552,16 +48298,13 @@ __kernel void hswish_I32toI32_2D(\n\ }\n\ "; /* end of hswish_cl*/ -static const char instance_normalization_f16_cl[] = "__kernel void instance_norm_meanvari_F16(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int width,\n\ - int height\n\ +static const char instance_normalization_f32_cl[] = "__kernel void instance_norm_sums_F32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int width,\n\ + int height\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -50581,8 +48324,8 @@ static const char instance_normalization_f16_cl[] = "__kernel void instance_norm {\n\ data = read_imagef(input, coord);\n\ coord.y++;\n\ - sum += data.x;\n\ - sqr += data.x * data.x;\n\ + sum = sum + data.x;\n\ + sqr = sqr + data.x * data.x;\n\ }\n\ }\n\ lcl_sum[lidx] = sum;\n\ @@ -50612,16 +48355,13 @@ static const char instance_normalization_f16_cl[] = "__kernel void instance_norm }\n\ }\n\ \n\ -__kernel void instance_norm_meanvari_F16_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int width,\n\ - 
int height\n\ +__kernel void instance_norm_sums_F32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int width,\n\ + int height\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -50643,239 +48383,8 @@ __kernel void instance_norm_meanvari_F16_2D(\n\ {\n\ data = read_imagef(input, coord);\n\ coord.y++;\n\ - sum += data.x;\n\ - sqr += data.x * data.x;\n\ - }\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 dst = (float4)(0);\n\ - dst.x = sum;\n\ - write_imagef(output, coord_out.xy, dst);\n\ - coord_out.x++;\n\ - dst.x = sqr;\n\ - write_imagef(output, coord_out.xy, dst);\n\ - }\n\ -}\n\ -\n\ -__kernel void instance_norm_F16toF16(\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_array_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ - )\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ - int4 coord_para = (int4)(0, gidz, 0, 0);\n\ -\n\ - float4 gamma = read_imagef(scale, coord_para.yx);\n\ - float4 beta = read_imagef(bias, coord_para.yx);\n\ - float4 mean_vari = (float4)(0);\n\ - float scale_vari, bias_val;\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari.x += read_imagef(meanVari, coord_para.xy).x;\n\ - coord_para.x++;\n\ - mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ - coord_para.x+=3;\n\ - }\n\ - mean_vari *= dim_ratio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = gamma.s0 * mean_vari.s1;\n\ - bias_val = (beta.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - float4 data, dst;\n\ - for(coord.y = 0; coord.y < height;coord.y++)\n\ - {\n\ - data = read_imagef(input, coord);\n\ -\n\ - dst.x = data.x * scale_vari + bias_val;\n\ - write_imagef(output, coord, dst);\n\ - }\n\ -}\n\ -\n\ -__kernel void instance_norm_F16toF16_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ - )\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int2 coord = (int2)(get_global_id(0), gidy);\n\ - int2 coord_para = (int2)(0, gidz);\n\ - int endH = gidy + height;\n\ -\n\ - float4 gamma = read_imagef(scale, coord_para.yx);\n\ - float4 beta = read_imagef(bias, coord_para.yx);\n\ - float4 mean_vari = (float4)(0);\n\ - float scale_vari, bias_val;\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari.x += read_imagef(meanVari, coord_para.xy).x;\n\ - 
coord_para.x++;\n\ - mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ - coord_para.x+=3;\n\ - }\n\ - mean_vari *= dim_ratio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = gamma.s0 * mean_vari.s1;\n\ - bias_val = (beta.s0 - scale_vari * mean_vari.s0);\n\ -\n\ - float4 data, dst;\n\ - for(; coord.y < endH; coord.y++)\n\ - {\n\ - data = read_imagef(input, coord);\n\ -\n\ - dst.x = data.x * scale_vari + bias_val;\n\ - write_imagef(output, coord, dst);\n\ - }\n\ -}\n\ -"; /* end of instance_normalization_f16_cl*/ - -static const char instance_normalization_f32_cl[] = "__kernel void instance_norm_meanvari_F32(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int width,\n\ - int height\n\ - )\n\ -{\n\ - int gidx = get_global_id(0);\n\ - int gidz = get_global_id(1);\n\ - int lidx = get_local_id(0);\n\ -\n\ - int4 coord = (int4)(gidx, 0, gidz, 0);\n\ - float4 data;\n\ - float sum = 0, sqr = 0;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - if(gidx < width)\n\ - {\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - data = read_imagef(input, coord);\n\ - coord.y++;\n\ - sum += data.x;\n\ - sqr += data.x * data.x;\n\ - }\n\ - }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ - barrier(CLK_LOCAL_MEM_FENCE);\n\ -\n\ - int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ - if(lidx == 0)\n\ - {\n\ - float4 one = (float4)(1, 1, 1, 1);\n\ - __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ - __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ - sum = 0; sqr = 0;\n\ - for(int i = 0; i < 4; i++)\n\ - {\n\ - sum += dot(tmp_sum[i], one);\n\ - sqr += dot(tmp_sqr[i], one);\n\ - }\n\ -\n\ - float4 dst = (float4)(0);\n\ - dst.x = sum;\n\ - write_imagef(output, coord_out.xy, dst);\n\ - coord_out.x++;\n\ - dst.x = sqr;\n\ - write_imagef(output, coord_out.xy, dst);\n\ - }\n\ -}\n\ -\n\ -__kernel void instance_norm_meanvari_F32_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int width,\n\ - int height\n\ - )\n\ -{\n\ - int gidx = get_global_id(0);\n\ - int gidz = get_global_id(1);\n\ - int lidx = get_local_id(0);\n\ - int gidy = gidz * height;\n\ -\n\ - int2 coord = (int2)(gidx, gidy);\n\ - float4 data;\n\ - float sum = 0, sqr = 0;\n\ -\n\ - __local float lcl_sum[16];\n\ - __local float lcl_sqr[16];\n\ -\n\ - int endH = gidy + height;\n\ - if(gidx < width)\n\ - {\n\ - for(; coord.y < endH;)\n\ - {\n\ - data = read_imagef(input, coord);\n\ - coord.y++;\n\ - sum += data.x;\n\ - sqr += data.x * data.x;\n\ + sum = sum + data.x;\n\ + sqr = sqr + data.x * data.x;\n\ }\n\ }\n\ lcl_sum[lidx] = sum;\n\ @@ -50906,23 +48415,19 @@ __kernel void instance_norm_meanvari_F32_2D(\n\ }\n\ \n\ __kernel void instance_norm_F32toF32(\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_array_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + 
__read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -50941,7 +48446,7 @@ __kernel void instance_norm_F32toF32(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ @@ -50959,23 +48464,19 @@ __kernel void instance_norm_F32toF32(\n\ }\n\ \n\ __kernel void instance_norm_F32toF32_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -50996,12 +48497,12 @@ __kernel void instance_norm_F32toF32_2D(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = gamma.s0 * mean_vari.s1;\n\ - bias_val = beta.s0 - scale_vari * mean_vari.s0;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0);\n\ \n\ float4 data, dst;\n\ for(; coord.y < endH; coord.y++)\n\ @@ -51014,16 +48515,13 @@ __kernel void instance_norm_F32toF32_2D(\n\ }\n\ "; /* end of instance_normalization_f32_cl*/ -static const char instance_normalization_i32_cl[] = "__kernel void instance_norm_meanvari_I32(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int width,\n\ - int height\n\ +static const char instance_normalization_i32_cl[] = "__kernel void instance_norm_sums_I32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int width,\n\ + int height\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -51032,9 +48530,8 @@ static const char instance_normalization_i32_cl[] = "__kernel void instance_norm \n\ int4 coord = (int4)(gidx, 0, gidz, 0);\n\ int4 data;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0;\n\ - float e2InScale = input_fl * input_fl;\n\ + float2 sum_x_x2 = 0;\n\ + int2 _sum_x_x2 = 0;\n\ \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ @@ -51045,13 +48542,13 @@ static const char instance_normalization_i32_cl[] = "__kernel void instance_norm {\n\ data = read_imagei(input, coord);\n\ coord.y++;\n\ - tmpSum += data.x;\n\ - sqr += (data.x * data.x * e2InScale);\n\ + _sum_x_x2.x = _sum_x_x2.x + data.x;\n\ + _sum_x_x2.y = _sum_x_x2.y + data.x * data.x;\n\ }\n\ - sum = tmpSum * input_fl;\n\ + sum_x_x2 = convert_float2(_sum_x_x2);\n\ }\n\ - 
lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ + lcl_sum[lidx] = sum_x_x2.x;\n\ + lcl_sqr[lidx] = sum_x_x2.y;\n\ barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ @@ -51061,7 +48558,7 @@ static const char instance_normalization_i32_cl[] = "__kernel void instance_norm __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ \n\ - sum = 0; sqr = 0;\n\ + float sum = 0, sqr = 0;\n\ for(int i = 0; i < 4; i++)\n\ {\n\ sum += dot(tmp_sum[i], one);\n\ @@ -51077,16 +48574,13 @@ static const char instance_normalization_i32_cl[] = "__kernel void instance_norm }\n\ }\n\ \n\ -__kernel void instance_norm_meanvari_I32_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int width,\n\ - int height\n\ +__kernel void instance_norm_sums_I32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int width,\n\ + int height\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -51096,9 +48590,8 @@ __kernel void instance_norm_meanvari_I32_2D(\n\ \n\ int2 coord = (int2)(gidx, gidy);\n\ int4 data;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0;\n\ - float e2InScale = input_fl * input_fl;\n\ + float2 sum_x_x2 = 0;\n\ + int2 _sum_x_x2 = 0;\n\ \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ @@ -51110,13 +48603,13 @@ __kernel void instance_norm_meanvari_I32_2D(\n\ {\n\ data = read_imagei(input, coord);\n\ coord.y++;\n\ - tmpSum += data.x;\n\ - sqr += (data.x * data.x * e2InScale);\n\ + _sum_x_x2.x = _sum_x_x2.x + data.x;\n\ + _sum_x_x2.y = _sum_x_x2.y + data.x * data.x;\n\ }\n\ - sum = tmpSum * input_fl;\n\ + sum_x_x2 = convert_float2(_sum_x_x2);\n\ }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ + lcl_sum[lidx] = sum_x_x2.x;\n\ + lcl_sqr[lidx] = sum_x_x2.y;\n\ barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ @@ -51126,7 +48619,7 @@ __kernel void instance_norm_meanvari_I32_2D(\n\ __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ \n\ - sum = 0; sqr = 0;\n\ + float sum = 0, sqr = 0;\n\ for(int i = 0; i < 4; i++)\n\ {\n\ sum += dot(tmp_sum[i], one);\n\ @@ -51143,23 +48636,19 @@ __kernel void instance_norm_meanvari_I32_2D(\n\ }\n\ \n\ __kernel void instance_norm_I32toI32(\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_array_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -51178,13 +48667,13 @@ __kernel void instance_norm_I32toI32(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - 
mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = gamma.s0 * mean_vari.s1;\n\ - float alpha = input_fl * output_fl * scale_vari;\n\ - bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_fl;\n\ + float alpha = output_scale * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ \n\ int4 data, dst;\n\ for(coord.y = 0; coord.y < height;coord.y++)\n\ @@ -51199,23 +48688,19 @@ __kernel void instance_norm_I32toI32(\n\ }\n\ \n\ __kernel void instance_norm_I32toI32_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -51236,13 +48721,13 @@ __kernel void instance_norm_I32toI32_2D(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = gamma.s0 * mean_vari.s1;\n\ - float alpha = input_fl * output_fl * scale_vari;\n\ - bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_fl;\n\ + float alpha = output_scale * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ \n\ int4 data, dst;\n\ for(; coord.y < endH; coord.y++)\n\ @@ -51257,23 +48742,19 @@ __kernel void instance_norm_I32toI32_2D(\n\ }\n\ \n\ __kernel void instance_norm_I32toF32(\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_array_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -51292,12 +48773,12 @@ __kernel void instance_norm_I32toF32(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = gamma.s0 * mean_vari.s1;\n\ - float alpha = input_fl * scale_vari;\n\ + float alpha = scale_vari;\n\ bias_val = (beta.s0 - scale_vari * mean_vari.s0);\n\ \n\ int4 data;\n\ @@ -51312,23 +48793,19 @@ __kernel void instance_norm_I32toF32(\n\ }\n\ \n\ __kernel void 
instance_norm_I32toF32_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -51349,12 +48826,12 @@ __kernel void instance_norm_I32toF32_2D(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = gamma.s0 * mean_vari.s1;\n\ - float alpha = input_fl * scale_vari;\n\ + float alpha = scale_vari;\n\ bias_val = beta.s0 - scale_vari * mean_vari.s0;\n\ \n\ int4 data;\n\ @@ -51369,16 +48846,13 @@ __kernel void instance_norm_I32toF32_2D(\n\ }\n\ "; /* end of instance_normalization_i32_cl*/ -static const char instance_normalization_u8_cl[] = "__kernel void instance_norm_meanvari_U8(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int width,\n\ - int height\n\ +static const char instance_normalization_u8_cl[] = "__kernel void instance_norm_sums_U8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int width,\n\ + int height\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -51387,9 +48861,8 @@ static const char instance_normalization_u8_cl[] = "__kernel void instance_norm_ \n\ int4 coord = (int4)(gidx, 0, gidz, 0);\n\ uint4 data;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - float e2InScale = input_scale * input_scale;\n\ + float2 sum_x_x2 = 0;\n\ + int2 _sum_x_x2 = 0;\n\ \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ @@ -51400,14 +48873,13 @@ static const char instance_normalization_u8_cl[] = "__kernel void instance_norm_ {\n\ data = read_imageui(input, coord);\n\ coord.y++;\n\ - tmpSum += data.x;\n\ - tmpSqr += data.x * data.x;\n\ + _sum_x_x2.x = _sum_x_x2.x + data.x;\n\ + _sum_x_x2.y = _sum_x_x2.y + data.x * data.x;\n\ }\n\ - sqr = (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;\n\ - sum = (tmpSum - height * input_zp) * input_scale;\n\ + sum_x_x2 = convert_float2(_sum_x_x2);\n\ }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ + lcl_sum[lidx] = sum_x_x2.x;\n\ + lcl_sqr[lidx] = sum_x_x2.y;\n\ barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ @@ -51417,7 +48889,7 @@ static const char instance_normalization_u8_cl[] = "__kernel void instance_norm_ __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ \n\ - sum = 0; sqr = 0;\n\ + float sum = 0, sqr = 0;\n\ for(int i = 0; i < 4; i++)\n\ {\n\ sum += dot(tmp_sum[i], one);\n\ @@ -51433,16 +48905,13 @@ static const char instance_normalization_u8_cl[] = 
"__kernel void instance_norm_ }\n\ }\n\ \n\ -__kernel void instance_norm_meanvari_U8_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int width,\n\ - int height\n\ +__kernel void instance_norm_sums_U8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int width,\n\ + int height\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -51452,9 +48921,8 @@ __kernel void instance_norm_meanvari_U8_2D(\n\ \n\ int2 coord = (int2)(gidx, gidy);\n\ uint4 data;\n\ - float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - float e2InScale = input_scale * input_scale;\n\ + float2 sum_x_x2 = 0;\n\ + int2 _sum_x_x2 = 0;\n\ \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ @@ -51466,14 +48934,13 @@ __kernel void instance_norm_meanvari_U8_2D(\n\ {\n\ data = read_imageui(input, coord);\n\ coord.y++;\n\ - tmpSum += data.x;\n\ - tmpSqr += data.x * data.x;\n\ + _sum_x_x2.x = _sum_x_x2.x + data.x;\n\ + _sum_x_x2.y = _sum_x_x2.y + data.x * data.x;\n\ }\n\ - sqr = (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;\n\ - sum = (tmpSum - height * input_zp) * input_scale;\n\ + sum_x_x2 = convert_float2(_sum_x_x2);\n\ }\n\ - lcl_sum[lidx] = sum;\n\ - lcl_sqr[lidx] = sqr;\n\ + lcl_sum[lidx] = sum_x_x2.x;\n\ + lcl_sqr[lidx] = sum_x_x2.y;\n\ barrier(CLK_LOCAL_MEM_FENCE);\n\ \n\ int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ @@ -51483,7 +48950,7 @@ __kernel void instance_norm_meanvari_U8_2D(\n\ __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ \n\ - sum = 0; sqr = 0;\n\ + float sum = 0, sqr = 0;\n\ for(int i = 0; i < 4; i++)\n\ {\n\ sum += dot(tmp_sum[i], one);\n\ @@ -51500,23 +48967,19 @@ __kernel void instance_norm_meanvari_U8_2D(\n\ }\n\ \n\ __kernel void instance_norm_U8toU8(\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_array_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -51527,7 +48990,6 @@ __kernel void instance_norm_U8toU8(\n\ float4 beta = read_imagef(bias, coord_para.yx);\n\ float4 mean_vari = (float4)(0);\n\ float scale_vari, bias_val;\n\ - float scale_inOut = input_scale * output_scale;\n\ \n\ for(int i = 0; i < group_num; i++)\n\ {\n\ @@ -51536,19 +48998,18 @@ __kernel void instance_norm_U8toU8(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = gamma.s0 * mean_vari.s1;\n\ - float alpha = scale_inOut * scale_vari;\n\ + float alpha = output_scale * scale_vari;\n\ bias_val = (beta.s0 - scale_vari * 
mean_vari.s0) * output_scale + output_zp;\n\ \n\ uint4 data, dst;\n\ for(coord.y = 0; coord.y < height;coord.y++)\n\ {\n\ data = read_imageui(input, coord);\n\ - data.x -= input_zp;\n\ \n\ float4 norm;\n\ norm.x = data.x * alpha + bias_val;\n\ @@ -51558,23 +49019,19 @@ __kernel void instance_norm_U8toU8(\n\ }\n\ \n\ __kernel void instance_norm_U8toU8_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -51587,7 +49044,6 @@ __kernel void instance_norm_U8toU8_2D(\n\ float4 beta = read_imagef(bias, coord_para.yx);\n\ float4 mean_vari = (float4)(0);\n\ float scale_vari, bias_val;\n\ - float scale_inOut = input_scale * output_scale;\n\ \n\ for(int i = 0; i < group_num; i++)\n\ {\n\ @@ -51596,19 +49052,18 @@ __kernel void instance_norm_U8toU8_2D(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = gamma.s0 * mean_vari.s1;\n\ - float alpha = scale_inOut * scale_vari;\n\ + float alpha = output_scale * scale_vari;\n\ bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ \n\ uint4 data, dst;\n\ for(; coord.y < endH; coord.y++)\n\ {\n\ data = read_imageui(input, coord);\n\ - data.x -= input_zp;\n\ \n\ float4 norm;\n\ norm.x = data.x * alpha + bias_val;\n\ @@ -51618,23 +49073,19 @@ __kernel void instance_norm_U8toU8_2D(\n\ }\n\ \n\ __kernel void instance_norm_U8toF16(\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_array_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -51645,7 +49096,6 @@ __kernel void instance_norm_U8toF16(\n\ float4 beta = read_imagef(bias, coord_para.yx);\n\ float4 mean_vari = (float4)(0);\n\ float scale_vari, bias_val;\n\ - float scale_inOut = input_scale * output_scale;\n\ \n\ for(int i = 0; i < group_num; i++)\n\ {\n\ @@ -51654,19 +49104,18 @@ __kernel void instance_norm_U8toF16(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= 
inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = gamma.s0 * mean_vari.s1;\n\ - float alpha = scale_inOut * scale_vari;\n\ + float alpha = output_scale * scale_vari;\n\ bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ \n\ uint4 data;\n\ for(coord.y = 0; coord.y < height;coord.y++)\n\ {\n\ data = read_imageui(input, coord);\n\ - data.x -= input_zp;\n\ \n\ float4 norm;\n\ norm.x = data.x * alpha + bias_val;\n\ @@ -51675,23 +49124,19 @@ __kernel void instance_norm_U8toF16(\n\ }\n\ \n\ __kernel void instance_norm_U8toF16_2D(\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t bias,\n\ - __read_only image2d_t scale,\n\ - __read_only image2d_t meanVari,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - int rsFlg,\n\ - int input_zp,\n\ - float input_scale,\n\ - float input_fl,\n\ - int output_zp,\n\ - float output_scale,\n\ - float output_fl,\n\ - int width,\n\ - int height,\n\ - float dim_ratio,\n\ - int group_num\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int rsFlg,\n\ + int output_zp,\n\ + float output_scale,\n\ + int width,\n\ + int height,\n\ + float inv_multiplier,\n\ + int group_num\n\ )\n\ {\n\ int gidz = get_global_id(1);\n\ @@ -51704,7 +49149,6 @@ __kernel void instance_norm_U8toF16_2D(\n\ float4 beta = read_imagef(bias, coord_para.yx);\n\ float4 mean_vari = (float4)(0);\n\ float scale_vari, bias_val;\n\ - float scale_inOut = input_scale * output_scale;\n\ \n\ for(int i = 0; i < group_num; i++)\n\ {\n\ @@ -51713,19 +49157,18 @@ __kernel void instance_norm_U8toF16_2D(\n\ mean_vari.y += read_imagef(meanVari, coord_para.xy).x;\n\ coord_para.x+=3;\n\ }\n\ - mean_vari *= dim_ratio;\n\ + mean_vari *= inv_multiplier;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = gamma.s0 * mean_vari.s1;\n\ - float alpha = scale_inOut * scale_vari;\n\ + float alpha = output_scale * scale_vari;\n\ bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ \n\ uint4 data;\n\ for(; coord.y < endH; coord.y++)\n\ {\n\ data = read_imageui(input, coord);\n\ - data.x -= input_zp;\n\ \n\ float4 norm;\n\ norm.x = data.x * alpha + bias_val;\n\ @@ -56132,6 +53575,391 @@ __kernel void maximum_I32I32toI32_2D\n\ }\n\ "; /* end of maximum_cl*/ +static const char maxpoolwithargmax_cl[] = "#define FP32_MIN -3.4e38\n\ +#define I32_MIN -2147483647\n\ +\n\ +__kernel void maxpoolwithargmax_F32toF32_I32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t argmax,\n\ + int ksize_x, int ksize_y, int stride_x, int stride_y,\n\ + int pad_left, int pad_top, int width, int height,\n\ + float scale, float tail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0);\n\ + int4 coord_in = coord_out;\n\ +\n\ + int hstart = gidy * stride_y - pad_top;\n\ + int wstart = gidx * stride_x - pad_left;\n\ + int hend = min(hstart + ksize_y, height);\n\ + int wend = min(wstart + ksize_x, width);\n\ + int h, w;\n\ + int4 index_max = (int4)(0);\n\ + float value_max = FP32_MIN;\n\ + float4 dst = (float4)(0);\n\ +\n\ + hstart = max(hstart, 0);\n\ + wstart = max(wstart, 0);\n\ + int2 coord_max = (int2)(wstart, hstart);\n\ + for 
(h = hstart; h < hend; ++ h)\n\ + {\n\ + for (w = wstart; w < wend; ++ w)\n\ + {\n\ + coord_in.xy = (int2)(w, h);\n\ + float4 data = read_imagef(input, coord_in);\n\ +\n\ + if (data.x > value_max)\n\ + {\n\ + value_max = data.x;\n\ + coord_max = coord_in.xy;\n\ + }\n\ + }\n\ + }\n\ +\n\ + index_max.x = coord_max.x + coord_max.y * width + get_global_id(2) * width * height;\n\ + dst.x = value_max;\n\ + write_imagef(output, coord_out, dst);\n\ + write_imagei(argmax, coord_out, index_max);\n\ +}\n\ +\n\ +__kernel void maxpoolwithargmax_BF16toBF16_I32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t argmax,\n\ + int ksize_x, int ksize_y, int stride_x, int stride_y,\n\ + int pad_left, int pad_top, int width, int height,\n\ + float scale, float tail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0);\n\ + int4 coord_in = coord_out;\n\ +\n\ + int hstart = gidy * stride_y - pad_top;\n\ + int wstart = gidx * stride_x - pad_left;\n\ + int hend = min(hstart + ksize_y, height);\n\ + int wend = min(wstart + ksize_x, width);\n\ + int h, w;\n\ + int4 index_max = (int4)(0);\n\ + float value_max = FP32_MIN;\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + hstart = max(hstart, 0);\n\ + wstart = max(wstart, 0);\n\ + int2 coord_max = (int2)(wstart, hstart);\n\ + for (h = hstart; h < hend; ++ h)\n\ + {\n\ + for (w = wstart; w < wend; ++ w)\n\ + {\n\ + coord_in.xy = (int2)(w, h);\n\ + uint4 src = read_imageui(input, coord_in);\n\ + src = src << 16;\n\ + float4 data;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + if (data.x > value_max)\n\ + {\n\ + value_max = data.x;\n\ + coord_max = coord_in.xy;\n\ + }\n\ + }\n\ + }\n\ +\n\ + index_max.x = coord_max.x + coord_max.y * width + get_global_id(2) * width * height;\n\ + _viv_asm(COPY, dst, value_max, 4);\n\ + dst.x = dst.x >> 16;\n\ + write_imageui(output, coord_out, dst);\n\ + write_imagei(argmax, coord_out, index_max);\n\ +}\n\ +\n\ +__kernel void maxpoolwithargmax_U32toU32_I32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t argmax,\n\ + int ksize_x, int ksize_y, int stride_x, int stride_y,\n\ + int pad_left, int pad_top, int width, int height,\n\ + float scale, float tail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0);\n\ + int4 coord_in = coord_out;\n\ +\n\ + int hstart = gidy * stride_y - pad_top;\n\ + int wstart = gidx * stride_x - pad_left;\n\ + int hend = min(hstart + ksize_y, height);\n\ + int wend = min(wstart + ksize_x, width);\n\ + int h, w;\n\ + int4 index_max = (int4)(0);\n\ + uint value_max = 0;\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + hstart = max(hstart, 0);\n\ + wstart = max(wstart, 0);\n\ +\n\ + int2 coord_max = (int2)(wstart, hstart);\n\ + for (h = hstart; h < hend; ++ h)\n\ + {\n\ + for (w = wstart; w < wend; ++ w)\n\ + {\n\ + coord_in.xy = (int2)(w, h);\n\ + uint4 data = read_imageui(input, coord_in);\n\ +\n\ + if (data.x > value_max)\n\ + {\n\ + value_max = data.x;\n\ + coord_max = coord_in.xy;\n\ + }\n\ + }\n\ + }\n\ +\n\ + index_max.x = coord_max.x + coord_max.y * width + get_global_id(2) * width * height;\n\ + dst.x = convert_uint(convert_float(value_max) * scale + tail);\n\ + write_imageui(output, coord_out, dst);\n\ + write_imagei(argmax, coord_out, index_max);\n\ +}\n\ +\n\ +__kernel void maxpoolwithargmax_I32toI32_I32(\n\ + __read_only 
image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + __write_only image2d_array_t argmax,\n\ + int ksize_x, int ksize_y, int stride_x, int stride_y,\n\ + int pad_left, int pad_top, int width, int height,\n\ + float scale, float tail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0);\n\ + int4 coord_in = coord_out;\n\ +\n\ + int hstart = gidy * stride_y - pad_top;\n\ + int wstart = gidx * stride_x - pad_left;\n\ + int hend = min(hstart + ksize_y, height);\n\ + int wend = min(wstart + ksize_x, width);\n\ + int h, w;\n\ + int4 index_max = (int4)(0);\n\ + int value_max = I32_MIN;\n\ + int4 dst = (int4)(0);\n\ +\n\ + hstart = max(hstart, 0);\n\ + wstart = max(wstart, 0);\n\ + int2 coord_max = (int2)(wstart, hstart);\n\ + for (h = hstart; h < hend; ++ h)\n\ + {\n\ + for (w = wstart; w < wend; ++ w)\n\ + {\n\ + coord_in.xy = (int2)(w, h);\n\ + int4 data = read_imagei(input, coord_in);\n\ +\n\ + if (data.x > value_max)\n\ + {\n\ + value_max = data.x;\n\ + coord_max = coord_in.xy;\n\ + }\n\ + }\n\ + }\n\ +\n\ + index_max.x = coord_max.x + coord_max.y * width + get_global_id(2) * width * height;\n\ + dst.x = convert_int(convert_float(value_max) * scale + tail);\n\ + write_imagei(output, coord_out, dst);\n\ + write_imagei(argmax, coord_out, index_max);\n\ +}\n\ +"; /* end of maxpoolwithargmax_cl*/ + +static const char maxpoolwithargmax_2d_cl[] = "#define FP32_MIN -3.4e38\n\ +#define I32_MIN -2147483647\n\ +\n\ +__kernel void maxpoolwithargmax_F32toF32_I32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t argmax,\n\ + int ksize_x, int ksize_y, int stride_x, int stride_y,\n\ + int pad_left, int pad_top, int width, int height,\n\ + float scale, float tail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int2 coord_out = (int2)(gidx, gidy);\n\ + int2 coord_in = coord_out;\n\ +\n\ + int hstart = gidy * stride_y - pad_top;\n\ + int wstart = gidx * stride_x - pad_left;\n\ + int hend = min(hstart + ksize_y, height);\n\ + int wend = min(wstart + ksize_x, width);\n\ + int h, w;\n\ + int4 index_max = (int4)(0);\n\ + float value_max = FP32_MIN;\n\ + float4 dst = (float4)(0);\n\ +\n\ + hstart = max(hstart, 0);\n\ + wstart = max(wstart, 0);\n\ + int2 coord_max = (int2)(wstart, hstart);\n\ + for (h = hstart; h < hend; ++ h)\n\ + {\n\ + for (w = wstart; w < wend; ++ w)\n\ + {\n\ + coord_in.xy = (int2)(w, h);\n\ + float4 data = read_imagef(input, coord_in);\n\ +\n\ + if (data.x > value_max)\n\ + {\n\ + value_max = data.x;\n\ + coord_max = coord_in;\n\ + }\n\ + }\n\ + }\n\ +\n\ + index_max.x = coord_max.x + coord_max.y * width;\n\ + dst.x = value_max;\n\ + write_imagef(output, coord_out, dst);\n\ + write_imagei(argmax, coord_out, index_max);\n\ +}\n\ +\n\ +__kernel void maxpoolwithargmax_BF16toBF16_I32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t argmax,\n\ + int ksize_x, int ksize_y, int stride_x, int stride_y,\n\ + int pad_left, int pad_top, int width, int height,\n\ + float scale, float tail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int2 coord_out = (int2)(gidx, gidy);\n\ + int2 coord_in = coord_out;\n\ +\n\ + int hstart = gidy * stride_y - pad_top;\n\ + int wstart = gidx * stride_x - pad_left;\n\ + int hend = min(hstart + ksize_y, height);\n\ + int wend = min(wstart + ksize_x, width);\n\ + int h, w;\n\ + int4 index_max = (int4)(0);\n\ + 
float value_max = FP32_MIN;\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + hstart = max(hstart, 0);\n\ + wstart = max(wstart, 0);\n\ + int2 coord_max = (int2)(wstart, hstart);\n\ + for (h = hstart; h < hend; ++ h)\n\ + {\n\ + for (w = wstart; w < wend; ++ w)\n\ + {\n\ + coord_in.xy = (int2)(w, h);\n\ + uint4 src = read_imageui(input, coord_in);\n\ + src = src << 16;\n\ + float4 data;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + if (data.x > value_max)\n\ + {\n\ + value_max = data.x;\n\ + coord_max = coord_in;\n\ + }\n\ + }\n\ + }\n\ +\n\ + index_max.x = coord_max.x + coord_max.y * width;\n\ + _viv_asm(COPY, dst, value_max, 4);\n\ + dst.x = dst.x >> 16;\n\ + write_imageui(output, coord_out, dst);\n\ + write_imagei(argmax, coord_out, index_max);\n\ +}\n\ +\n\ +__kernel void maxpoolwithargmax_U32toU32_I32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t argmax,\n\ + int ksize_x, int ksize_y, int stride_x, int stride_y,\n\ + int pad_left, int pad_top, int width, int height,\n\ + float scale, float tail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int2 coord_out = (int2)(gidx, gidy);\n\ + int2 coord_in = coord_out;\n\ +\n\ + int hstart = gidy * stride_y - pad_top;\n\ + int wstart = gidx * stride_x - pad_left;\n\ + int hend = min(hstart + ksize_y, height);\n\ + int wend = min(wstart + ksize_x, width);\n\ + int h, w;\n\ + int4 index_max = (int4)(0);\n\ + uint value_max = 0;\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + hstart = max(hstart, 0);\n\ + wstart = max(wstart, 0);\n\ + int2 coord_max = (int2)(wstart, hstart);\n\ + for (h = hstart; h < hend; ++ h)\n\ + {\n\ + for (w = wstart; w < wend; ++ w)\n\ + {\n\ + coord_in.xy = (int2)(w, h);\n\ + uint4 data = read_imageui(input, coord_in);\n\ +\n\ + if (data.x > value_max)\n\ + {\n\ + value_max = data.x;\n\ + coord_max = coord_in;\n\ + }\n\ + }\n\ + }\n\ +\n\ + index_max.x = coord_max.x + coord_max.y * width;\n\ + dst.x = convert_uint(convert_float(value_max) * scale + tail);\n\ + write_imageui(output, coord_out, dst);\n\ + write_imagei(argmax, coord_out, index_max);\n\ +}\n\ +\n\ +__kernel void maxpoolwithargmax_I32toI32_I32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t argmax,\n\ + int ksize_x, int ksize_y, int stride_x, int stride_y,\n\ + int pad_left, int pad_top, int width, int height,\n\ + float scale, float tail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int2 coord_out = (int2)(gidx, gidy);\n\ + int2 coord_in = coord_out;\n\ +\n\ + int hstart = gidy * stride_y - pad_top;\n\ + int wstart = gidx * stride_x - pad_left;\n\ + int hend = min(hstart + ksize_y, height);\n\ + int wend = min(wstart + ksize_x, width);\n\ + int h, w;\n\ + int4 index_max = (int4)(0);\n\ + int value_max = I32_MIN;\n\ + int4 dst = (int4)(0);\n\ +\n\ + hstart = max(hstart, 0);\n\ + wstart = max(wstart, 0);\n\ + int2 coord_max = (int2)(wstart, hstart);\n\ + for (h = hstart; h < hend; ++ h)\n\ + {\n\ + for (w = wstart; w < wend; ++ w)\n\ + {\n\ + coord_in.xy = (int2)(w, h);\n\ + int4 data = read_imagei(input, coord_in);\n\ +\n\ + if (data.x > value_max)\n\ + {\n\ + value_max = data.x;\n\ + coord_max = coord_in;\n\ + }\n\ + }\n\ + }\n\ +\n\ + index_max.x = coord_max.x + coord_max.y * width;\n\ + dst.x = convert_int(convert_float(value_max) * scale + tail);\n\ + write_imagei(output, coord_out, dst);\n\ + write_imagei(argmax, coord_out, index_max);\n\ +}\n\ +"; /* end of maxpoolwithargmax_2d_cl*/ + static const char 
minimum_cl[] = "__kernel void minimum_FP32FP32toFP32\n\ (\n\ __read_only image2d_array_t input0,\n\ @@ -56290,6 +54118,314 @@ __kernel void minimum_I32I32toI32_2D\n\ }\n\ "; /* end of minimum_cl*/ +static const char mod_cl[] = "__kernel void mod_F32F32toF32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float4 src0;\n\ + float4 src1;\n\ + READ_IMAGEF_2DARRAY(src0, input, coord);\n\ + READ_IMAGEF_2DARRAY(src1, input1, coord);\n\ + float4 dst = fmod(src0, src1);\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void mod_F32F32toF32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + float4 src0 = read_imagef(input, coord);\n\ + float4 src1 = read_imagef(input1, coord);\n\ + float4 dst = fmod(src0, src1);\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void mod_I32I32toI32\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 src0;\n\ + int4 src1;\n\ + READ_IMAGEI_2DARRAY(src0, input, coord);\n\ + READ_IMAGEI_2DARRAY(src1, input1, coord);\n\ + float4 in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + float4 in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + float4 out;\n\ + if (isfmod)\n\ + {\n\ + out = fmod(in0, in1) * outputScale + outputTail;\n\ + }\n\ + else\n\ + {\n\ + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail;\n\ + }\n\ + int4 dst = convert_int4(out);\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void mod_I32I32toI32_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 src0 = read_imagei(input, coord);\n\ + int4 src1 = read_imagei(input1, coord);\n\ + float4 in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + float4 in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + float4 out;\n\ + if (isfmod)\n\ + {\n\ + out = fmod(in0, in1) * outputScale + outputTail;\n\ + }\n\ + else\n\ + {\n\ + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail;\n\ + }\n\ + int4 dst = convert_int4(out);\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void mod_I32I32toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float 
outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 src0;\n\ + int4 src1;\n\ + READ_IMAGEI_2DARRAY(src0, input, coord);\n\ + READ_IMAGEI_2DARRAY(src1, input1, coord);\n\ + float4 in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + float4 in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + float4 out;\n\ + if (isfmod)\n\ + {\n\ + out = fmod(in0, in1) * outputScale + outputTail;\n\ + }\n\ + else\n\ + {\n\ + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail;\n\ + }\n\ + uint4 dst = convert_uint4(out);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void mod_I32I32toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 src0 = read_imagei(input, coord);\n\ + int4 src1 = read_imagei(input1, coord);\n\ + float4 in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + float4 in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + float4 out;\n\ + if (isfmod)\n\ + {\n\ + out = fmod(in0, in1) * outputScale + outputTail;\n\ + }\n\ + else\n\ + {\n\ + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail;\n\ + }\n\ + uint4 dst = convert_uint4(out);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void mod_U8U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + uint4 src0, src1;\n\ + float4 in0, in1, out;\n\ + READ_IMAGEUI_2DARRAY(src0, input, coord);\n\ + READ_IMAGEUI_2DARRAY(src1, input1, coord);\n\ + in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + if (isfmod)\n\ + {\n\ + out = fmod(in0, in1) * outputScale + outputTail;\n\ + }\n\ + else\n\ + {\n\ + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail;\n\ + }\n\ + uint4 dst = convert_uint4(out);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void mod_U8U8toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + uint4 src0 = read_imageui(input, coord);\n\ + uint4 src1 = read_imageui(input1, coord);\n\ + float4 in0, in1, out;\n\ + in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + if (isfmod)\n\ + {\n\ + out = fmod(in0, in1) * outputScale + outputTail;\n\ + }\n\ + else\n\ + {\n\ + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail;\n\ + }\n\ + uint4 dst = convert_uint4(out);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void mod_U8I32toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t 
output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + uint4 src0;\n\ + int4 src1;\n\ + float4 in0, in1, out;\n\ + READ_IMAGEUI_2DARRAY(src0, input, coord);\n\ + READ_IMAGEI_2DARRAY(src1, input1, coord);\n\ + in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + in1 = convert_float4(src1);\n\ + if (isfmod)\n\ + {\n\ + out = fmod(in0, in1) * outputScale + outputTail;\n\ + }\n\ + else\n\ + {\n\ + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail;\n\ + }\n\ + uint4 dst = convert_uint4(out);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void mod_U8I32toU8_2D\n\ + (\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int isfmod,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + uint4 src0 = read_imageui(input, coord);\n\ + int4 src1 = read_imagei(input1, coord);\n\ + float4 in0, in1, out;\n\ + in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + in1 = convert_float4(src1);\n\ + if (isfmod)\n\ + {\n\ + out = fmod(in0, in1) * outputScale + outputTail;\n\ + }\n\ + else\n\ + {\n\ + out = (in0 - in1 * floor(in0 / in1)) * outputScale + outputTail;\n\ + }\n\ + uint4 dst = convert_uint4(out);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +"; /* end of mod_cl*/ + static const char moments_axis0_cl[] = "__kernel void moments_axis0_U8toF32(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_t output_mean,\n\ @@ -60201,21 +58337,25 @@ static const char roi_align_cl[] = "inline float roi_align_1x1\n\ \n\ \n\ #define EPS_GRID 0.00001f\n\ -__kernel void roi_align_F32toF32\n\ +__kernel void roi_align_F32_F32toF32\n\ (\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_t rois,\n\ - __read_only image2d_t n_rois,\n\ - __write_only image2d_array_t output,\n\ - float spatial_x_scale,\n\ - float spatial_y_scale,\n\ - float in_width,\n\ - float in_height,\n\ - float rcp_of_out_width,\n\ - float rcp_of_out_height,\n\ - float sampling_x_ratio,\n\ - float sampling_y_ratio,\n\ - int depth\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t rois,\n\ + __read_only image2d_t n_rois,\n\ + __write_only image2d_array_t output,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_zp,\n\ + float spatial_x_scale,\n\ + float spatial_y_scale,\n\ + float in_width,\n\ + float in_height,\n\ + float rcp_of_out_width,\n\ + float rcp_of_out_height,\n\ + float sampling_x_ratio,\n\ + float sampling_y_ratio,\n\ + int depth\n\ )\n\ {\n\ int px = get_global_id(0);\n\ @@ -60261,6 +58401,128 @@ __kernel void roi_align_F32toF32\n\ \n\ write_imagef(output, (int4)(px, py, kz1, 0), interp);\n\ }\n\ +}\n\ +\n\ +inline float roi_align_1x1_U8toF32\n\ +(\n\ + __read_only image2d_array_t input,\n\ + float input_scale,\n\ + float input_tail,\n\ + float2 region_start,\n\ + float2 region_end,\n\ + float2 bin_size,\n\ + int2 grid_size,\n\ + float2 rcp_of_grid_size,\n\ + int pz\n\ +)\n\ +{\n\ + float sum = 0;\n\ +\n\ + for(int iy = 0; iy < grid_size.y; ++iy)\n\ + {\n\ + for(int ix = 0; ix < grid_size.x; ++ix)\n\ + {\n\ + float2 ixy = (float2)(ix + 0.5f, iy + 0.5f);\n\ + float2 pos = region_start + ixy * 
bin_size * rcp_of_grid_size;\n\ +\n\ + int2 xy_low = convert_int2(pos);\n\ + int2 xy_high = xy_low + 1;\n\ +\n\ + float ly = pos.y - xy_low.y;\n\ + float lx = pos.x - xy_low.x;\n\ + float hy = 1.0f - ly;\n\ + float hx = 1.0f - lx;\n\ +\n\ + float w1 = hy * hx;\n\ + float w2 = hy * lx;\n\ + float w3 = ly * hx;\n\ + float w4 = ly * lx;\n\ +\n\ + uint4 data;\n\ + data.x = read_imageui(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x;\n\ + data.y = read_imageui(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x;\n\ + data.z = read_imageui(input, (int4)(xy_low.x, xy_high.y, pz, 0)).x;\n\ + data.w = read_imageui(input, (int4)(xy_high.x, xy_high.y, pz, 0)).x;\n\ +\n\ + float4 value = convert_float4(data) * input_scale + input_tail;\n\ +\n\ + sum = sum + w1 * value.x + w2 * value.y + w3 * value.z + w4 * value.w;\n\ + }\n\ + }\n\ +\n\ + return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y);\n\ +}\n\ +\n\ +__kernel void roi_align_U8_U16toU8\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t rois,\n\ + __read_only image2d_t n_rois,\n\ + __write_only image2d_array_t output,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_zp,\n\ + float spatial_x_scale,\n\ + float spatial_y_scale,\n\ + float in_width,\n\ + float in_height,\n\ + float rcp_of_out_width,\n\ + float rcp_of_out_height,\n\ + float sampling_x_ratio,\n\ + float sampling_y_ratio,\n\ + int depth\n\ +)\n\ +{\n\ + int px = get_global_id(0);\n\ + int py = get_global_id(1);\n\ + int pw = get_global_id(2);\n\ +\n\ + int roi_batch = read_imagei(n_rois, (int2)(pw, 0)).x;\n\ + float4 roi_x = convert_float4(read_imageui(rois, (int2)(0, pw)));\n\ + float4 roi_y = convert_float4(read_imageui(rois, (int2)(1, pw)));\n\ + float4 roi_z = convert_float4(read_imageui(rois, (int2)(2, pw)));\n\ + float4 roi_w = convert_float4(read_imageui(rois, (int2)(3, pw)));\n\ + float4 roi = (float4)(roi_x.x, roi_y.x, roi_z.x, roi_w.x);\n\ +\n\ + float4 roi_anchor = roi * (float4)(spatial_x_scale, spatial_y_scale, spatial_x_scale, spatial_y_scale);\n\ + float2 roi_dims = fmax(roi_anchor.zw - roi_anchor.xy, 1.0f);\n\ +\n\ + float2 spatial_indx = (float2)(px, py);\n\ + float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height);\n\ + float2 max_spatial_dims = (float2)(in_width, in_height);\n\ +\n\ + float2 bin_size = roi_dims * pooled_dims;\n\ + float2 region_start = spatial_indx * bin_size + roi_anchor.xy;\n\ + float2 region_end = region_start + bin_size;\n\ +\n\ + float2 roi_bin_grid = (float2)(sampling_x_ratio, sampling_y_ratio);\n\ +\n\ + roi_bin_grid = roi_bin_grid == 0 ? ceil(bin_size - EPS_GRID) : roi_bin_grid;\n\ +\n\ + int kz = roi_batch * depth;\n\ + float2 rcp_of_grid_size = 1.0f / roi_bin_grid;\n\ + int2 grid_size_xy = convert_int2(roi_bin_grid);\n\ + float4 interp;\n\ + int kz1 = pw * depth;\n\ + for (int pz = 0; pz < depth; pz ++, kz ++, kz1 ++)\n\ + {\n\ + interp.x = roi_align_1x1_U8toF32( input,\n\ + input_scale,\n\ + input_tail,\n\ + region_start,\n\ + region_end,\n\ + bin_size,\n\ + grid_size_xy,\n\ + rcp_of_grid_size,\n\ + kz);\n\ +\n\ + uint4 dst;\n\ + interp.x = interp.x * output_scale + output_zp;\n\ + interp.x = interp.x < 255 ? 
interp.x : 255;\n\ + dst.x = convert_uint_rte(interp.x);\n\ + write_imageui(output, (int4)(px, py, kz1, 0), dst.xxxx);\n\ + }\n\ }"; /* end of roi_align_cl*/ static const char scatter_nd_cl[] = "__kernel void scatter_nd_U32toU32_1D(\n\ @@ -61471,6 +59733,334 @@ TOPK_I32(1 << 5, 5)\n\ TOPK_I32(1 << 6, 6)\n\ "; /* end of topk_cl*/ +static const char topk_odd_even_sort_cl[] = "#define LOCAL_SIZE_X (32)\n\ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_F32toF32_I32\n\ + (\n\ + __read_only image2d_t input,\n\ + image2d_t input_t,\n\ + image2d_t indices_t,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t indices,\n\ + int width\n\ + )\n\ + {\n\ + uint lid = get_local_id(0);\n\ + uint work_group_size = get_local_size(0);\n\ + uint offset = 0;\n\ +\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ +\n\ + write_imagef(input_t, coord.xy, data);\n\ + write_imagei(indices_t, coord.xy, coord.xxxx);\n\ + }\n\ +\n\ + __local int sorted[1];\n\ + int width_minus_one = width - 1;\n\ + int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;\n\ + num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);\n\ +\n\ + int x_start = lid * num_pixels_per_thread;\n\ + int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);\n\ +\n\ + sorted[0] = 0;\n\ +\n\ + while (1)\n\ + {\n\ + if (lid == 0)\n\ + {\n\ + *sorted = 0;\n\ + }\n\ + int swapped = 0;\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ +\n\ + // odd-even\n\ + coord.x = x_start;\n\ + coord.z = x_start + 1;\n\ + for (; coord.x < x_end; )\n\ + {\n\ + float4 left = read_imagef(input_t, coord.xy);\n\ + float4 right = read_imagef(input_t, coord.zy);\n\ +\n\ + if (left.x < right.x)\n\ + {\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ + swapped = 1;\n\ +\n\ + write_imagef(input_t, coord.xy, right);\n\ + write_imagef(input_t, coord.zy, left);\n\ +\n\ + write_imagei(indices_t, coord.xy, r_index);\n\ + write_imagei(indices_t, coord.zy, l_index);\n\ + }\n\ +\n\ + coord.xz = coord.xz + 2;\n\ + }\n\ +\n\ + // even-odd\n\ + coord.x = x_start + 1;\n\ + coord.z = x_start + 2;\n\ + for (; coord.x < x_end; )\n\ + {\n\ + float4 left = read_imagef(input_t, coord.xy);\n\ + float4 right = read_imagef(input_t, coord.zy);\n\ +\n\ + if (left.x < right.x)\n\ + {\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ + swapped = 1;\n\ +\n\ + write_imagef(input_t, coord.xy, right);\n\ + write_imagef(input_t, coord.zy, left);\n\ +\n\ + write_imagei(indices_t, coord.xy, r_index);\n\ + write_imagei(indices_t, coord.zy, l_index);\n\ + }\n\ +\n\ + coord.xz = coord.xz + 2;\n\ + }\n\ +\n\ + atomic_add(sorted, swapped);\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ +\n\ + if (*sorted == 0)\n\ + break;\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ + }\n\ +\n\ + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X)\n\ + {\n\ + float4 data = read_imagef(input_t, coord.xy);\n\ + int4 index = read_imagei(indices_t, coord.xy);\n\ +\n\ + write_imagef(output, coord.xy, data);\n\ + write_imagei(indices, coord.xy, index);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_U32toU32_I32\n\ + (\n\ + __read_only image2d_t input,\n\ + 
image2d_t input_t,\n\ + image2d_t indices_t,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t indices,\n\ + int width\n\ + )\n\ + {\n\ + uint lid = get_local_id(0);\n\ + uint work_group_size = get_local_size(0);\n\ + uint offset = 0;\n\ +\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ +\n\ + write_imageui(input_t, coord.xy, data);\n\ + write_imagei(indices_t, coord.xy, coord.xxxx);\n\ + }\n\ +\n\ + __local int sorted[1];\n\ + int width_minus_one = width - 1;\n\ + int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;\n\ + num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);\n\ +\n\ + int x_start = lid * num_pixels_per_thread;\n\ + int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);\n\ +\n\ + sorted[0] = 0;\n\ +\n\ + while (1)\n\ + {\n\ + if (lid == 0)\n\ + {\n\ + *sorted = 0;\n\ + }\n\ + int swapped = 0;\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ +\n\ + // odd-even\n\ + coord.x = x_start;\n\ + coord.z = x_start + 1;\n\ + for (; coord.x < x_end; )\n\ + {\n\ + uint4 left = read_imageui(input_t, coord.xy);\n\ + uint4 right = read_imageui(input_t, coord.zy);\n\ +\n\ + if (left.x < right.x)\n\ + {\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ + swapped = 1;\n\ +\n\ + write_imageui(input_t, coord.xy, right);\n\ + write_imageui(input_t, coord.zy, left);\n\ +\n\ + write_imagei(indices_t, coord.xy, r_index);\n\ + write_imagei(indices_t, coord.zy, l_index);\n\ + }\n\ +\n\ + coord.xz = coord.xz + 2;\n\ + }\n\ +\n\ + // even-odd\n\ + coord.x = x_start + 1;\n\ + coord.z = x_start + 2;\n\ + for (; coord.x < x_end; )\n\ + {\n\ + uint4 left = read_imageui(input_t, coord.xy);\n\ + uint4 right = read_imageui(input_t, coord.zy);\n\ +\n\ + if (left.x < right.x)\n\ + {\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ + swapped = 1;\n\ +\n\ + write_imageui(input_t, coord.xy, right);\n\ + write_imageui(input_t, coord.zy, left);\n\ +\n\ + write_imagei(indices_t, coord.xy, r_index);\n\ + write_imagei(indices_t, coord.zy, l_index);\n\ + }\n\ +\n\ + coord.xz = coord.xz + 2;\n\ + }\n\ +\n\ + atomic_add(sorted, swapped);\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ +\n\ + if (*sorted == 0)\n\ + break;\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ + }\n\ +\n\ + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X)\n\ + {\n\ + uint4 data = read_imageui(input_t, coord.xy);\n\ + int4 index = read_imagei(indices_t, coord.xy);\n\ +\n\ + write_imageui(output, coord.xy, data);\n\ + write_imagei(indices, coord.xy, index);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_I32toI32_I32\n\ + (\n\ + __read_only image2d_t input,\n\ + image2d_t input_t,\n\ + image2d_t indices_t,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t indices,\n\ + int width\n\ + )\n\ + {\n\ + uint lid = get_local_id(0);\n\ + uint work_group_size = get_local_size(0);\n\ + uint offset = 0;\n\ +\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X)\n\ + {\n\ + int4 data = read_imagei(input, coord.xy);\n\ +\n\ + write_imagei(input_t, coord.xy, data);\n\ + write_imagei(indices_t, 
coord.xy, coord.xxxx);\n\ + }\n\ +\n\ + __local int sorted[1];\n\ + int width_minus_one = width - 1;\n\ + int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;\n\ + num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);\n\ +\n\ + int x_start = lid * num_pixels_per_thread;\n\ + int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);\n\ +\n\ + sorted[0] = 0;\n\ +\n\ + while (1)\n\ + {\n\ + if (lid == 0)\n\ + {\n\ + *sorted = 0;\n\ + }\n\ + int swapped = 0;\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ +\n\ + // odd-even\n\ + coord.x = x_start;\n\ + coord.z = x_start + 1;\n\ + for (; coord.x < x_end; )\n\ + {\n\ + int4 left = read_imagei(input_t, coord.xy);\n\ + int4 right = read_imagei(input_t, coord.zy);\n\ +\n\ + if (left.x < right.x)\n\ + {\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ + swapped = 1;\n\ +\n\ + write_imagei(input_t, coord.xy, right);\n\ + write_imagei(input_t, coord.zy, left);\n\ +\n\ + write_imagei(indices_t, coord.xy, r_index);\n\ + write_imagei(indices_t, coord.zy, l_index);\n\ + }\n\ +\n\ + coord.xz = coord.xz + 2;\n\ + }\n\ +\n\ + // even-odd\n\ + coord.x = x_start + 1;\n\ + coord.z = x_start + 2;\n\ + for (; coord.x < x_end; )\n\ + {\n\ + int4 left = read_imagei(input_t, coord.xy);\n\ + int4 right = read_imagei(input_t, coord.zy);\n\ +\n\ + if (left.x < right.x)\n\ + {\n\ + int4 l_index = read_imagei(indices_t, coord.xy);\n\ + int4 r_index = read_imagei(indices_t, coord.zy);\n\ + swapped = 1;\n\ +\n\ + write_imagei(input_t, coord.xy, right);\n\ + write_imagei(input_t, coord.zy, left);\n\ +\n\ + write_imagei(indices_t, coord.xy, r_index);\n\ + write_imagei(indices_t, coord.zy, l_index);\n\ + }\n\ +\n\ + coord.xz = coord.xz + 2;\n\ + }\n\ +\n\ + atomic_add(sorted, swapped);\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ +\n\ + if (*sorted == 0)\n\ + break;\n\ + barrier(CLK_GLOBAL_MEM_FENCE);\n\ + }\n\ +\n\ + for (coord.x = lid; coord.x < width; coord.x += LOCAL_SIZE_X)\n\ + {\n\ + int4 data = read_imagei(input_t, coord.xy);\n\ + int4 index = read_imagei(indices_t, coord.xy);\n\ +\n\ + write_imagei(output, coord.xy, data);\n\ + write_imagei(indices, coord.xy, index);\n\ + }\n\ +}"; /* end of topk_odd_even_sort_cl*/ + static const char upsample_cl[] = "\n\ #define UPSAMPLE_PROCESS(data_type, read_fun, write_fun) \\\n\ data_type src = 0; \\\n\ @@ -61704,6 +60294,10 @@ static const source_map_t evis_resource[] = {"clip_U8_vx", clip_U8_vx}, {"conv1d_ovxlib_vx", conv1d_ovxlib_vx}, {"conv1d_ovxlib_k1024_vx", conv1d_ovxlib_k1024_vx}, + {"cumsum_vx", cumsum_vx}, + {"cumsum_2d_vx", cumsum_2d_vx}, + {"cumsum_bf16_vx", cumsum_bf16_vx}, + {"cumsum_f16_u8_vx", cumsum_f16_u8_vx}, {"custom_softmax_vx", custom_softmax_vx}, {"custom_warp_affine_vx", custom_warp_affine_vx}, {"custom_warp_perspective_vx", custom_warp_perspective_vx}, @@ -61733,14 +60327,9 @@ static const source_map_t evis_resource[] = {"gather_nd_3d_mix_vx", gather_nd_3d_mix_vx}, {"gather_nd_mix_vx", gather_nd_mix_vx}, {"get_matrix_vx", get_matrix_vx}, - {"group_normalization_f16_vx", group_normalization_f16_vx}, - {"group_normalization_f16_scale_vx", group_normalization_f16_scale_vx}, - {"group_normalization_i16_vx", group_normalization_i16_vx}, - {"group_normalization_i16_scale_vx", group_normalization_i16_scale_vx}, - {"group_normalization_i8_vx", group_normalization_i8_vx}, - {"group_normalization_i8_scale_vx", group_normalization_i8_scale_vx}, - {"group_normalization_u8_vx", 
group_normalization_u8_vx}, - {"group_normalization_u8_f16_vx", group_normalization_u8_f16_vx}, + {"group_normalization_0_vx", group_normalization_0_vx}, + {"group_normalization_1_vx", group_normalization_1_vx}, + {"group_normalization_2_vx", group_normalization_2_vx}, {"grucell_activation_vx", grucell_activation_vx}, {"grucell_activation_sma_vx", grucell_activation_sma_vx}, {"grucell_activation_z_h_vx", grucell_activation_z_h_vx}, @@ -61749,26 +60338,17 @@ static const source_map_t evis_resource[] = {"grucell_h_times_activation_r_vx", grucell_h_times_activation_r_vx}, {"grucell_reset_after_activation_vx", grucell_reset_after_activation_vx}, {"hswish_vx", hswish_vx}, - {"instance_normalization_f16_vx", instance_normalization_f16_vx}, - {"instance_normalization_i16_vx", instance_normalization_i16_vx}, - {"instance_normalization_i8_vx", instance_normalization_i8_vx}, - {"instance_normalization_scale_f32_vx", instance_normalization_scale_f32_vx}, - {"instance_normalization_scale_f32_bf16_vx", instance_normalization_scale_f32_bf16_vx}, - {"instance_normalization_scale_f32_f16_vx", instance_normalization_scale_f32_f16_vx}, - {"instance_normalization_u8_vx", instance_normalization_u8_vx}, - {"instance_normalization_u8_f16_vx", instance_normalization_u8_f16_vx}, + {"instance_normalization_0_vx", instance_normalization_0_vx}, + {"instance_normalization_1_vx", instance_normalization_1_vx}, + {"instance_normalization_2_vx", instance_normalization_2_vx}, + {"instance_normalization_3_vx", instance_normalization_3_vx}, {"l2normalizescale_axis0_vx", l2normalizescale_axis0_vx}, + {"l2normalizescale_axis0_2d_vx", l2normalizescale_axis0_2d_vx}, {"l2normalizescale_axis1_vx", l2normalizescale_axis1_vx}, - {"layer_normalization_vx", layer_normalization_vx}, - {"layer_normalization_2d_vx", layer_normalization_2d_vx}, - {"layer_normalization_i16_vx", layer_normalization_i16_vx}, - {"layer_normalization_scale_f32_vx", layer_normalization_scale_f32_vx}, - {"layer_normalization_scale_f32_2d_vx", layer_normalization_scale_f32_2d_vx}, - {"layer_normalization_scale_f32_bf16_vx", layer_normalization_scale_f32_bf16_vx}, - {"layer_normalization_u8_f16_vx", layer_normalization_u8_f16_vx}, - {"layer_normalization_wh_f16_vx", layer_normalization_wh_f16_vx}, - {"layer_normalization_wh_i16_vx", layer_normalization_wh_i16_vx}, - {"layer_normalization_wh_u8_vx", layer_normalization_wh_u8_vx}, + {"layer_normalization_0_vx", layer_normalization_0_vx}, + {"layer_normalization_1_vx", layer_normalization_1_vx}, + {"layer_normalization_2_vx", layer_normalization_2_vx}, + {"layer_normalization_3_vx", layer_normalization_3_vx}, {"log_softmax_axis0_vx", log_softmax_axis0_vx}, {"log_softmax_axis0_BF16_vx", log_softmax_axis0_BF16_vx}, {"log_softmax_axis1_vx", log_softmax_axis1_vx}, @@ -61815,6 +60395,7 @@ static const source_map_t evis_resource[] = {"maximum_1_vx", maximum_1_vx}, {"minimum_0_vx", minimum_0_vx}, {"minimum_1_vx", minimum_1_vx}, + {"mod_vx", mod_vx}, {"moments_axis0_vx", moments_axis0_vx}, {"moments_axis01_vx", moments_axis01_vx}, {"moments_axis012_vx", moments_axis012_vx}, @@ -61827,12 +60408,7 @@ static const source_map_t evis_resource[] = {"poolwithargmax_I16_vx", poolwithargmax_I16_vx}, {"poolwithargmax_I8_vx", poolwithargmax_I8_vx}, {"poolwithargmax_U8_vx", poolwithargmax_U8_vx}, - {"pow_fp16_vx", pow_fp16_vx}, - {"pow_fp16_i16_vx", pow_fp16_i16_vx}, - {"pow_fp16_i8_vx", pow_fp16_i8_vx}, - {"pow_i16_vx", pow_i16_vx}, - {"pow_i8_vx", pow_i8_vx}, - {"pow_u8_vx", pow_u8_vx}, + {"pow_vx", pow_vx}, {"pre_process_bgra_vx", 
pre_process_bgra_vx}, {"pre_process_gray_vx", pre_process_gray_vx}, {"pre_process_gray_2_vx", pre_process_gray_2_vx}, @@ -61844,6 +60420,9 @@ static const source_map_t evis_resource[] = {"pre_process_rgb888_planar_0_vx", pre_process_rgb888_planar_0_vx}, {"pre_process_rgb888_planar_1_vx", pre_process_rgb888_planar_1_vx}, {"pre_process_rgb888_planar_2_vx", pre_process_rgb888_planar_2_vx}, + {"pre_process_rgb888_planar_sep_0_vx", pre_process_rgb888_planar_sep_0_vx}, + {"pre_process_rgb888_planar_sep_1_vx", pre_process_rgb888_planar_sep_1_vx}, + {"pre_process_rgb888_planar_sep_2_vx", pre_process_rgb888_planar_sep_2_vx}, {"pre_process_rgb_copy_vx", pre_process_rgb_copy_vx}, {"pre_process_yuv420_copy_u8_vx", pre_process_yuv420_copy_u8_vx}, {"pre_process_yuv420_scale_fp16_vx", pre_process_yuv420_scale_fp16_vx}, @@ -61893,7 +60472,9 @@ static const source_map_t evis_resource[] = {"resize_bilinear_U8_half_pixel_centers_1_vx", resize_bilinear_U8_half_pixel_centers_1_vx}, {"resize_bilinear_U8_half_pixel_centers_2_vx", resize_bilinear_U8_half_pixel_centers_2_vx}, {"resize_bilinear_U8_opt_vx", resize_bilinear_U8_opt_vx}, + {"resize_bilinear_align_corners_vx", resize_bilinear_align_corners_vx}, {"resize_bilinear_nhwc_vx", resize_bilinear_nhwc_vx}, + {"resize_bilinear_nhwc_bound_vx", resize_bilinear_nhwc_bound_vx}, {"resize_nearest_vx", resize_nearest_vx}, {"scatter_nd_vx", scatter_nd_vx}, {"scatter_nd_big_vx", scatter_nd_big_vx}, @@ -61934,6 +60515,8 @@ static const source_map_t cl_resource[] = {"clip_F32_cl", clip_F32_cl}, {"clip_I32_cl", clip_I32_cl}, {"clip_U8_cl", clip_U8_cl}, + {"cumsum_cl", cumsum_cl}, + {"cumsum_2d_cl", cumsum_2d_cl}, {"depth2space_crd_cl", depth2space_crd_cl}, {"detect_post_box_cl", detect_post_box_cl}, {"eltwise_ops_helper_cl", eltwise_ops_helper_cl}, @@ -61955,7 +60538,6 @@ static const source_map_t cl_resource[] = {"grucell_h_times_activation_r_cl", grucell_h_times_activation_r_cl}, {"grucell_reset_after_activation_cl", grucell_reset_after_activation_cl}, {"hswish_cl", hswish_cl}, - {"instance_normalization_f16_cl", instance_normalization_f16_cl}, {"instance_normalization_f32_cl", instance_normalization_f32_cl}, {"instance_normalization_i32_cl", instance_normalization_i32_cl}, {"instance_normalization_u8_cl", instance_normalization_u8_cl}, @@ -61992,7 +60574,10 @@ static const source_map_t cl_resource[] = {"matrixmul_cl", matrixmul_cl}, {"matrixmul_transA_cl", matrixmul_transA_cl}, {"maximum_cl", maximum_cl}, + {"maxpoolwithargmax_cl", maxpoolwithargmax_cl}, + {"maxpoolwithargmax_2d_cl", maxpoolwithargmax_2d_cl}, {"minimum_cl", minimum_cl}, + {"mod_cl", mod_cl}, {"moments_axis0_cl", moments_axis0_cl}, {"moments_axis01_cl", moments_axis01_cl}, {"moments_axis012_cl", moments_axis012_cl}, @@ -62036,6 +60621,7 @@ static const source_map_t cl_resource[] = {"swish_cl", swish_cl}, {"tile_cl", tile_cl}, {"topk_cl", topk_cl}, + {"topk_odd_even_sort_cl", topk_odd_even_sort_cl}, {"upsample_cl", upsample_cl}, }; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c b/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c index f1141ba..8fece69 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c @@ -63,43 +63,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - /* check inputs outputs data type */ - BEGIN_IO_TYPE_DECL(ABS, 1, 1) - /* IO_TYPE(INPUT, OUTPUT) */ - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_F32, D_BF16) + vsi_bool ret = vsi_nn_OpCheck(VSI_NN_OP_RELU, self, inputs, outputs); - IO_TYPE(D_F16, D_F32) - 
IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I16|Q_DFP) - - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_BF16, D_F32) - - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_F16) - - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16) - END_IO_TYPE_DECL(ABS) - if(!VALIDATE_OP_IO_TYPES(ABS, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } - - return TRUE; + return ret; } /* op_check() */ #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c index 073d063..46e689c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_addn.c @@ -154,7 +154,10 @@ static vsi_bool op_setup attr.dim_num = VSI_NN_DIM_AUTO; attr.vtl = TRUE; attr.is_const = FALSE; - if (_is_float32_data_format(self, inputs, outputs)) + if (VSI_NN_TYPE_INT32 == outputs[0]->attr.dtype.vx_type){ + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + } + else if(_is_float32_data_format( self, inputs, outputs )) { attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c index 06d439b..b0eea1f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c @@ -350,19 +350,19 @@ static vsi_bool _dynamic_check /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(BATCHNORM_SINGLE, 5, 1) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) END_IO_TYPE_DECL(BATCHNORM_SINGLE) if(!VALIDATE_OP_IO_TYPES(BATCHNORM_SINGLE, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, @@ -399,24 
+399,33 @@ static vsi_bool _static_check ) { BEGIN_IO_TYPE_DECL(BATCH_NORM, 5, 1) - IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16) - IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F16) - IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_BF16) - IO_TYPE(D_BF16, D_F32, D_F32, D_F32, D_F32, D_BF16) - IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I8|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32, D_F32, D_F32, D_F32, D_BF16) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F32, D_F32, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F32, D_F32, D_F16) IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F32, D_F32, D_I16|Q_SYM) IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_F16) END_IO_TYPE_DECL(BATCH_NORM) if (!VALIDATE_OP_IO_TYPES(BATCH_NORM, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c index 4ffe7ed..1eaa783 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include @@ -154,13 +153,25 @@ static vsi_bool op_check IO_TYPE(D_U32, D_U32) IO_TYPE(D_U32, D_BOOL8) IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM) IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) IO_TYPE(D_F16, D_U8|Q_ASYM) IO_TYPE(D_F16, D_BOOL8) IO_TYPE(D_I16|Q_DFP, D_F16) IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_BOOL8) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_I8|Q_DFP) 
+ IO_TYPE(D_I16|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_BOOL8) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_BOOL8) IO_TYPE(D_I16, D_F16) IO_TYPE(D_I16, D_I8|Q_DFP) IO_TYPE(D_I16, D_U8|Q_ASYM) @@ -172,6 +183,14 @@ static vsi_bool op_check IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_DFP, D_BOOL8) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_BOOL8) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_BOOL8) IO_TYPE(D_I8, D_F16) IO_TYPE(D_I8, D_I16|Q_DFP) IO_TYPE(D_I8, D_U8|Q_ASYM) @@ -191,10 +210,18 @@ static vsi_bool op_check IO_TYPE(D_U8, D_U32) IO_TYPE(D_U8, D_F32) IO_TYPE(D_F32, D_I16|Q_DFP) + IO_TYPE(D_F32, D_I16|Q_ASYM) + IO_TYPE(D_F32, D_I16|Q_SYM) IO_TYPE(D_F32, D_I8|Q_DFP) + IO_TYPE(D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F32, D_I8|Q_SYM) IO_TYPE(D_F32, D_U8|Q_ASYM) IO_TYPE(D_I32, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I16|Q_ASYM) + IO_TYPE(D_I32, D_I16|Q_SYM) IO_TYPE(D_I32, D_I8|Q_DFP) + IO_TYPE(D_I32, D_I8|Q_ASYM) + IO_TYPE(D_I32, D_I8|Q_SYM) IO_TYPE(D_I32, D_U8|Q_ASYM) IO_TYPE(D_F16, D_F32) IO_TYPE(D_F16, D_I32) @@ -204,7 +231,11 @@ static vsi_bool op_check IO_TYPE(D_F16, D_F16) IO_TYPE(D_BOOL8, D_F16) IO_TYPE(D_BOOL8, D_I16|Q_DFP) + IO_TYPE(D_BOOL8, D_I16|Q_ASYM) + IO_TYPE(D_BOOL8, D_I16|Q_SYM) IO_TYPE(D_BOOL8, D_I8|Q_DFP) + IO_TYPE(D_BOOL8, D_I8|Q_ASYM) + IO_TYPE(D_BOOL8, D_I8|Q_SYM) IO_TYPE(D_BOOL8, D_U8|Q_ASYM) IO_TYPE(D_BOOL8, D_BOOL8) IO_TYPE(D_BOOL8, D_I16) @@ -212,12 +243,16 @@ static vsi_bool op_check IO_TYPE(D_BOOL8, D_U8) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_F32) IO_TYPE(D_U8|Q_ASYM, D_I32) IO_TYPE(D_BF16, D_BF16) END_IO_TYPE_DECL(CAST) - if(!VALIDATE_OP_IO_TYPES(CAST, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(CAST, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -228,7 +263,6 @@ static vsi_bool op_check return TRUE; } /* op_check() */ - static vsi_status op_optimize ( vsi_nn_node_t * self, @@ -249,7 +283,6 @@ static vsi_status op_optimize return status; } /* op_optimize() */ - static vsi_bool op_setup ( vsi_nn_node_t * self, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_ceil.c b/src/tim/vx/internal/src/ops/vsi_nn_op_ceil.c index 69dbfd5..6fd097c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_ceil.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_ceil.c @@ -42,30 +42,11 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(CEIL, 1, 1) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - END_IO_TYPE_DECL(CEIL) - if (!VALIDATE_OP_IO_TYPES(CEIL, self, inputs, self->input.num, outputs, self->output.num)) - { - char* desc = 
generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } + vsi_bool ret = FALSE; - return TRUE; + ret = vsi_nn_OpCheck(VSI_NN_OP_FLOOR, self, inputs, outputs); + + return ret; } /* op_check() */ static vsi_status op_compute diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c index 6e7288b..b2b01f5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c @@ -151,6 +151,13 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) IO_TYPE(D_I16|Q_SYM, D_F16) IO_TYPE(D_BF16, D_BF16) + + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(CLIP) if (!VALIDATE_OP_IO_TYPES(CLIP, self, inputs, self->input.num, outputs, self->output.num)) { @@ -249,7 +256,6 @@ static vsi_bool op_setup return ret; } /* op_init() */ - #ifdef __cplusplus extern "C" { #endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c index 01721e6..5ebe3cf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c @@ -386,10 +386,18 @@ static vsi_bool op_setup if ( (self->nn_param.conv1d.ksize == 1024 && self->nn_param.conv1d.dilation == 1) || (self->nn_param.conv1d.ksize == 3 && self->nn_param.conv1d.dilation > 7) ) { - if (self->nn_param.conv1d.stride == 1 && self->nn_param.conv1d.multiplier == 0) + int32_t ksize = self->nn_param.conv1d.ksize; + int32_t stride = self->nn_param.conv1d.stride; + int32_t dilation = self->nn_param.conv1d.dilation; + int32_t real_kernel = ((ksize - 1) * dilation + ksize + stride - 1) / stride; +#define MAX_CONV1D_KERNEL_SIZE (255) + + if (self->nn_param.conv1d.stride == 1 && self->nn_param.conv1d.multiplier == 0 && + real_kernel > MAX_CONV1D_KERNEL_SIZE) { +#undef MAX_CONV1D_KERNEL_SIZE self->nn_param.conv1d.local->use_ovxlib_kernel = TRUE; - if ((p->pad[0] || p->pad[1]) && (inputs[0]->attr.size[0] >= 65535)) + if ((p->pad[0] || p->pad[1]) && (inputs[0]->attr.size[0] >= GPU_TENSOR_MAX_WIDTH)) { vsi_nn_tensor_attr_t attr; vsi_nn_internal_node_t* curr = NULL; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c index 7dbe943..ba50ffd 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c @@ -223,6 +223,60 @@ static vsi_bool op_check IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + 
IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_F32) + + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_F16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_NONE, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_F32) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_F32) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_F32) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) @@ -244,6 +298,29 @@ static vsi_bool op_check IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, 
D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I8|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_BF16) @@ -281,6 +358,9 @@ static vsi_bool op_check IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) IO_TYPE(D_F32, D_BF16, D_F32, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U8|Q_ASYM) + + /* HW 9.1.1 */ IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) @@ -295,6 +375,18 @@ static vsi_bool op_check IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U4|Q_ASYM) IO_TYPE(D_I4|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I4|Q_DFP) + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_I4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I4|Q_SYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM, D_I4|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I4|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I4|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U4|Q_ASYM) + END_IO_TYPE_DECL(CONV2D) ret = VALIDATE_OP_IO_TYPES(CONV2D, self, inputs, self->input.num, outputs, self->output.num); if(!ret) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c index 098c935..6aaa61d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c @@ -35,14 +35,10 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" -#define _ARG_NUM (3) #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) static vsi_status op_compute ( @@ -51,92 +47,7 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VSI_FAILURE; - vx_nn_stride_slice_params_t param; - vsi_nn_tensor_t *begin_dims_tensor = NULL; - vsi_nn_tensor_t *end_dims_tensor = NULL; - vsi_nn_tensor_t *stride_dims_tensor = NULL; - vsi_nn_tensor_attr_t attr; - vsi_size_t start[VSI_NN_MAX_DIM_NUM] = {0}; - vsi_size_t end[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t stride[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t i; - - memset(¶m, 0, sizeof(vx_nn_stride_slice_params_t)); - - for (i = 0; i < self->nn_param.crop.dims; i++) - { - start[i] = self->nn_param.crop.offset[i]; - end[i] = self->nn_param.crop.offset[i] + outputs[0]->attr.size[i]; - stride[i] = 1; - } - - for (i = self->nn_param.crop.dims; i < inputs[0]->attr.dim_num; i++) - { - start[i] = 0; - end[i] = outputs[0]->attr.size[i]; - stride[i] = 1; - } - - memset(&attr, 0, sizeof(attr)); - attr.size[0] = inputs[0]->attr.dim_num; - attr.dim_num = 1; - attr.is_const = TRUE; - attr.dtype.vx_type = VSI_NN_TYPE_INT32; - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - begin_dims_tensor = vsi_nn_CreateTensorFromData( - self->graph, - (uint8_t *)start, - 
&attr); - if( NULL == begin_dims_tensor ) - { - VSILOGE("Create begin_dims_tensor fail.(crop)"); - return VSI_FAILURE; - } - - end_dims_tensor = vsi_nn_CreateTensorFromData( - self->graph, - (uint8_t *)end, - &attr); - if( NULL == end_dims_tensor ) - { - VSILOGE("Create end_dims_tensor fail.(crop)"); - status = VSI_FAILURE; - goto OnError; - } - - stride_dims_tensor = vsi_nn_CreateTensorFromData( - self->graph, - (uint8_t *)stride, - &attr); - if( NULL == stride_dims_tensor ) - { - VSILOGE("Create stride_dims_tensor fail.(crop)"); - status = VSI_FAILURE; - goto OnError; - } - - param.begin_dims = REQUIRED_IO(begin_dims_tensor); - param.end_dims = REQUIRED_IO(end_dims_tensor); - param.stride_dims = REQUIRED_IO(stride_dims_tensor); - - self->n = vxTensorStrideSliceNode( - self->graph->g, - inputs[0]->t, - ¶m, - sizeof(vx_nn_stride_slice_params_t), - outputs[0]->t - ); - - if( NULL != self->n ) - { - status = VSI_SUCCESS; - } -OnError: - if (begin_dims_tensor) vsi_nn_ReleaseTensor(&begin_dims_tensor); - if (end_dims_tensor) vsi_nn_ReleaseTensor(&end_dims_tensor); - if (stride_dims_tensor) vsi_nn_ReleaseTensor(&stride_dims_tensor); - return status; + return vsi_nn_internal_compute_node( self ); } /* op_compute() */ static vsi_bool op_check @@ -153,6 +64,17 @@ static vsi_bool op_check return ret; } +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} + static vsi_bool op_setup ( vsi_nn_node_t * self, @@ -160,27 +82,32 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_nn_crop_param * p; - int32_t i; + vsi_nn_crop_param * p = NULL; + int32_t i = 0; + uint32_t j = 0; + vsi_nn_internal_node_t* curr = NULL; + + vsi_nn_internal_init_node_wksp( self ); p = (vsi_nn_crop_param *)&(self->nn_param.crop); + if (p->axis >= (int32_t)inputs[0]->attr.dim_num) { VSILOGE("Invalid parameter: axis!\n"); return FALSE; } - if( VSI_NN_DIM_AUTO != outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO != outputs[0]->attr.dim_num ) { - return TRUE; + goto final; } if (p->dims + p->axis == inputs[0]->attr.dim_num) { - for(i = 0; i < p->axis; i++) + for (i = 0; i < p->axis; i++) { outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; } - for(i = p->axis; i < (int32_t)inputs[0]->attr.dim_num; i++) + for (i = p->axis; i < (int32_t)inputs[0]->attr.dim_num; i++) { outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; } @@ -190,12 +117,12 @@ static vsi_bool op_setup { if (p->dims == 1) { - for(i = 0; i <= p->axis; i++) + for (i = 0; i <= p->axis; i++) { outputs[0]->attr.size[i] = inputs[1]->attr.size[i]; p->offset[i] = p->offset[0]; } - for(i = p->axis + 1; i < (int32_t)inputs[0]->attr.dim_num; i++) + for (i = p->axis + 1; i < (int32_t)inputs[0]->attr.dim_num; i++) { outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; } @@ -208,9 +135,78 @@ static vsi_bool op_setup } } +final: + for (j = 0; j < self->nn_param.crop.dims; j++) + { + p->lcl_data->begin_dims[j] = (int32_t)self->nn_param.crop.offset[j]; + p->lcl_data->end_dims[j] = (int32_t)self->nn_param.crop.offset[j] + (int32_t)outputs[0]->attr.size[j]; + p->lcl_data->stride_dims[j] = 1; + } + + for (j = self->nn_param.crop.dims; j < inputs[0]->attr.dim_num; j++) + { + p->lcl_data->begin_dims[j] = 0; + p->lcl_data->end_dims[j] = (int32_t)outputs[0]->attr.size[j]; + p->lcl_data->stride_dims[j] = 1; + } + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 0, 0 ); + 
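/* Editor's note -- not part of the patch: the rewritten CROP op_setup above lowers
 * the crop to an internal STRIDED_SLICE node. begin_dims/end_dims come straight from
 * the crop offsets and the output extent, with stride fixed at 1 on every axis; for
 * example, offset = {2, 3} with an output size of {10, 20} would give
 * begin_dims = {2, 3}, end_dims = {12, 23}, stride_dims = {1, 1}. The parameter
 * wiring of the internal node continues below. */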
curr->node->nn_param.strided_slice.begin_dims = p->lcl_data->begin_dims; + curr->node->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num; + curr->node->nn_param.strided_slice.end_dims = p->lcl_data->end_dims; + curr->node->nn_param.strided_slice.end_dims_num = inputs[0]->attr.dim_num; + curr->node->nn_param.strided_slice.stride_dims = p->lcl_data->stride_dims; + curr->node->nn_param.strided_slice.stride_dims_num = inputs[0]->attr.dim_num; + curr->node->nn_param.strided_slice.begin_mask = 0; + curr->node->nn_param.strided_slice.end_mask = 0; + curr->node->nn_param.strided_slice.shrink_axis_mask = 0; + curr->node->nn_param.strided_slice.new_axis_mask = 0; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node( self, curr ); + return TRUE; } /* op_setup() */ +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_crop_param * p = NULL; + + p = &(self->nn_param.crop); + + p->lcl_data = (vsi_nn_crop_lcl_data *)malloc(sizeof(vsi_nn_crop_lcl_data)); + if (NULL == p->lcl_data) + { + return VSI_FAILURE; + } + memset(p->lcl_data, 0, sizeof(vsi_nn_crop_lcl_data)); + + return status; +} + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_crop_param * p = NULL; + + p = &(self->nn_param.crop); + + vsi_nn_safe_free(p->lcl_data); + + vsi_nn_internal_deinit_node_wksp( self ); + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + + + #ifdef __cplusplus extern "C" { #endif @@ -218,12 +214,12 @@ extern "C" { DEF_OP_REG ( /* op_name */ CROP, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, + /* deinit */ op_deinit, /* check */ op_check, /* setup */ op_setup, - /* optimize */ NULL, + /* optimize */ op_optimize, /* input_num */ 2, /* output_num */ 1 ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c b/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c new file mode 100644 index 0000000..d976b13 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_cumsum.c @@ -0,0 +1,178 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _ARG_NUM (1) +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + + int32_t axis = self->nn_param.cumsum.axis; + int32_t exclusive = (int32_t)self->nn_param.cumsum.exclusive; + int32_t reverse = (int32_t)self->nn_param.cumsum.reverse; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "axis", (int32_t)axis ); + vsi_nn_kernel_param_add_int32( param, "exclusive", (int32_t)exclusive ); + vsi_nn_kernel_param_add_int32( param, "reverse", (int32_t)reverse ); + n = vsi_nn_kernel_selector( self->graph, "cumsum", inputs, 1, outputs, 1, param ); + if ( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if (param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(CUMSUM, 1, 1) + IO_TYPE(D_U32, D_U32) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + END_IO_TYPE_DECL(CUMSUM) + if (!VALIDATE_OP_IO_TYPES(CUMSUM, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.cumsum.axis = 0; + self->nn_param.cumsum.exclusive = FALSE; + self->nn_param.cumsum.reverse = FALSE; + + return status; +} /* op_init() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. 
*/ + uint32_t i = 0; + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + for (i = 0; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CUMSUM, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index 1929167..e18c4bd 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -80,7 +80,6 @@ static vsi_bool _is_same_quant return TRUE; } /* _is_same_quant */ - static vsi_status op_optimize ( vsi_nn_node_t * self, @@ -237,34 +236,48 @@ static vsi_bool op_check IO_TYPE(D_BOOL8, D_U8|Q_ASYM) IO_TYPE(D_BOOL8, D_I8|Q_ASYM) IO_TYPE(D_BOOL8, D_I8|Q_DFP) + IO_TYPE(D_BOOL8, D_I8|Q_SYM) IO_TYPE(D_BOOL8, D_I16|Q_DFP) - IO_TYPE(D_BOOL8, D_U8) - IO_TYPE(D_BOOL8, D_I8) - IO_TYPE(D_BOOL8, D_I8) - IO_TYPE(D_BOOL8, D_I16) + IO_TYPE(D_BOOL8, D_I16|Q_ASYM) + IO_TYPE(D_BOOL8, D_I16|Q_SYM) + IO_TYPE(D_BOOL8, D_I32) + IO_TYPE(D_BOOL8, D_U16) + IO_TYPE(D_BOOL8, D_U32) + IO_TYPE(D_U8|Q_ASYM, D_BOOL8) + IO_TYPE(D_I8|Q_ASYM, D_BOOL8) + IO_TYPE(D_I8|Q_DFP, D_BOOL8) + IO_TYPE(D_I8|Q_SYM, D_BOOL8) + IO_TYPE(D_I16|Q_DFP, D_BOOL8) + IO_TYPE(D_I16|Q_ASYM, D_BOOL8) + IO_TYPE(D_I16|Q_SYM, D_BOOL8) + IO_TYPE(D_I32, D_BOOL8) + IO_TYPE(D_U16, D_BOOL8) + IO_TYPE(D_U32, D_BOOL8) IO_TYPE(D_BF16, D_BF16) IO_TYPE(D_BF16, D_F16) IO_TYPE(D_BF16, D_F32) IO_TYPE(D_I32, D_I32) IO_TYPE(D_I32, D_I16|Q_DFP) - IO_TYPE(D_I32, D_I16) IO_TYPE(D_I32, D_I8|Q_DFP) - IO_TYPE(D_I32, D_I8) IO_TYPE(D_I32, D_U32) IO_TYPE(D_I32, D_U16) IO_TYPE(D_I32, D_U8|Q_ASYM) - IO_TYPE(D_I32, D_U8) IO_TYPE(D_U32, D_U32) IO_TYPE(D_U32, D_I16|Q_DFP) - IO_TYPE(D_U32, D_I16) IO_TYPE(D_U32, D_I8|Q_DFP) - IO_TYPE(D_U32, D_I8) IO_TYPE(D_U32, D_I32) - IO_TYPE(D_U32, D_U16) IO_TYPE(D_U32, D_U8|Q_ASYM) IO_TYPE(D_U32, D_U8) IO_TYPE(D_BF16, D_I32) IO_TYPE(D_I32, D_BF16) + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I4|Q_SYM) /* HW 9.0.1 */ IO_TYPE(D_I8|Q_DFP, D_BF16) @@ -276,6 +289,25 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_DFP, D_BF16) IO_TYPE(D_I16|Q_DFP, D_F32) IO_TYPE(D_F16, D_F32) + + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_U4|Q_SYM, D_I8|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U4|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U4|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I4|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I4|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(DATACONVERT) if 
(!VALIDATE_OP_IO_TYPES(DATACONVERT, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c index 483a6dc..09c59d8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c @@ -198,190 +198,7 @@ static vsi_bool op_check { vsi_bool ret = FALSE; - BEGIN_IO_TYPE_DECL(DECONVOLUTION, 3, 1) - IO_TYPE(D_F16, D_F16, D_NONE, D_F16) - IO_TYPE(D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_F16, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F32, D_BF16) - IO_TYPE(D_F32, D_F32, D_NONE, D_F32) - - /* HW 9.0 */ - IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) - IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) - - /* HW 9.0.1 */ - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, 
D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) - - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, 
D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_F32) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_F32) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_F32) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_F16, D_F16, D_NONE, D_BF16) - IO_TYPE(D_F16, D_F16, D_NONE, D_F32) - IO_TYPE(D_F16, D_F16, D_F32, D_BF16) - IO_TYPE(D_F16, D_F16, D_F32, D_F32) - - IO_TYPE(D_BF16, D_BF16, D_NONE, D_F16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_F16) - - IO_TYPE(D_F32, D_BF16, D_NONE, D_F16) - IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) - IO_TYPE(D_F32, D_BF16, D_NONE, D_F32) - IO_TYPE(D_F32, D_BF16, D_F32, D_F16) - IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) - IO_TYPE(D_F32, D_BF16, D_F32, D_F32) - - END_IO_TYPE_DECL(DECONVOLUTION) - if (!VALIDATE_OP_IO_TYPES(DECONVOLUTION, self, inputs, self->input.num, outputs, self->output.num)) - { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } - - /* Check fl and scale*/ - ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); + ret = vsi_nn_OpCheck(VSI_NN_OP_CONV2D, self, inputs, outputs); return ret; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c index eb8f75b..0692666 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c @@ -220,7 +220,7 @@ static vsi_bool op_check { vsi_bool ret = FALSE; - ret = vsi_nn_OpCheck(VSI_NN_OP_DECONVOLUTION, self, inputs, outputs); + ret = vsi_nn_OpCheck(VSI_NN_OP_CONV1D, self, inputs, outputs); return ret; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c index c1c4404..6b7cc6f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c @@ -105,20 +105,32 @@ 
static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(DEPTH2SPACE_INTERNAL, 1, 1) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) END_IO_TYPE_DECL(DEPTH2SPACE_INTERNAL) - if(!VALIDATE_OP_IO_TYPES(DEPTH2SPACE_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(DEPTH2SPACE_INTERNAL, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c index f63db97..fa53367 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c @@ -70,32 +70,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(DROPOUT, 1, 1) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - END_IO_TYPE_DECL(DROPOUT) - if (!VALIDATE_OP_IO_TYPES(DROPOUT, self, inputs, self->input.num, outputs, self->output.num)) - { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } + vsi_bool ret = vsi_nn_OpCheck(VSI_NN_OP_LINEAR, self, inputs, outputs); - return TRUE; + return ret; } /* op_check() */ static vsi_bool op_setup @@ -140,4 +117,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c index 496d42e..73ba406 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c @@ -350,30 +350,63 @@ static vsi_bool op_check_pow { /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(POW, 2, 1) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I8|Q_DFP, 
D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_SYM, D_F16) + IO_TYPE(D_F16, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_F16, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I16|Q_SYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_ASYM, D_I16|Q_SYM) END_IO_TYPE_DECL(POW) - if(!VALIDATE_OP_IO_TYPES(POW, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(POW, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -398,30 +431,56 @@ static vsi_bool op_check_add IO_TYPE(D_BF16, D_BF16, D_F32) IO_TYPE(D_F16, D_F16, D_F16) IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_SYM) IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I8|Q_SYM) IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) 
IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_SYM, D_F16) IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I8|Q_SYM, D_F16) IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM, D_I16|Q_SYM) IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM, D_I8|Q_SYM) IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16, D_F16) IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM) IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) @@ -491,8 +550,14 @@ static vsi_bool op_check_add IO_TYPE(D_F32, D_BF16, D_BF16) IO_TYPE(D_F32, D_BF16, F32) + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(ADD) - if(!VALIDATE_OP_IO_TYPES(ADD, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(ADD, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -527,45 +592,65 @@ static vsi_bool op_check_div { /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(DIVIDE, 2, 1) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, 
D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F16) - IO_TYPE(D_F32, D_F16, D_F32) - IO_TYPE(D_F32, D_F16, D_F16) - IO_TYPE(D_F16, D_F32, D_F32) - IO_TYPE(D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_F16, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) - IO_TYPE(D_I16, D_I32, D_I32) - IO_TYPE(D_I32, D_I16, D_I32) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I8|Q_SYM, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I8|Q_SYM, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_SYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16, D_F16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16, D_I32, D_I32) + IO_TYPE(D_I32, D_I16, D_I32) END_IO_TYPE_DECL(DIVIDE) - if(!VALIDATE_OP_IO_TYPES(DIVIDE, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(DIVIDE, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -596,22 +681,40 @@ static vsi_bool op_check_mul IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM, D_I16|Q_SYM) 
IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM, D_I8|Q_SYM) IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16, D_F16) IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I16|Q_SYM) IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F16, D_I8|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) @@ -677,8 +780,14 @@ static vsi_bool op_check_mul IO_TYPE(D_F32, D_BF16, D_BF16) IO_TYPE(D_F32, D_BF16, F32) + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(MULTIPLY) - if(!VALIDATE_OP_IO_TYPES(MULTIPLY, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(MULTIPLY, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c index 9a85fd1..7dc29af 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c @@ -178,6 +178,12 @@ static vsi_bool op_check IO_TYPE(D_I8|Q_DFP, D_F16) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F16) + + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) END_IO_TYPE_DECL(ELTWISE_UNARY) if (!VALIDATE_OP_IO_TYPES(ELTWISE_UNARY, self, inputs, self->input.num, outputs, self->output.num)) { @@ -252,6 +258,9 @@ DEF_ELEMENT_WISE_UNARY_OP( ROUND, round ); DEF_ELEMENT_WISE_UNARY_OP( GELU, gelu ); DEF_ELEMENT_WISE_UNARY_OP( SELU, selu ); DEF_ELEMENT_WISE_UNARY_OP( CELU, celu ); +DEF_ELEMENT_WISE_UNARY_OP( RCP, rcp ); +DEF_ELEMENT_WISE_UNARY_OP( SIGN, sign ); +DEF_ELEMENT_WISE_UNARY_OP( SOFTSIGN, softsign ); #undef DEF_ELEMENT_UNARY_WISE_OP diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_erf.c b/src/tim/vx/internal/src/ops/vsi_nn_op_erf.c index a789f2c..84e36be 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_erf.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_erf.c @@ -22,7 +22,6 
@@ * *****************************************************************************/ - #include #include @@ -93,6 +92,13 @@ static vsi_bool op_check IO_TYPE(D_I8|Q_DFP, D_F16) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F16) + + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(ERF) if (!VALIDATE_OP_IO_TYPES(ERF, self, inputs, self->input.num, outputs, self->output.num)) { @@ -106,7 +112,6 @@ static vsi_bool op_check return TRUE; } /* op_check() */ - __BEGIN_DECLS /* Registrar */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c index df4aa95..6bb4dad 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c @@ -54,25 +54,37 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(EXPAND_BROADCAST, 1, 1) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_I32|Q_DFP, D_I32|Q_DFP) - IO_TYPE(D_I32|Q_ASYM, D_I32|Q_ASYM) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_I32|Q_DFP, D_I32|Q_DFP) + IO_TYPE(D_I32|Q_ASYM, D_I32|Q_ASYM) END_IO_TYPE_DECL(EXPAND_BROADCAST) if (!VALIDATE_OP_IO_TYPES(EXPAND_BROADCAST, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_floor.c b/src/tim/vx/internal/src/ops/vsi_nn_op_floor.c index 1e9d5a7..9285e6e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_floor.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_floor.c @@ -43,18 +43,32 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(FLOOR, 1, 1) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + 
IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) END_IO_TYPE_DECL(FLOOR) if (!VALIDATE_OP_IO_TYPES(FLOOR, self, inputs, self->input.num, outputs, self->output.num)) { @@ -80,7 +94,7 @@ static vsi_status op_compute memset(&p, 0, sizeof(p)); p.mode = VX_NN_DS_SIZE_ROUNDING_FLOOR; self->n = vxTensorRoundingNode(self->graph->g, inputs[0]->t, &p, sizeof(p), outputs[0]->t); - if( !self->n ) + if ( !self->n ) { status = VSI_FAILURE; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c index 812c7df..8026198 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c @@ -229,6 +229,60 @@ static vsi_bool op_check IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_F16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_F32) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_BF16) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_F32) + + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, 
D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) @@ -291,6 +345,33 @@ static vsi_bool op_check IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) IO_TYPE(D_F32, D_BF16, D_F32, D_F32) + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I4|Q_DFP) + + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_I4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I4|Q_SYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM, D_I4|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I4|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I4|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U4|Q_ASYM) + END_IO_TYPE_DECL(FCL) ret = VALIDATE_OP_IO_TYPES(FCL, self, inputs, self->input.num, outputs, self->output.num); if(!ret) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c index 6c1bdc2..b91fec8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c @@ -185,7 +185,8 @@ static vsi_bool op_check /* TP Support */ if (!ret ) { uint32_t valid_dtypes[] = { - D_F16, D_BF16, D_F32, D_I16|Q_DFP, D_I8|Q_DFP, D_I8|Q_ASYM, D_U8|Q_DFP, D_U8|Q_ASYM + D_F16, D_BF16, D_F32, D_I16|Q_DFP, D_I16|Q_SYM, D_I16|Q_ASYM, D_I8|Q_DFP, D_I8|Q_SYM, + D_I8|Q_ASYM, D_U8|Q_DFP, D_U8|Q_ASYM }; uint32_t weight_type = inputs[1]->attr.dtype.vx_type | inputs[1]->attr.dtype.qnt_type << Q_SHIFT; @@ -332,7 +333,6 @@ static vsi_bool op_setup } } - if( NULL == inputs[1]->wb ) { VSILOGE( "Create weight bias 
fail." ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c index 21e0a17..00545d3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c @@ -34,7 +34,6 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" @@ -202,26 +201,40 @@ static vsi_bool _op_check ) { BEGIN_IO_TYPE_DECL(GROUP_NORM, 3, 1) - IO_TYPE(D_F16, D_F32, D_F16, D_F16) - IO_TYPE(D_F16, D_F32, D_F32, D_F16) - IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_F32, D_F16, D_F32) - IO_TYPE(D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_I32, D_F32, D_F16, D_I32) - IO_TYPE(D_I32, D_F32, D_F16, D_F32) + IO_TYPE(D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_SYM) + IO_TYPE(D_I32, D_F32, D_F16, D_I32) + IO_TYPE(D_I32, D_F32, D_F16, D_F32) IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F16) IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F16) IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_I8|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F16) IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F16) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_I16|Q_SYM) END_IO_TYPE_DECL(GROUP_NORM) if (!VALIDATE_OP_IO_TYPES(GROUP_NORM, self, inputs, self->input.num, outputs, self->output.num)) { @@ -263,21 +276,11 @@ static vsi_status _op_deinit ) { vsi_nn_groupnormalize_param *p = &(self->nn_param.groupnorm); - if (p->lcl_data->reshaped_input) - { - vsi_nn_ReleaseTensor(&(p->lcl_data->reshaped_input)); - p->lcl_data->reshaped_input = NULL; - } - if (p->lcl_data->reshaped_output) - { - vsi_nn_ReleaseTensor(&(p->lcl_data->reshaped_output)); - p->lcl_data->reshaped_output = NULL; - } - if (self->nn_param.groupnorm.lcl_data) - { - free(self->nn_param.groupnorm.lcl_data); - self->nn_param.groupnorm.lcl_data = NULL; - } + + vsi_safe_release_tensor(p->lcl_data->reshaped_input); + vsi_safe_release_tensor(p->lcl_data->reshaped_output); + vsi_nn_safe_free(self->nn_param.groupnorm.lcl_data) + vsi_nn_op_common_deinit(self); return VSI_SUCCESS; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c index cdead0c..be1f3f5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c @@ -34,7 +34,6 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" @@ -175,24 +174,42 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(INSTANCE_NORM, 3, 1) - IO_TYPE(D_F16, D_F32, D_F16, D_F16) - IO_TYPE(D_F16, D_F32, D_F32, D_F16) - IO_TYPE(D_F16, D_F16, D_F16, D_F16) - IO_TYPE(D_F32, D_F32, D_F16, D_F32) - IO_TYPE(D_F32, D_F16, D_F16, D_F32) - IO_TYPE(D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_I32, D_F32, D_F16, D_I32) - IO_TYPE(D_I32, D_F32, D_F16, D_F32) - IO_TYPE(D_BF16, D_F32, D_F32, D_BF16) + IO_TYPE(D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F16) + IO_TYPE(D_F32, D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F16, D_F16, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_F32, D_F16, D_I32) + IO_TYPE(D_I32, D_F32, D_F16, D_F32) + IO_TYPE(D_BF16, D_F32, D_F32, D_BF16) IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F16) IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_SYM) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_I16|Q_SYM) END_IO_TYPE_DECL(INSTANCE_NORM) if (!VALIDATE_OP_IO_TYPES(INSTANCE_NORM, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c index fc378ad..74623e2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -34,12 +34,12 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) +#define VSI_NN_SUPPORT_AXIS (0) static vsi_status op_compute ( @@ -52,13 +52,12 @@ static vsi_status op_compute vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; float eps = self->nn_param.layernorm.eps; - - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 && - inputs[2]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 ) +#if VSI_NN_SUPPORT_AXIS + if ( 0 ) { return vsi_nn_internal_compute_node( self ); } +#endif param = vsi_nn_kernel_param_create(); @@ -87,18 +86,18 
@@ static vsi_bool op_setup ) { vsi_bool ret = TRUE; +#if VSI_NN_SUPPORT_AXIS vsi_nn_internal_node_t* curr = NULL; +#endif if ( NULL == self ) { return FALSE; } - +#if VSI_NN_SUPPORT_AXIS vsi_nn_internal_init_node_wksp( self ); - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 && - inputs[2]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 ) + if ( 0 ) { vsi_nn_internal_tensor_t* mean_tensor = NULL; vsi_nn_internal_tensor_t* vari_tensor = NULL; @@ -137,6 +136,7 @@ static vsi_bool op_setup vsi_nn_internal_setup_node( self, curr ); } else +#endif { ret = vsi_nn_op_common_setup(self, inputs, outputs); } @@ -152,18 +152,52 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(LAYER_NORM, 3, 1) - IO_TYPE(D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F16, D_F32, D_F16, D_F16) - IO_TYPE(D_F16, D_F32, D_F32, D_F16) - IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F32, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_BF16, D_F32, D_F32, D_BF16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F32, D_I16|Q_SYM) + IO_TYPE(D_BF16, D_F32, D_F32, D_BF16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F16) IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F16) END_IO_TYPE_DECL(LAYER_NORM) if (!VALIDATE_OP_IO_TYPES(LAYER_NORM, self, inputs, self->input.num, outputs, self->output.num)) { @@ -182,18 +216,9 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - uint32_t i = 0; - for (i = 0; i < _VSI_NN_LAYERNORM_LOCAL_TENSOR_NUM; i++) - { - if (self->nn_param.layernorm.local.local_tensor[i] != NULL) - { - vxReleaseTensor(&(self->nn_param.layernorm.local.local_tensor[i])); - self->nn_param.layernorm.local.local_tensor[i] = NULL; - } - } - 
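All of the LAYER_NORM input/output combinations listed above feed the same computation: the eps value read in op_compute enters the usual normalization y = (x - mean) / sqrt(var + eps) * scale + bias. A minimal float reference, assuming row-major data and per-element scale/bias arrays (the gamma/beta naming and buffer layout are illustrative, not the kernel's actual interface):

    /* Float reference of layer normalization over the innermost axis; the
     * gamma/beta names and layout are assumptions for illustration only. */
    #include <math.h>
    #include <stddef.h>

    static void layer_norm_ref(const float *x, const float *gamma, const float *beta,
                               float *y, size_t rows, size_t cols, float eps)
    {
        size_t r, c;
        for (r = 0; r < rows; r++)
        {
            const float *in = x + r * cols;
            float *out = y + r * cols;
            float mean = 0.0f, var = 0.0f;

            for (c = 0; c < cols; c++) mean += in[c];
            mean /= (float)cols;
            for (c = 0; c < cols; c++) var += (in[c] - mean) * (in[c] - mean);
            var /= (float)cols;

            for (c = 0; c < cols; c++)
            {
                out[c] = (in[c] - mean) / sqrtf(var + eps) * gamma[c] + beta[c];
            }
        }
    }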
+#if VSI_NN_SUPPORT_AXIS vsi_nn_internal_deinit_node_wksp( self ); - +#endif vsi_nn_op_common_deinit(self); return VSI_SUCCESS; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_leaky_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_leaky_relu.c index 5ac26a6..be32c4b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_leaky_relu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_leaky_relu.c @@ -66,33 +66,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(LEAKY_RELU, 1, 1) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - END_IO_TYPE_DECL(LEAKY_RELU) - if (!VALIDATE_OP_IO_TYPES(LEAKY_RELU, self, inputs, self->input.num, outputs, self->output.num)) - { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } + vsi_bool ret = vsi_nn_OpCheck(VSI_NN_OP_RELU, self, inputs, outputs); - return TRUE; + return ret; } /* op_check() */ #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c b/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c index 3e79acc..097075d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c @@ -74,33 +74,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(LINEAR, 1, 1) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - END_IO_TYPE_DECL(LINEAR) - if (!VALIDATE_OP_IO_TYPES(LINEAR, self, inputs, self->input.num, outputs, self->output.num)) - { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } + vsi_bool ret = vsi_nn_OpCheck(VSI_NN_OP_RELU, self, inputs, outputs); - return TRUE; + return ret; } /* op_check() */ __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c index 00030fe..fd12173 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c @@ -140,22 +140,34 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(LOG_SOFTMAX, 1, 1) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_BF16, D_F16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, 
D_I16|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_BF16, D_F16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16) END_IO_TYPE_DECL(LOG_SOFTMAX) - if(!VALIDATE_OP_IO_TYPES(LOG_SOFTMAX, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(LOG_SOFTMAX, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -206,7 +218,6 @@ DEF_OP_REG \ DEF_LOG_SOFTMAX_OP( LOG_SOFTMAX, log_softmax ); - #undef DEF_LOG_SOFTMAX_OP #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c new file mode 100644 index 0000000..57f8cad --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_max_pool3d.c @@ -0,0 +1,300 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_link_list.h" +#include "vsi_nn_internal_node.h" + +typedef struct _max_pool3d_local_data_t { + int32_t placeholder; +} max_pool3d_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_POOL, self, inputs, outputs); + + return ret; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + vsi_nn_max_pool3d_param *p = &(self->nn_param.max_pool3d); + vsi_size_t ksize[_cnt_of_array(p->ksize)] = {0}, i = 0; + vsi_size_t pad[_cnt_of_array(p->pad)] = {0}; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* input_tensor = NULL; + vsi_nn_internal_tensor_t* pool2d_0_tensor = NULL; + vsi_nn_internal_tensor_t* reshape_0_tensor = NULL; + vsi_nn_internal_tensor_t* pool2d_1_tensor = NULL; + vsi_nn_tensor_attr_t attr; + vsi_size_t* reshape_input_size = NULL; + vsi_size_t* reshape_pool_size = NULL; + + for (i = 0; i < _cnt_of_array(p->ksize); i++) + { + ksize[i] = p->ksize[i]; + } + for (i = 0; i < _cnt_of_array(p->pad); i++) + { + pad[i] = p->pad[i]; + } + + vsi_nn_compute_padding_3d( + inputs[0]->attr.size, + ksize, + p->stride, + NULL, + p->pad_type, + pad + ); + + for (i = 0; i < _cnt_of_array(p->ksize); i++) + { + p->ksize[i] = (uint32_t)ksize[i]; + } + + for (i = 0; i < _cnt_of_array(p->pad); i++) + { + p->pad[i] = (uint32_t)pad[i]; + } + + /* Pooling */ + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + p->ksize[0], + &p->pad[0], + p->stride[0], + 0, + p->round_type + ); + + outputs[0]->attr.size[1] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[1], + p->ksize[1], + &p->pad[2], + p->stride[1], + 0, + p->round_type + ); + + outputs[0]->attr.size[2] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[2], + p->ksize[2], + &p->pad[4], + p->stride[2], + 0, + p->round_type + ); + + for (i = 3; i < inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + + vsi_nn_internal_init_node_wksp( self ); + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, TRUE); + input_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + pool2d_0_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + reshape_input_size = vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + reshape_input_size[0] = inputs[0]->attr.size[0]; + reshape_input_size[1] = inputs[0]->attr.size[1]; + reshape_input_size[2] = 1; + for (i = 2; i < inputs[0]->attr.dim_num; i++) + { + reshape_input_size[2] *= inputs[0]->attr.size[i]; + } + reshape_input_size[3] = 1; + curr->node->nn_param.reshape2.size = reshape_input_size; + curr->node->nn_param.reshape2.dim_num = 4; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = input_tensor->t; + vsi_nn_internal_setup_node( self, curr ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_POOL, 0, 0 ); + 
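The vsi_nn_ComputeFilterSize calls earlier in this op_setup follow the usual pooling output-size arithmetic. A minimal sketch, with a single ceil/floor flag standing in for ovxlib's round_type enum (an assumption for illustration):

    /* Pooling output extent: floor((in + pads - ksize) / stride) + 1, plus one
     * extra window when ceil rounding is requested and the division is inexact.
     * Assumes in + pad_front + pad_back >= ksize. */
    #include <stddef.h>

    static size_t pooled_extent(size_t in, size_t ksize, size_t pad_front,
                                size_t pad_back, size_t stride, int ceil_mode)
    {
        size_t span = in + pad_front + pad_back - ksize;
        size_t out = span / stride + 1;

        if (ceil_mode && (span % stride) != 0)
        {
            out += 1;   /* partially covered trailing window */
        }
        return out;
    }

The 3D max pooling itself is assembled from two 2D passes, as the rest of this op_setup shows: the first VSI_NN_OP_POOL node pools over width/height with depth folded into the batch dimension, and, unless the depth kernel is trivial, a second pass pools over depth after another reshape.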
curr->node->nn_param.pool.ksize[0] = p->ksize[0]; + curr->node->nn_param.pool.ksize[1] = p->ksize[1]; + curr->node->nn_param.pool.stride[0] = p->stride[0]; + curr->node->nn_param.pool.stride[1] = p->stride[1]; + curr->node->nn_param.pool.pad[0] = p->pad[0]; + curr->node->nn_param.pool.pad[1] = p->pad[1]; + curr->node->nn_param.pool.pad[2] = p->pad[2]; + curr->node->nn_param.pool.pad[3] = p->pad[3]; + curr->node->nn_param.pool.type = VX_CONVOLUTIONAL_NETWORK_POOLING_MAX; + curr->node->nn_param.pool.round_type = p->round_type; + curr->node->nn_param.pool.pad_type = p->pad_type; + curr->inputs[0] = input_tensor->t; + curr->outputs[0] = pool2d_0_tensor->t; + vsi_nn_internal_setup_node( self, curr ); + + if (p->ksize[2] == 1 && p->stride[2] == 1 && p->pad[4] == 0 && p->pad[5] == 0) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + curr->node->nn_param.reshape2.size = outputs[0]->attr.size; + curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; + curr->inputs[0] = pool2d_0_tensor->t; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node( self, curr ); + } + else + { + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, TRUE); + reshape_0_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + pool2d_1_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + reshape_pool_size = vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + reshape_pool_size[0] = -1; + reshape_pool_size[1] = inputs[0]->attr.size[2]; + reshape_pool_size[2] = 1; + for (i = 3; i < inputs[0]->attr.dim_num; i++) + { + reshape_pool_size[2] *= inputs[0]->attr.size[i]; + } + reshape_pool_size[3] = 1; + curr->node->nn_param.reshape2.size = reshape_pool_size; + curr->node->nn_param.reshape2.dim_num = 4; + curr->inputs[0] = pool2d_0_tensor->t; + curr->outputs[0] = reshape_0_tensor->t; + vsi_nn_internal_setup_node( self, curr ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_POOL, 1, 1 ); + curr->node->nn_param.pool.ksize[0] = 1; + curr->node->nn_param.pool.ksize[1] = p->ksize[2]; + curr->node->nn_param.pool.stride[0] = 1; + curr->node->nn_param.pool.stride[1] = p->stride[2]; + curr->node->nn_param.pool.pad[0] = 0; + curr->node->nn_param.pool.pad[1] = 0; + curr->node->nn_param.pool.pad[2] = p->pad[4]; + curr->node->nn_param.pool.pad[3] = p->pad[5]; + curr->node->nn_param.pool.type = VX_CONVOLUTIONAL_NETWORK_POOLING_MAX; + curr->node->nn_param.pool.round_type = p->round_type; + curr->node->nn_param.pool.pad_type = p->pad_type; + curr->inputs[0] = reshape_0_tensor->t; + curr->outputs[0] = pool2d_1_tensor->t; + vsi_nn_internal_setup_node( self, curr ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + curr->node->nn_param.reshape2.size = outputs[0]->attr.size; + curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; + curr->inputs[0] = pool2d_1_tensor->t; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node( self, curr ); + } + + return ret; +} /* op_setup() */ + + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + vsi_nn_internal_deinit_node_wksp( self ); + status = vsi_nn_op_common_deinit(self); + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ MAX_POOL3D, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ 
op_setup, + /* optimize */ op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c new file mode 100644 index 0000000..3432790 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c @@ -0,0 +1,223 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_constraint_check.h" + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (2) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + uint32_t new_rank = 0; + vsi_nn_kernel_param_t * param = NULL; + int32_t ksize_x = (int32_t)self->nn_param.pool.ksize[0]; + int32_t ksize_y = (int32_t)self->nn_param.pool.ksize[1]; + int32_t stride_x = (int32_t)self->nn_param.pool.stride[0]; + int32_t stride_y = (int32_t)self->nn_param.pool.stride[1]; + int32_t pad_left = (int32_t)self->nn_param.pool.pad[0]; + int32_t pad_right = (int32_t)self->nn_param.pool.pad[1]; + int32_t pad_top = (int32_t)self->nn_param.pool.pad[2]; + int32_t pad_bottom = (int32_t)self->nn_param.pool.pad[3]; + + if ( NULL == self ) + { + return VSI_FAILURE; + } + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_optimize_nchw2xhw_shape(inputs[0]->attr.size, inputs[0]->attr.dim_num, + shapes[0], &new_rank); + vsi_nn_kernel_optimize_nchw2xhw_shape(outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[1], &new_rank); + if (new_rank == 3 && shapes[1][2] == 1) + { + new_rank = 2; + } + + vsi_nn_kernel_param_add_int32( param, "ksize_x", ksize_x ); + vsi_nn_kernel_param_add_int32( param, "ksize_y", ksize_y ); + vsi_nn_kernel_param_add_int32( param, "stride_x", stride_x ); + vsi_nn_kernel_param_add_int32( param, "stride_y", stride_y ); + vsi_nn_kernel_param_add_int32( param, "pad_left", pad_left ); + vsi_nn_kernel_param_add_int32( param, "pad_right", pad_right ); + vsi_nn_kernel_param_add_int32( param, "pad_top", pad_top ); + vsi_nn_kernel_param_add_int32( param, "pad_bottom", pad_bottom ); + + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + outputs[0], shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + outputs[1], shapes[1], new_rank ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "maxpoolwithargmax", + &reshape_tensors[0], _INPUT_NUM, &reshape_tensors[1], _OUTPUT_NUM, param ); + + vsi_safe_release_tensor(reshape_tensors[0]); + vsi_safe_release_tensor(reshape_tensors[1]); + vsi_safe_release_tensor(reshape_tensors[2]); + + if ( self->n ) + { + status = VSI_SUCCESS; + } + + if (param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(MAXPOOLWITHARGMAX, 1, 2) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_F32, D_F32, D_I32) + IO_TYPE(D_F16, D_F16, D_I32) + IO_TYPE(D_BF16, D_BF16, D_I32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32) + END_IO_TYPE_DECL(MAXPOOLWITHARGMAX) + if (!VALIDATE_OP_IO_TYPES(MAXPOOLWITHARGMAX, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. 
*/ + vsi_bool ret = TRUE; + vsi_size_t ksize[_cnt_of_array(self->nn_param.pool.ksize)] = {0}; + vsi_size_t i = 0; + vsi_size_t pad[_cnt_of_array(self->nn_param.pool.pad)] = {0}; + for (i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) + { + ksize[i] = self->nn_param.pool.ksize[i]; + } + for (i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) + { + pad[i] = self->nn_param.pool.pad[i]; + } + + vsi_nn_compute_padding( + inputs[0]->attr.size, + ksize, + self->nn_param.pool.stride, + NULL, + self->nn_param.pool.pad_type, + pad + ); + for (i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) + { + self->nn_param.pool.ksize[i] = (uint32_t)ksize[i]; + } + for (i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) + { + self->nn_param.pool.pad[i] = (uint32_t)pad[i]; + } + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + ret = vsi_nn_OpSetup( VSI_NN_OP_POOL, self, inputs, outputs ); + + outputs[1]->attr.dim_num = outputs[0]->attr.dim_num; + memcpy( outputs[1]->attr.size, outputs[0]->attr.size, + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); + } + + return ret; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ MAXPOOLWITHARGMAX, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_mod.c b/src/tim/vx/internal/src/ops/vsi_nn_op_mod.c new file mode 100644 index 0000000..29310ad --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_mod.c @@ -0,0 +1,237 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _mod_local_data_t { + int32_t placeholder; +} mod_local_data_t; + +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t new_rank = 0; + vsi_bool ret; + vsi_nn_kernel_param_t * param = NULL; + int32_t isfmod = (int32_t)self->nn_param.mod.fmod; + + if (NULL == self) + { + return VSI_FAILURE; + } + + param = vsi_nn_kernel_param_create(); + + ret = vsi_nn_kernel_optimize_eltwise_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + vsi_nn_kernel_param_add_int32( param, "isfmod", isfmod ); + + if (ret) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + inputs[1], shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + outputs[0], shapes[2], new_rank ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "mod", + &reshape_tensors[0], _INPUT_NUM, + &reshape_tensors[2], _OUTPUT_NUM, param ); + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + vsi_nn_ReleaseTensor( &reshape_tensors[2] ); + } + + if (self->n) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(MOD, 2, 1) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16, D_I32, D_I32) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I32, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I32, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_F16, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_ASYM, 
D_I32, D_I8|Q_ASYM) + IO_TYPE(D_I32, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I32, D_I32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I32, D_I8|Q_SYM) + IO_TYPE(D_I32, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I32, D_I32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_I32, D_I16|Q_ASYM) + IO_TYPE(D_I32, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I32, D_I32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I32, D_I16|Q_SYM) + IO_TYPE(D_I32, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I32, D_I32, D_I16|Q_SYM) + END_IO_TYPE_DECL(MOD) + if (!VALIDATE_OP_IO_TYPES(MOD, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i, out_rank, in1_rank, in2_rank; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_bool ret = TRUE; + + in1_rank = inputs[0]->attr.dim_num; + in2_rank = inputs[1]->attr.dim_num; + out_rank = vsi_nn_max( in1_rank, in2_rank ); + + for(i = 0; i < out_rank; i++) + { + vsi_size_t sz0, sz1; + sz0 = i < in1_rank ? inputs[0]->attr.size[i] : 1; + sz1 = i < in2_rank ? inputs[1]->attr.size[i] : 1; + shape[i] = vsi_nn_max( sz0, sz1 ); + } + + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) + { + outputs[0]->attr.dim_num = out_rank; + memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) ); + } + else + { + vsi_size_t total_size_got; + vsi_size_t total_size_expected; + total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); + total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, + outputs[0]->attr.dim_num ); + if (total_size_expected != total_size_got) + { + VSILOGW("Output size mismatch, expect %"VSI_SIZE_T_SPECIFIER", but got %"VSI_SIZE_T_SPECIFIER"", + total_size_expected, total_size_got); + ret = FALSE; + } + } + + return ret; +} /* op_setup() */ + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ MOD, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c b/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c index 28602c7..eb15ccc 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_nbg.c @@ -53,20 +53,63 @@ static const char *_get_vx_nbg_type } static void _set_io_index - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) +( + vsi_nn_node_t* self, + vsi_nn_tensor_t** inputs, + vsi_nn_tensor_t** outputs +) { - uint32_t idx,i; + uint32_t idx, i, j; idx = 0; - for(i = 0; i < self->input.num; i++) + for (i = 0; i < self->input.num; i++) { + uint32_t scalar_index=0; + vx_parameter param = 0; + vx_enum type = 0; + vxSetParameterByIndex(self->n, idx++, (vx_reference)inputs[i]->t); + scalar_index = idx; + param = vxGetParameterByIndex(self->n, scalar_index); + vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + if 
(param != NULL) + { + vxReleaseParameter(¶m); + param = NULL; + + } + if (type != VX_TYPE_SCALAR) + { + continue; + } + else + { + + /* 4 crop scalar parameters input */ + for (j = scalar_index; j < scalar_index + 4; j++) + { + vx_enum data_type = 0; + vx_reference ref = 0; + vsi_status status; + param = vxGetParameterByIndex(self->n, j); + vxQueryParameter(param, VX_PARAMETER_REF, &ref, sizeof(vx_reference)); + status = vxQueryScalar((vx_scalar)ref, VX_SCALAR_TYPE, &data_type, sizeof(vx_enum)); + if (status == VX_ERROR_INVALID_REFERENCE) + { + vx_scalar scalar = vxCreateScalar(self->graph->ctx->c, VX_TYPE_INT32, 0); + ref = (vx_reference)scalar; + vxSetParameterByIndex(self->n, idx++, ref); + vxReleaseReference(&ref); + } + if (param != NULL) + { + vxReleaseParameter(¶m); + param = NULL; + } + } + } } - for(i = 0; i < self->output.num; i++) + for (i = 0; i < self->output.num; i++) { vxSetParameterByIndex(self->n, idx++, (vx_reference)outputs[i]->t); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c index e6e5d72..c1d35eb 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c @@ -48,13 +48,14 @@ vsi_status vsi_nn_InitPadParameter uint8_t i; vsi_status status = VSI_FAILURE; - if(NULL == node || NULL == param) + memset(param, 0, sizeof(vx_nn_pad_params_t)); + + if (NULL == node) { VSILOGE("Set param fail\n"); return VSI_FAILURE; } - memset(param, 0, sizeof(vx_nn_pad_params_t)); pad_const_val = node->nn_param.pad.const_val; param->pad_mode = node->nn_param.pad.mode; param->pad_const = vxCreateScalar( node->graph->ctx->c, VX_TYPE_INT32, &pad_const_val ); @@ -139,10 +140,10 @@ static vsi_status op_compute vsi_nn_tensor_t *convert_tensor = NULL; status = VSI_FAILURE; - if(VSI_SUCCESS != vsi_nn_InitPadParameter(self, &p)) + if (VSI_SUCCESS != vsi_nn_InitPadParameter(self, &p)) { VSILOGE("Set Pad Layer Parameter fail\n"); - return VSI_FAILURE; + goto final; } if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) @@ -174,14 +175,15 @@ static vsi_status op_compute sizeof(p) ); - vsi_nn_DeinitPadParameter(&p); - vsi_safe_release_tensor(convert_tensor); - - if( NULL != self->n ) + if ( NULL != self->n ) { status = VSI_SUCCESS; } +final: + vsi_nn_DeinitPadParameter(&p); + vsi_safe_release_tensor(convert_tensor); + return status; } /* op_compute() */ @@ -193,14 +195,26 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(PAD, 1, 1) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I32, D_I32) + + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(PAD) if (!VALIDATE_OP_IO_TYPES(PAD, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c index d0b89aa..bd01a72 100644 --- 
a/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include @@ -71,6 +70,48 @@ static int32_t _get_vx_pad_mode(vx_enum mode) return pad_mode; } +static int32_t _check_mirror_pad_size + ( + vx_enum mode, + const uint32_t * front_size, + const uint32_t * back_size, + uint32_t pad_dim, + vsi_size_t *input_size, + uint32_t tensor_dim + ) +{ + uint32_t dim = pad_dim > tensor_dim ? tensor_dim : pad_dim; + uint32_t i = 0; + + for (i = 0; i < dim; i++) + { + uint32_t front = front_size[i]; + uint32_t end = back_size[i]; + uint32_t sz = (uint32_t)input_size[i]; + + if (mode == VSI_NN_PAD_MODE_SYMMETRIC) + { + if (front > sz || end > sz) + { + VSILOGE("MIRROR SYMMETRIC PAD:each padding value must be less than \ + or equal to the corresponding dimension"); + return FALSE; + } + } + else if (mode == VSI_NN_PAD_MODE_REFLECT) + { + if (front >= sz || end >= sz) + { + VSILOGE("MIRROR REFLECT PAD:each padding value must be less than \ + the corresponding dimension"); + return FALSE; + } + } + } + + return TRUE; +} + static vsi_status op_compute ( vsi_nn_node_t * self, @@ -110,6 +151,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + vsi_bool ret = FALSE; + vsi_nn_pad2_param *p = &self->nn_param.pad2; + BEGIN_IO_TYPE_DECL(PAD2, 1, 1) IO_TYPE(D_F32, D_F32) IO_TYPE(D_F32, D_BF16) @@ -118,7 +162,19 @@ static vsi_bool op_check IO_TYPE(D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I32, D_I32) + + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(PAD2) if (!VALIDATE_OP_IO_TYPES(PAD2, self, inputs, self->input.num, outputs, self->output.num)) { @@ -136,7 +192,10 @@ static vsi_bool op_check return FALSE; } - return TRUE; + ret = _check_mirror_pad_size(p->mode, p->front_size, p->back_size, p->dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num); + + return ret; } /* op_check() */ static vsi_bool op_setup diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c index f1386c7..399d0c6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c @@ -105,7 +105,7 @@ static vsi_bool _is_same_quant dtype = &inputs[0]->attr.dtype; _dtype = &outputs[0]->attr.dtype; - if(vsi_nn_DtypeCompare(dtype, _dtype) == FALSE) + if (vsi_nn_DtypeCompare(dtype, _dtype) == FALSE) { return FALSE; } @@ -136,7 +136,7 @@ static vsi_status op_compute self->nn_param.permute.dim_num ); - if( NULL != self->n ) + if ( NULL != self->n ) { status = VSI_SUCCESS; } @@ -153,23 +153,27 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(PERMUTE, 1, 1) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_I16, D_I16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F32) - IO_TYPE(D_I8|Q_SYM_PC, D_I8|Q_SYM_PC) - IO_TYPE(D_BOOL8, D_BOOL8) - IO_TYPE(D_BOOL8, D_I8|Q_DFP) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_I32, D_I32) + 
IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_I16, D_I16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_SYM_PC, D_I8|Q_SYM_PC) + IO_TYPE(D_BOOL8, D_BOOL8) + IO_TYPE(D_BOOL8, D_I8|Q_DFP) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_I32, D_I32) END_IO_TYPE_DECL(PERMUTE) if (!VALIDATE_OP_IO_TYPES(PERMUTE, self, inputs, self->input.num, outputs, self->output.num)) { @@ -202,13 +206,13 @@ static vsi_bool op_setup } ret = TRUE; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; - for( i = 0; i < self->nn_param.permute.dim_num; i ++ ) + for ( i = 0; i < self->nn_param.permute.dim_num; i ++ ) { axis = self->nn_param.permute.perm[i]; - if( axis >= inputs[0]->attr.dim_num ) + if ( axis >= inputs[0]->attr.dim_num ) { VSILOGE( "Error permute axis '%u', the dim is '%u' ", axis, inputs[0]->attr.dim_num ); @@ -231,8 +235,6 @@ static vsi_status op_optimize ) { vsi_status status; - vsi_size_t shape[VSI_NN_MAX_DIM_NUM]; - uint32_t i = 0; status = VSI_SUCCESS; @@ -245,18 +247,13 @@ static vsi_status op_optimize VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); - for (i = 0; i < self->nn_param.permute.dim_num; i++) + if ( direction == VSI_NN_OPTIMIZE_BACKWARD ) { - shape[i] = inputs[0]->attr.size[self->nn_param.permute.perm[i]]; - } - - if( direction == VSI_NN_OPTIMIZE_BACKWARD ) - { - if(NULL == inputs[0]->t && NULL != outputs[0]->t) + if (NULL == inputs[0]->t && NULL != outputs[0]->t) { inputs[0]->t = vsi_nn_safe_reshape_tensor( outputs[0]->t, (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num, sizeof(inputs[0]->attr.size[0]) ); - if( inputs[0]->t == NULL ) + if ( inputs[0]->t == NULL ) { status = VSI_FAILURE; } @@ -265,12 +262,17 @@ static vsi_status op_optimize } else { - if(NULL == outputs[0]->t) + if (NULL == outputs[0]->t) { - vsi_bool ret; - ret = vsi_nn_ReshapeTensor( self->graph, inputs[0], outputs[0], - shape, (vsi_size_t)self->nn_param.permute.dim_num ); - if( ret == FALSE ) + if ( NULL == inputs[0]->t ) + { + vsi_nn_TensorReinit( self->graph, inputs[0] ); + } + + outputs[0]->t = vsi_nn_safe_reshape_tensor( inputs[0]->t, + (void*)outputs[0]->attr.size, (vsi_size_t)outputs[0]->attr.dim_num, + sizeof(outputs[0]->attr.size[0]) ); + if ( outputs[0]->t == NULL ) { status = VSI_FAILURE; } @@ -278,8 +280,6 @@ static vsi_status op_optimize } } - //vsi_nn_ReshapeTensor(self->graph, inputs[0], outputs[0], shape, self->nn_param.permute.dim_num); - return status; } /* op_optimize() */ @@ -302,4 +302,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c index 67e2113..eadb94a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c @@ -78,7 +78,7 @@ static vsi_status op_compute status = VSI_FAILURE; memset( &params, 0, sizeof( params ) ); - if(_is_pool1d(self, inputs)) + if (_is_pool1d(self, inputs)) { // pool1d tmp_inputs[0] = local->reshaped_input; @@ -120,7 +120,7 @@ static vsi_status op_compute tmp_outputs[0]->t ); - if( NULL != self->n ) + if ( 
NULL != self->n ) { status = VSI_SUCCESS; } @@ -170,11 +170,11 @@ static vsi_status op_optimize shape[3] = outputs[0]->attr.size[2]; dim = 4; local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim); - if(local->reshaped_output && local->reshaped_output->t) + if (local->reshaped_output && local->reshaped_output->t) { memset(tensor_name, 0, sizeof(tensor_name)); snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid); - if(vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE) + if (vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE) { VSILOGW("Set uid %u pool1d reshaped output name fail", self->uid); return VSI_FAILURE; @@ -185,7 +185,6 @@ static vsi_status op_optimize return VSI_SUCCESS; } /* op_optimize() */ - static vsi_bool op_check ( vsi_nn_node_t * self, @@ -196,32 +195,51 @@ static vsi_bool op_check /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(POOL, 1, 1) /* IO_TYPE(INPUT, OUTPUT) */ - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I16|Q_DFP) - - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_BF16, D_F32) - - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_F16) - - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16) + /* HW 9.0 */ + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_F32) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_F16, D_BF16) END_IO_TYPE_DECL(POOL) - if(!VALIDATE_OP_IO_TYPES(POOL, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(POOL, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -260,21 +278,11 @@ static vsi_status op_deinit ) { vsi_nn_pool_param *p = &(self->nn_param.pool); - if(p->local->reshaped_input) - { - vsi_nn_ReleaseTensor(&(p->local->reshaped_input)); - p->local->reshaped_input = NULL; - } - if(p->local->reshaped_output) - { - vsi_nn_ReleaseTensor(&(p->local->reshaped_output)); - p->local->reshaped_output = 
NULL; - } - if(self->nn_param.pool.local) - { - free(self->nn_param.pool.local); - self->nn_param.pool.local = NULL; - } + + vsi_safe_release_tensor(p->local->reshaped_input); + vsi_safe_release_tensor(p->local->reshaped_output); + vsi_nn_safe_free(self->nn_param.pool.local); + vsi_nn_op_common_deinit(self); return VSI_SUCCESS; @@ -288,20 +296,20 @@ static vsi_bool op_setup ) { vsi_bool ret; - vsi_size_t ksize[_cnt_of_array(self->nn_param.pool.ksize)], i; + vsi_size_t ksize[_cnt_of_array(self->nn_param.pool.ksize)] = {0}, i = 0; vsi_size_t pad[_cnt_of_array(self->nn_param.pool.pad)] = {0}; ret = TRUE; - for(i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) + for (i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) { ksize[i] = self->nn_param.pool.ksize[i]; } - for(i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) + for (i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) { pad[i] = self->nn_param.pool.pad[i]; } - if(_is_pool1d(self, inputs)) + if (_is_pool1d(self, inputs)) { vsi_nn_compute_padding_conv1d( inputs[0]->attr.size, @@ -311,11 +319,11 @@ static vsi_bool op_setup self->nn_param.pool.pad_type, pad ); - for(i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) + for (i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) { self->nn_param.pool.ksize[i] = (uint32_t)ksize[i]; } - for(i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) + for (i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) { self->nn_param.pool.pad[i] = (uint32_t)pad[i]; } @@ -344,11 +352,11 @@ static vsi_bool op_setup self->nn_param.pool.pad_type, pad ); - for(i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) + for (i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) { self->nn_param.pool.ksize[i] = (uint32_t)ksize[i]; } - for(i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) + for (i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) { self->nn_param.pool.pad[i] = (uint32_t)pad[i]; } @@ -374,17 +382,13 @@ static vsi_bool op_setup self->nn_param.pool.round_type ); - outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; - outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + for (i = 2; i < inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } } outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; - if( NULL != outputs[1] ) - { - outputs[1]->attr.dim_num = outputs[0]->attr.dim_num; - memcpy( outputs[1]->attr.size, outputs[0]->attr.size, - VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); - } return ret; } /* op_setup() */ @@ -408,4 +412,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c index 88edb90..cfdf7c2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c @@ -51,7 +51,7 @@ static vsi_bool vsi_nn_poolwithargmax_optimize_shape ) { vsi_bool enable_image_2d = FALSE; - int32_t hwLitimLen = 65536; + int32_t hwLitimLen = GPU_TENSOR_MAX_WIDTH; if ((2 == self->nn_param.pool.ksize[1]) && (2 == self->nn_param.pool.stride[1]) @@ -166,7 +166,6 @@ static vsi_status op_compute if( ret ) { - reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0], shapes[0], new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, @@ -189,7 +188,6 @@ static vsi_status op_compute vsi_nn_kernel_param_release( ¶m ); return status; - } /* op_compute() */ static vsi_bool op_check @@ -233,7 +231,6 @@ static vsi_bool op_check } return TRUE; - } 
/* op_check() */ static vsi_bool op_setup @@ -276,6 +273,10 @@ static vsi_bool op_setup if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { ret = vsi_nn_OpSetup( VSI_NN_OP_POOL, self, inputs, outputs ); + + outputs[1]->attr.dim_num = outputs[0]->attr.dim_num; + memcpy( outputs[1]->attr.size, outputs[0]->attr.size, + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); } return ret; @@ -310,4 +311,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c index eb74aff..f913afd 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -295,9 +295,7 @@ static vsi_bool op_setup { uint32_t i = 0; uint32_t axis = 2; - uint32_t group = 3; vsi_bool is_input_sep = p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR ? FALSE : TRUE; - vsi_nn_tensor_t ** input_tensor_group = &p->local->local_tensor[0]; vsi_nn_internal_tensor_t * output_tensor_group[3] = {NULL}; vsi_nn_internal_tensor_t* tmp_outputs[3] = { NULL }; vsi_nn_tensor_attr_t attr; @@ -305,17 +303,6 @@ static vsi_bool op_setup vsi_size_t size_32bit[VSI_NN_MAX_DIM_NUM] = {0}; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - - if (!is_input_sep) - { - ret = vsi_nn_CreateTensorGroup(self->graph, inputs[0], axis, - input_tensor_group, group); - if (ret == FALSE) - { - goto final; - } - } - memcpy(&attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t)); for(i = 0; i < p->output_attr.dim_num; i++) { @@ -361,9 +348,9 @@ static vsi_bool op_setup } else { - curr->inputs[0] = input_tensor_group[0]; - curr->inputs[1] = input_tensor_group[1]; - curr->inputs[2] = input_tensor_group[2]; + curr->inputs[0] = inputs[0]; + curr->inputs[1] = NULL; + curr->inputs[2] = NULL; } curr->outputs[0] = output_tensor_group[0]->t; curr->outputs[1] = output_tensor_group[1]->t; @@ -512,8 +499,6 @@ static vsi_bool op_setup } } -final: - return ret; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c index b7f4f1d..6d19e4a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c @@ -34,7 +34,6 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c index e0123fa..13a636d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb888_planar.c @@ -72,6 +72,7 @@ static vsi_status op_compute vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_rgb888_planar.b_mean ); vsi_nn_kernel_param_add_float32( param, "scale", self->nn_param.pre_process_rgb888_planar.scale ); vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_rgb888_planar.local->enable_copy ); + n = vsi_nn_kernel_selector( self->graph, "pre_process_rgb888_planar", inputs, 3, outputs, 3, param ); if ( n != NULL ) { @@ -94,18 +95,41 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR, 3, 3) - IO_TYPE(D_U8, D_U8, D_U8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8, D_U8, D_U8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - 
IO_TYPE(D_U8, D_U8, D_U8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8, D_U8, D_U8, D_F16, D_F16, D_F16) - END_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR) - if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_RGB888_PLANAR, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; + if (inputs[1] == NULL) + { + BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR, 1, 3) + IO_TYPE(D_U8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8, D_F16, D_F16, D_F16) + END_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR) + + if (!VALIDATE_OP_IO_TYPES(PRE_PROCESS_RGB888_PLANAR, self, inputs, 1, + outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + } + else + { + BEGIN_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR, 3, 3) + IO_TYPE(D_U8, D_U8, D_U8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_U8, D_U8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8, D_U8, D_U8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8, D_U8, D_U8, D_F16, D_F16, D_F16) + END_IO_TYPE_DECL(PRE_PROCESS_RGB888_PLANAR) + + if (!VALIDATE_OP_IO_TYPES(PRE_PROCESS_RGB888_PLANAR, self, inputs, self->input.num, + outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } } return TRUE; @@ -192,11 +216,7 @@ static vsi_status op_deinit { vsi_status status = VSI_SUCCESS; - if (self->nn_param.pre_process_rgb888_planar.local != NULL) - { - free(self->nn_param.pre_process_rgb888_planar.local); - self->nn_param.pre_process_rgb888_planar.local = NULL; - } + vsi_nn_safe_free(self->nn_param.pre_process_rgb888_planar.local); vsi_nn_op_common_deinit(self); return status; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index 3642b47..5a37151 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -80,7 +80,7 @@ static vsi_bool caculate_reshape_size(uint32_t* dim_value, vsi_size_t* re_sizes, vsi_size_t* re_sizes2, vx_int32 *resolved_dim, vx_int32 resolved_dim_count) { -#define VSI_NN_MAX_IMAGE_WIDTH (65536) +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH vsi_bool enable_reshape = TRUE; vsi_size_t size_count = 1; uint32_t i = 0; @@ -225,6 +225,21 @@ static vsi_status op_compute vsi_nn_tensor_t *mean_tmp_tensor = NULL; vsi_nn_tensor_t *reshaped_input1 = self->nn_param.reduce.local2->reshaped_input1; vsi_nn_tensor_t *reshaped_output1 = self->nn_param.reduce.local2->reshaped_output1; + char tensor_name[128]; + + memset(tensor_name, 0, sizeof(tensor_name)); + snprintf(tensor_name, + sizeof(tensor_name), + "uid_%u_reshape_out_0", + self->uid); + if (reshaped_output1 && vxSetReferenceName( + (vx_reference)reshaped_output1->t, tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u reduce reshaped output name fail", + self->uid); + return VSI_FAILURE; + } + resolved_dim_count = self->nn_param.reduce.local2->axes_num; diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c index ebfa574..fffe060 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c @@ -69,7 +69,7 @@ static vsi_status _comparisons_op_compute inputs[1]->attr.size, inputs[1]->attr.dim_num, outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes[0], shapes[1], shapes[2], &new_rank ); - if( ret ) + if ( ret ) { // Add params reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, @@ -117,7 +117,7 @@ static vsi_status _comparisons_op_compute vsi_nn_kernel_param_release( &param ); } - if( self->n ) + if ( self->n ) { status = VSI_SUCCESS; } @@ -133,37 +133,61 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(RELATIONAL_OPS, 2, 1) - IO_TYPE(D_F16, D_F16, D_BOOL8) - IO_TYPE(D_F16, D_I16|Q_DFP, D_BOOL8) - IO_TYPE(D_F16, D_I8|Q_DFP, D_BOOL8) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_BOOL8) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_BOOL8) - IO_TYPE(D_I16|Q_DFP, D_F16, D_BOOL8) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_BOOL8) - IO_TYPE(D_I8|Q_DFP, D_F16, D_BOOL8) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_BOOL8) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_BOOL8) - IO_TYPE(D_BF16, D_BF16, D_BOOL8) - IO_TYPE(D_BOOL8, D_BOOL8, D_BOOL8) - IO_TYPE(D_F32, D_F32, D_BOOL8) - IO_TYPE(D_I32, D_I32, D_BOOL8) + IO_TYPE(D_F16, D_F16, D_BOOL8) + IO_TYPE(D_F16, D_I16|Q_DFP, D_BOOL8) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_BOOL8) + IO_TYPE(D_F16, D_I16|Q_SYM, D_BOOL8) + IO_TYPE(D_F16, D_I8|Q_DFP, D_BOOL8) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_BOOL8) + IO_TYPE(D_F16, D_I8|Q_SYM, D_BOOL8) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_BOOL8) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_BOOL8) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_BOOL8) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_BOOL8) + IO_TYPE(D_I16|Q_DFP, D_F16, D_BOOL8) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_BOOL8) + IO_TYPE(D_I16|Q_SYM, D_F16, D_BOOL8) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_BOOL8) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_BOOL8) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_BOOL8) + IO_TYPE(D_I8|Q_DFP, D_F16, D_BOOL8) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_BOOL8) + IO_TYPE(D_I8|Q_SYM, D_F16, D_BOOL8) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_BOOL8) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_BOOL8) + IO_TYPE(D_BF16, D_BF16, D_BOOL8) + IO_TYPE(D_BOOL8, D_BOOL8, D_BOOL8) + IO_TYPE(D_F32, D_F32, D_BOOL8) + IO_TYPE(D_I32, D_I32, D_BOOL8) - IO_TYPE(D_F16, D_F16, D_I8) - IO_TYPE(D_F16, D_I16|Q_DFP, D_I8) - IO_TYPE(D_F16, D_I8|Q_DFP, D_I8) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_I8) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I8) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I8) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_I8) - IO_TYPE(D_BF16, D_BF16, D_I8) - IO_TYPE(D_BOOL8, D_BOOL8, D_I8) - IO_TYPE(D_F32, D_F32, D_I8) - IO_TYPE(D_I32, D_I32, D_I8) + IO_TYPE(D_F16, D_F16, D_I8) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I8) + IO_TYPE(D_F16, D_I16|Q_ASYM, D_I8) + IO_TYPE(D_F16, D_I16|Q_SYM, D_I8) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8) + IO_TYPE(D_F16, D_I8|Q_ASYM, D_I8) + IO_TYPE(D_F16, D_I8|Q_SYM, D_I8) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_I8) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I8) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I8) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I8) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I8) + IO_TYPE(D_I16|Q_ASYM, D_F16, D_I8) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I8) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I8) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8) + 
IO_TYPE(D_I8|Q_ASYM, D_F16, D_I8) + IO_TYPE(D_I8|Q_SYM, D_F16, D_I8) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_I8) + IO_TYPE(D_BF16, D_BF16, D_I8) + IO_TYPE(D_BOOL8, D_BOOL8, D_I8) + IO_TYPE(D_F32, D_F32, D_I8) + IO_TYPE(D_I32, D_I32, D_I8) END_IO_TYPE_DECL(RELATIONAL_OPS) - if(!VALIDATE_OP_IO_TYPES(RELATIONAL_OPS, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(RELATIONAL_OPS, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -174,7 +198,6 @@ static vsi_bool op_check return TRUE; } /* op_check() */ - static vsi_bool op_setup ( vsi_nn_node_t * self, @@ -190,14 +213,14 @@ static vsi_bool op_setup in2_rank = inputs[1]->attr.dim_num; out_rank = vsi_nn_max( in1_rank, in2_rank ); - for(i = 0; i < out_rank; i++) + for (i = 0; i < out_rank; i++) { vsi_size_t sz0, sz1; sz0 = i < in1_rank ? inputs[0]->attr.size[i] : 1; sz1 = i < in2_rank ? inputs[1]->attr.size[i] : 1; shape[i] = vsi_nn_max( sz0, sz1 ); } - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = (uint32_t)out_rank; memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) ); @@ -209,7 +232,7 @@ static vsi_bool op_setup total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, outputs[0]->attr.dim_num ); - if( total_size_expected != total_size_got ) + if ( total_size_expected != total_size_got ) { VSILOGW("Output size mismatch, expect %"VSI_SIZE_T_SPECIFIER", but got %"VSI_SIZE_T_SPECIFIER"", total_size_expected, total_size_got); @@ -238,7 +261,6 @@ DEF_OP_REG(name, NULL, op_compute_##kernel_name, vsi_nn_op_common_deinit, op_che DEF_COMPARISONS_OP( RELATIONAL_OPS, relational_ops ); - #undef DEF_COMPARISONS_OP #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c index a10cbe6..295b6ee 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c @@ -178,15 +178,19 @@ static vsi_bool op_check vsi_nn_repeat_param * p = NULL; BEGIN_IO_TYPE_DECL(REPEAT, 2, 1) - IO_TYPE(D_F16, D_I32, D_F16) - IO_TYPE(D_F32, D_I32, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) - IO_TYPE(D_I8, D_I32, D_I8) - IO_TYPE(D_U8, D_I32, D_U8) - IO_TYPE(D_I16, D_I32, D_I16) - IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_F32, D_I32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I8, D_I32, D_I8) + IO_TYPE(D_U8, D_I32, D_U8) + IO_TYPE(D_I16, D_I32, D_I16) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I32, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I32, D_I16|Q_SYM) END_IO_TYPE_DECL(REPEAT) if (!VALIDATE_OP_IO_TYPES(REPEAT, self, inputs, self->input.num, outputs, self->output.num)) { @@ -337,4 +341,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c index b16ba26..6ea0fc0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c @@ -54,8 +54,29 @@ static vsi_status op_compute self->nn_param.reshape.local.initialized == FALSE) { vsi_status status = VSI_SUCCESS; - vsi_nn_tensor_t *tmp_tensor = NULL; +#ifdef VX_REMOVE_RESHAPE_SUPPORT + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t *dims_tensor = NULL; + vx_nn_reshape_params_t reshape_param; + memset(&attr, 0, sizeof(attr)); + attr.size[0] = self->nn_param.reshape.dim_num; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + dims_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)self->nn_param.reshape.size, + &attr); + + reshape_param.dims = REQUIRED_IO(dims_tensor); + + self->n = vxTensorReshapeNode(self->graph->g, + inputs[0]->t, &reshape_param, sizeof(reshape_param), outputs[0]->t); + vsi_safe_release_tensor(dims_tensor); +#else + vsi_nn_tensor_t *tmp_tensor = NULL; tmp_tensor = vsi_nn_reshape_tensor( self->graph, outputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num ); @@ -69,6 +90,7 @@ static vsi_status op_compute VSILOGD("Create a copy node for reshape"); vsi_safe_release_tensor(tmp_tensor); +#endif return status; } @@ -122,7 +144,9 @@ static vsi_status op_optimize vsi_status status; status = VSI_SUCCESS; - +#ifdef VX_REMOVE_RESHAPE_SUPPORT + self->nn_param.reshape.local.initialized = FALSE; +#else if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) { return status; @@ -162,7 +186,7 @@ static vsi_status op_optimize self->nn_param.reshape.local.initialized = TRUE; } } - +#endif return status; } /* op_optimize() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c index 6a84273..9deb02e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include #include @@ -53,15 +52,46 @@ static vsi_status op_compute if (inputs[0]->t != NULL && outputs[0]->t != NULL && self->nn_param.reshape2.local->initialized == FALSE) { +#ifdef VX_REMOVE_RESHAPE_SUPPORT + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t *dims_tensor = NULL; + vx_nn_reshape_params_t reshape_param; + int32_t dims_data[VSI_NN_MAX_DIM_NUM] = {1}; + uint32_t i = 0; + + for (i = 0; i < self->nn_param.reshape2.dim_num; i++) + { + dims_data[i] = (int32_t)self->nn_param.reshape2.size[i]; + } + + memset(&attr, 0, sizeof(attr)); + attr.size[0] = self->nn_param.reshape2.dim_num; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + dims_tensor = vsi_nn_CreateTensorFromData( + self->graph, + (uint8_t *)dims_data, + &attr); + + reshape_param.dims = REQUIRED_IO(dims_tensor); + + self->n = vxTensorReshapeNode(self->graph->g, + inputs[0]->t, &reshape_param, sizeof(reshape_param), outputs[0]->t); + vsi_safe_release_tensor(dims_tensor); +#else self->n = vxTensorCopyNode(self->graph->g, inputs[0]->t, outputs[0]->t); - if(NULL == self->n) +#endif + if (NULL == self->n) { VSILOGE( "Create vxTensorCopyNode fail." 
); return VSI_FAILURE; } VSILOGD("Create a copy node for reshape"); } + return VSI_SUCCESS; } /* op_compute() */ @@ -140,6 +170,9 @@ static vsi_status op_optimize vsi_status status; status = VSI_SUCCESS; +#ifdef VX_REMOVE_RESHAPE_SUPPORT + self->nn_param.reshape2.local->initialized = FALSE; +#else if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) { return status; @@ -178,7 +211,7 @@ static vsi_status op_optimize self->nn_param.reshape2.local->initialized = TRUE; } } - +#endif return status; } /* op_optimize() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c index ad39a8b..fd544a8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c @@ -41,15 +41,11 @@ #include "utils/vsi_nn_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" +#include "kernel/vsi_nn_kernel.h" -#define _ARG_NUM (1) #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - static vsi_bool _is_same_shape ( @@ -87,24 +83,39 @@ static vsi_status op_compute } else { - vx_nn_scale_params_t para; + char kernel_name[128]; + vsi_nn_kernel_param_t * param = NULL; + int32_t align_corners = self->nn_param.resize.align_corners; + int32_t half_pixel_centers = self->nn_param.resize.half_pixel_centers; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "align_corners", align_corners ); + vsi_nn_kernel_param_add_int32( param, "half_pixel_centers", half_pixel_centers ); + vsi_nn_kernel_param_add_int32( param, "type", self->nn_param.resize.type ); + switch (self->nn_param.resize.type) { case VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR: - para.type = VX_INTERPOLATION_NEAREST_NEIGHBOR; break; + snprintf(kernel_name, sizeof(kernel_name), + "resize_nearest"); + break; case VSI_NN_INTERPOLATION_BILINEAR: - para.type = VX_INTERPOLATION_BILINEAR; break; - case VSI_NN_INTERPOLATION_AREA: - para.type = VX_INTERPOLATION_AREA; break; + snprintf(kernel_name, sizeof(kernel_name), + "resize_bilinear"); + break; default: - para.type = VX_INTERPOLATION_NEAREST_NEIGHBOR; + break; } - self->n = vxTensorScaleNode( self->graph->g, inputs[0]->t, &para, - sizeof(vx_nn_scale_params_t), outputs[0]->t ); - if( NULL != self->n ) - { + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + kernel_name, &inputs[0], 1, &outputs[0], 1, param ); + + if (self->n) { status = VSI_SUCCESS; } + + vsi_nn_kernel_param_release(&param); } return status; @@ -151,7 +162,7 @@ static vsi_bool op_setup vsi_enum layout = self->nn_param.resize.layout; vsi_nn_internal_node_t* curr = NULL; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; if (factor != 0) @@ -192,9 +203,7 @@ static vsi_bool op_setup } } - if ( ( self->nn_param.resize.align_corners || - self->nn_param.resize.half_pixel_centers || - layout == VSI_NN_RESIZE_LAYOUT_NHWC ) + if ( ( layout == VSI_NN_RESIZE_LAYOUT_NHWC ) && ( VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type ) ) { self->nn_param.resize.lcl_data->use_internal_node = TRUE; @@ -209,20 +218,6 @@ static vsi_bool op_setup curr->outputs[0] = outputs[0]; vsi_nn_internal_setup_node(self, curr); } - else if ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) - && 
(VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize.type)) - { - self->nn_param.resize.lcl_data->use_internal_node = TRUE; - - vsi_nn_internal_init_node_wksp( self ); - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_NEAREST_INTERNAL, 0, 0 ); - curr->node->nn_param.resize_nearest_internal.align_corners = self->nn_param.resize.align_corners; - curr->node->nn_param.resize_nearest_internal.factor = self->nn_param.resize.factor; - curr->node->nn_param.resize_nearest_internal.half_pixel_centers = self->nn_param.resize.half_pixel_centers; - curr->inputs[0] = inputs[0]; - curr->outputs[0] = outputs[0]; - vsi_nn_internal_setup_node(self, curr); - } else if (_is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num)) { self->nn_param.resize.lcl_data->use_internal_node = TRUE; @@ -242,7 +237,6 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - if (self->nn_param.resize.lcl_data->use_internal_node) { vsi_nn_safe_free(self->nn_param.resize.lcl_data); @@ -266,7 +260,7 @@ static vsi_status op_init self->nn_param.resize.lcl_data = (vsi_nn_resize_local_data *)malloc( sizeof(vsi_nn_resize_local_data) ); - if( NULL == self->nn_param.resize.lcl_data ) + if ( NULL == self->nn_param.resize.lcl_data ) { VSILOGE( "Create resize local data fail." ); status = VSI_FAILURE; @@ -274,11 +268,8 @@ static vsi_status op_init } memset( self->nn_param.resize.lcl_data, 0, sizeof(vsi_nn_resize_local_data) ); - if (vsi_nn_compareVersion(self->graph, 1, 1, 14) == -1) - { - self->nn_param.resize.align_corners = FALSE; - self->nn_param.resize.half_pixel_centers = FALSE; - } + self->nn_param.resize.align_corners = FALSE; + self->nn_param.resize.half_pixel_centers = FALSE; self->nn_param.resize.layout = VSI_NN_RESIZE_LAYOUT_NCHW; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c index a77de72..5092467 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c @@ -95,7 +95,6 @@ static vsi_status op_compute } } /* op_compute() */ - static vsi_bool op_check ( vsi_nn_node_t * self, @@ -104,33 +103,39 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(REVERSE, 1, 1) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_U8|Q_DFP, D_U8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I32|Q_DFP, D_I32|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_I32|Q_ASYM, D_I32|Q_ASYM) - IO_TYPE(D_U8|Q_SYM_PC, D_U8|Q_SYM_PC) - IO_TYPE(D_I8|Q_SYM_PC, D_I8|Q_SYM_PC) - IO_TYPE(D_I16|Q_SYM_PC, D_I16|Q_SYM_PC) - IO_TYPE(D_I32|Q_SYM_PC, D_I32|Q_SYM_PC) - IO_TYPE(D_U8, D_U8) - IO_TYPE(D_I8, D_I8) - IO_TYPE(D_I16, D_I16) - IO_TYPE(D_I32, D_I32) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_F16, D_I32) - IO_TYPE(D_U8|Q_ASYM, D_I32) - IO_TYPE(D_I8|Q_DFP, D_I32) - IO_TYPE(D_I16|Q_DFP, D_I32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_U8|Q_DFP, D_U8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I32|Q_DFP, D_I32|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I32|Q_ASYM, D_I32|Q_ASYM) + IO_TYPE(D_U8|Q_SYM_PC, D_U8|Q_SYM_PC) + IO_TYPE(D_I8|Q_SYM_PC, D_I8|Q_SYM_PC) + IO_TYPE(D_I16|Q_SYM_PC, D_I16|Q_SYM_PC) + IO_TYPE(D_I32|Q_SYM_PC, D_I32|Q_SYM_PC) + IO_TYPE(D_I32, D_I32) + 
IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_F16, D_I32) + IO_TYPE(D_U8|Q_ASYM, D_I32) + IO_TYPE(D_I8|Q_DFP, D_I32) + IO_TYPE(D_I16|Q_DFP, D_I32) /* HW 9.0 */ IO_TYPE(D_BF16, D_BF16) + + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(REVERSE) if(!VALIDATE_OP_IO_TYPES(REVERSE, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c index f754a67..49dbd7b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c @@ -83,10 +83,25 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + BEGIN_IO_TYPE_DECL(ROI_ALIGN, 3, 1) + IO_TYPE(D_F16, D_F16, D_I32, D_F16) + IO_TYPE(D_F16, D_F16, D_I32, D_F32) + IO_TYPE(D_F16, D_F32, D_I32, D_F16) + IO_TYPE(D_F32, D_F32, D_I32, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U16|Q_ASYM, D_I32, D_U8|Q_ASYM) + END_IO_TYPE_DECL(ROI_ALIGN) + if (!VALIDATE_OP_IO_TYPES(ROI_ALIGN, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; } /* op_check() */ - static vsi_bool op_setup ( vsi_nn_node_t * self, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c index 90395c0..b4c8666 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c @@ -71,14 +71,26 @@ static vsi_bool op_check BEGIN_IO_TYPE_DECL(RSQRT, 1, 1) IO_TYPE(D_F16, D_F16) IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM) IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) IO_TYPE(D_F16, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) IO_TYPE(D_BF16, D_BF16) IO_TYPE(D_BF16, D_F32) IO_TYPE(D_F32, D_BF16) @@ -106,6 +118,12 @@ static vsi_bool op_check IO_TYPE(D_F16, D_BF16) IO_TYPE(D_F16, D_F32) + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(RSQRT) if(!VALIDATE_OP_IO_TYPES(RSQRT, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, @@ -136,4 +154,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_select.c b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c index fd01b8a..c95d75e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_select.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c @@ -118,23 +118,60 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(SELECT, 3, 1) - IO_TYPE(D_I8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8, 
D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8, D_F16, D_F16, D_F16) - IO_TYPE(D_I8, D_I32, D_I32, D_I32) - IO_TYPE(D_I8, D_F32, D_F32, D_F32) - IO_TYPE(D_BOOL8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_BOOL8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_BOOL8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_BOOL8, D_F16, D_F16, D_F16) - IO_TYPE(D_BOOL8, D_I32, D_I32, D_I32) - IO_TYPE(D_BOOL8, D_F32, D_F32, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8, D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8, D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8, D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I8, D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8, D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8, D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8, D_F16, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8, D_F16, D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8, D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_I8, D_F16, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I8, D_I16|Q_SYM, D_F16, D_F16) + IO_TYPE(D_I8, D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I8, D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8, D_I8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I8, D_I8|Q_SYM, D_F16, D_F16) + IO_TYPE(D_I8, D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8, D_I16|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I8, D_I16|Q_SYM, D_F16, D_F16) + IO_TYPE(D_I8, D_F16, D_F16, D_F16) + IO_TYPE(D_I8, D_I32, D_I32, D_I32) + IO_TYPE(D_I8, D_F32, D_F32, D_F32) + IO_TYPE(D_BOOL8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_BOOL8, D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_BOOL8, D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_BOOL8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_BOOL8, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_BOOL8, D_I16|Q_ASYM, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_BOOL8, D_I16|Q_SYM, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_BOOL8, D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_BOOL8, D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_BOOL8, D_F16, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_BOOL8, D_F16, D_I8|Q_SYM, D_F16) + IO_TYPE(D_BOOL8, D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_BOOL8, D_F16, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_BOOL8, D_I16|Q_SYM, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_I8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_I8|Q_SYM, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_I16|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_I16|Q_SYM, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_F16, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_BOOL8, D_I32, D_I32, D_I32) + IO_TYPE(D_BOOL8, D_F32, D_F32, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) END_IO_TYPE_DECL(SELECT) - if(!VALIDATE_OP_IO_TYPES(SELECT, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(SELECT, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -193,7 +230,6 @@ static vsi_bool op_setup return ret; } /* op_setup() */ 
- #ifdef __cplusplus extern "C" { #endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c index b5ef3e5..bb41e98 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c @@ -84,18 +84,28 @@ static vsi_bool op_check if (self->input.num > 1) { BEGIN_IO_TYPE_DECL(SLICE, 2, 1) - IO_TYPE(D_F16, D_I32, D_F16) - IO_TYPE(D_F16, D_I32, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I32, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I32, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I32, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I32, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_I32, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_F16, D_I32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I32, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_I32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I32, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I32, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I32, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_I32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I32, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I32, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_I32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) /* HW 9.0 */ IO_TYPE(D_BF16, D_I32, D_BF16) @@ -211,7 +221,7 @@ static vsi_status op_init { return VX_ERROR_NO_MEMORY; } - memset(p->lcl_data, 0, sizeof(vsi_nn_split_lcl_data)); + memset(p->lcl_data, 0, sizeof(vsi_nn_slice_lcl_data)); return status; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c index 257f1e2..b8cd921 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c @@ -57,37 +57,47 @@ static vsi_bool op_check /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(SOFTMAX, 1, 1) /* IO_TYPE(INPUT, OUTPUT) */ - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_BF16, D_F16) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_ASYM) - IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_BF16, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_SYM, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F32) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_F32) - - 
IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F32) - - IO_TYPE(D_I16|Q_DFP, D_F32) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32) + IO_TYPE(D_I16|Q_SYM, D_F32) + IO_TYPE(D_I16|Q_DFP, D_F32) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) END_IO_TYPE_DECL(SOFTMAX) - if(!VALIDATE_OP_IO_TYPES(SOFTMAX, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(SOFTMAX, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c index 34202c3..2cba925 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth_internal.c @@ -108,13 +108,17 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(SPACE2DEPTH_INTERNAL, 1, 1) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) /* HW 9.0 */ IO_TYPE(D_BF16, D_BF16) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_split.c b/src/tim/vx/internal/src/ops/vsi_nn_op_split.c index a510217..9810b2c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_split.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_split.c @@ -62,7 +62,7 @@ static vsi_bool op_check /* compute the output tensor number */ num = (uint32_t)(self->output.num - 1); - while( NULL == outputs[num] ) + while ( NULL == outputs[num] ) { num --; } @@ -70,37 +70,37 @@ static vsi_bool op_check ret = TRUE; /* 1. check the input tensor number */ - if(self->input.num != 1) + if (self->input.num != 1) { VSILOGE("The split layer input num must be 1, here is %u\n", self->input.num); return FALSE; } /* 2. check output tensor number */ - if(slices_num == 0) + if (slices_num == 0) { uint32_t remaind = inputs[0]->attr.size[axis] % num; - if(remaind != 0) + if (remaind != 0) { VSILOGE("Can not average the input tensor %u shape\n", axis); return FALSE; } } - else if(slices_num != num) + else if (slices_num != num) { VSILOGE( "slices num %u != output tensor num %u\n", slices_num, num); return FALSE; } /* 3. 
check output tensor shape and dimensions */ - for( i = 0; i < num; i ++ ) + for ( i = 0; i < num; i ++ ) { /* the virtual tensor shape has not been calculated yet */ - if(outputs[i]->attr.vtl == TRUE + if (outputs[i]->attr.vtl == TRUE || outputs[i]->attr.dim_num == VSI_NN_DIM_AUTO) continue; - if( outputs[i]->attr.dim_num != inputs[0]->attr.dim_num ) + if ( outputs[i]->attr.dim_num != inputs[0]->attr.dim_num ) { VSILOGE( "Split dims num(%d vs %d)", outputs[i]->attr.dim_num, @@ -109,14 +109,14 @@ static vsi_bool op_check break; } - for( j = 0; j < outputs[i]->attr.dim_num; j ++ ) + for ( j = 0; j < outputs[i]->attr.dim_num; j ++ ) { - if( axis == j ) + if ( axis == j ) { continue; } - if( outputs[i]->attr.size[j] != inputs[0]->attr.size[j] ) + if ( outputs[i]->attr.size[j] != inputs[0]->attr.size[j] ) { VSILOGE( "Split dims size(%d vs %d)", outputs[i]->attr.size[j], @@ -126,12 +126,12 @@ static vsi_bool op_check } } - if( FALSE == ret ) + if ( FALSE == ret ) { break; } } - for(i = 0; i < num; i++) + for (i = 0; i < num; i++) { BEGIN_IO_TYPE_DECL(SPLIT, 1, 1) IO_TYPE(D_F16, D_F16) @@ -161,7 +161,7 @@ static vsi_bool op_check /* HW 9.0 */ IO_TYPE(D_BF16, D_BF16) END_IO_TYPE_DECL(SPLIT) - if(!VALIDATE_OP_IO_TYPES(SPLIT, self, inputs, 1, &outputs[i], 1)) { + if (!VALIDATE_OP_IO_TYPES(SPLIT, self, inputs, 1, &outputs[i], 1)) { char* desc = generate_op_io_types_desc(inputs, 1, &outputs[i], 1); VSILOGE("Inputs/Outputs data type not support: %s", desc); destroy_op_io_types_desc(desc); @@ -179,7 +179,7 @@ static vsi_bool op_setup ) { vsi_bool ret; - uint32_t i,num; + uint32_t i, num; vsi_size_t average; vsi_size_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_size_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; @@ -193,7 +193,7 @@ static vsi_bool op_setup average = 1; /* compute the output tensor number */ num = (uint32_t)(self->output.num - 1); - while( NULL == outputs[num] ) + while ( NULL == outputs[num] ) { num --; } @@ -202,7 +202,7 @@ static vsi_bool op_setup p = &(self->nn_param.split); vsi_nn_internal_init_node_wksp( self ); - if(slices_num == 0) + if (slices_num == 0) { average = inputs[0]->attr.size[axis] / num; } @@ -211,7 +211,7 @@ static vsi_bool op_setup { p->lcl_data->stride_dims[i] = 1; } - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { end[i] = inputs[0]->attr.size[i]; } @@ -231,7 +231,7 @@ static vsi_bool op_setup outputs[i]->attr.size[j] = inputs[0]->attr.size[j]; } outputs[i]->attr.size[axis] = end[axis] - start[axis]; - for(j = 0; j < VSI_NN_MAX_DIM_NUM; j++) + for (j = 0; j < VSI_NN_MAX_DIM_NUM; j++) { p->lcl_data->begin_dims[j] = (int32_t)start[j]; p->lcl_data->end_dims[j] = (int32_t)end[j]; @@ -368,4 +368,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c index 250f4f3..5fe93f7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include @@ -89,6 +88,8 @@ static vsi_bool op_check } } + ret = vsi_nn_OpCheck(VSI_NN_OP_RSQRT, self, inputs, outputs); + return ret; } /* op_check() */ @@ -191,4 +192,3 @@ DEF_OP_REG ); __END_DECLS - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c index dcd34fe..aa22120 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -297,7 +297,7 @@ static vsi_status op_compute dst_tensor = p->dst_tensor ? p->dst_tensor : outputs[0]->t; p->cp_node = vxTensorCopyNode(self->graph->g, p->src_tensor, dst_tensor ); - if( NULL == p->cp_node ) + if ( NULL == p->cp_node ) { VSILOGE( "Create vxTensorCopyNode fail." ); status = VSI_FAILURE; @@ -322,7 +322,7 @@ static vsi_status op_compute self->graph, (uint8_t *)start_dims, &attr); - if( NULL == begin_dims_tensor ) + if ( NULL == begin_dims_tensor ) { VSILOGE("Create begin_dims_tensor fail.(strided_slice)"); return VSI_FAILURE; @@ -341,7 +341,7 @@ static vsi_status op_compute self->graph, (uint8_t *)stop_dims, &attr); - if( NULL == end_dims_tensor ) + if ( NULL == end_dims_tensor ) { VSILOGE("Create end_dims_tensor fail.(strided_slice)"); return VSI_FAILURE; @@ -360,7 +360,7 @@ static vsi_status op_compute self->graph, (uint8_t *)stride_dims, &attr); - if( NULL == stride_dims_tensor ) + if ( NULL == stride_dims_tensor ) { VSILOGE("Create stride_dims_tensor fail.(strided_slice)"); return VSI_FAILURE; @@ -396,7 +396,7 @@ static vsi_status op_compute } output_tensor = vsi_nn_reshape_tensor(self->graph, outputs[0], sizes, dims); - if( NULL == output_tensor ) + if ( NULL == output_tensor ) { VSILOGE("Create output_tensor fail.(strided_slice)"); return VSI_FAILURE; @@ -415,7 +415,7 @@ static vsi_status op_compute vsi_nn_ReleaseTensor(&output_tensor); } - if( NULL != self->n ) + if ( NULL != self->n ) { status = VSI_SUCCESS; } @@ -436,10 +436,18 @@ static vsi_bool op_check IO_TYPE(D_F16, D_I16|Q_DFP) IO_TYPE(D_F16, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_F32, D_F32) IO_TYPE(D_BF16, D_BF16) @@ -469,8 +477,14 @@ static vsi_bool op_check IO_TYPE(D_F16, D_F32) IO_TYPE(D_U8|Q_ASYM, D_I32|Q_ASYM) + /* HW 9.1.1 */ + IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) + IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) + END_IO_TYPE_DECL(STRIDED_SLICE) - if (!VALIDATE_OP_IO_TYPES(STRIDED_SLICE, self, inputs, self->input.num, outputs, self->output.num)) + if (!VALIDATE_OP_IO_TYPES(STRIDED_SLICE, self, inputs, 1, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); @@ -482,6 +496,46 @@ static vsi_bool op_check return TRUE; } /* op_check() */ +int32_t _reverse_mask_bits(int32_t mask, int32_t dims) +{ + int32_t i = 0; + int32_t new_mask = 0; + int32_t bits = mask; + int32_t leading_one = 0; + + for (leading_one = 0; leading_one < VSI_NN_MAX_DIM_NUM; leading_one ++) + { + if ( bits == 0 ) + { + break; + } + + bits >>= 1; + } + + dims = vsi_nn_max(dims, leading_one); + for (i = 0; i < dims; i++) + { + int32_t offset = dims - i - 1; + if (mask & (1 << i)) + { + new_mask |= (1 << offset); + } + } + + return new_mask; +} + +void _reverse_indices(int32_t *dst, const int32_t *src, int32_t dims) +{ + int32_t i = 0; + + for (i = 0; i < dims; i++) + { + dst[dims - i - 1] = src[i]; + } +} + static vsi_bool _build_strided_slice_params(vsi_nn_strided_slice_param * op_params, int32_t input_dims) { uint32_t i = 0; @@ -490,37 
+544,60 @@ static vsi_bool _build_strided_slice_params(vsi_nn_strided_slice_param * op_para int32_t begin_mask = op_params->begin_mask; int32_t end_mask = op_params->end_mask; int32_t shrink_axis_mask = op_params->shrink_axis_mask; + int32_t new_axis_mask = op_params->new_axis_mask; + int32_t start_indices[2][VSI_NN_MAX_DIM_NUM] = {{0}}; + int32_t stop_indices[2][VSI_NN_MAX_DIM_NUM] = {{0}}; + int32_t strides[2][VSI_NN_MAX_DIM_NUM] = {{0}}; + int32_t start_mask = 0; + int32_t stop_mask = 0; + int32_t shrink_mask = 0; + int32_t output_dims = input_dims; const int32_t *begin_dims = op_params->begin_dims; const int32_t *end_dims = op_params->end_dims; const int32_t *stride_dims = op_params->stride_dims; strided_slice_param *params = &op_params->lcl2_data->params; + begin_mask = _reverse_mask_bits(begin_mask, input_dims); + end_mask = _reverse_mask_bits(end_mask, input_dims); + shrink_axis_mask = _reverse_mask_bits(shrink_axis_mask, input_dims); + _reverse_indices(start_indices[0], begin_dims, op_params->begin_dims_num); + _reverse_indices(stop_indices[0], end_dims, op_params->end_dims_num); + _reverse_indices(strides[0], stride_dims, op_params->stride_dims_num); + for (i = 0; i < op_params->begin_dims_num; i++) { if ( op_params->new_axis_mask & (1 << i)) { num_add_axis ++; + output_dims ++; } } + for (i = 0; i < (uint32_t)(input_dims + num_add_axis); i++) + { + if ( op_params->shrink_axis_mask & (1 << i)) + { + output_dims --; + } + } + + new_axis_mask = _reverse_mask_bits(new_axis_mask, output_dims); + params->num_add_axis = num_add_axis; for (i = 0; i < (uint32_t)(input_dims + num_add_axis); i++) { - if ( op_params->new_axis_mask & (1 << i) ) + if ( new_axis_mask & (1 << i) ) { continue; } else if (i >= op_params->begin_dims_num + added_ellipsis) { - params->begin_mask |= (1 << params->begin_dims_num); - params->end_mask |= (1 << params->end_dims_num); - params->begin_dims[params->begin_dims_num ++ ] = - 0; - params->end_dims[params->end_dims_num ++] = - 0; - params->stride_dims[params->stride_dims_num ++] = - 1; + start_mask |= (1 << params->begin_dims_num); + stop_mask |= (1 << params->end_dims_num); + start_indices[1][params->begin_dims_num ++ ] = 0; + stop_indices[1][params->end_dims_num ++] = 0; + strides[1][params->stride_dims_num ++] = 1; } else { @@ -528,28 +605,32 @@ static vsi_bool _build_strided_slice_params(vsi_nn_strided_slice_param * op_para if (begin_mask & (1 << orig_idx)) { - params->begin_mask |= (1 << params->begin_dims_num); + start_mask |= (1 << params->begin_dims_num); } if (end_mask & (1 << orig_idx)) { - params->end_mask |= (1 << params->end_dims_num); + stop_mask |= (1 << params->end_dims_num); } if (shrink_axis_mask & (1 << orig_idx)) { - params->shrink_axis_mask |= (1 << params->begin_dims_num); + shrink_mask |= (1 << params->begin_dims_num); } - params->begin_dims[params->begin_dims_num ++] = - begin_dims[orig_idx]; - params->end_dims[params->end_dims_num ++] = - end_dims[orig_idx]; - params->stride_dims[params->stride_dims_num ++] = - stride_dims[orig_idx]; + start_indices[1][params->begin_dims_num ++] = start_indices[0][orig_idx]; + stop_indices[1][params->end_dims_num ++] = stop_indices[0][orig_idx]; + strides[1][params->stride_dims_num ++] = strides[0][orig_idx]; } } + params->begin_mask = _reverse_mask_bits(start_mask, input_dims); + params->end_mask = _reverse_mask_bits(stop_mask, input_dims); + params->shrink_axis_mask = _reverse_mask_bits(shrink_mask, input_dims); + _reverse_indices(params->begin_dims, start_indices[1], params->begin_dims_num); + 
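
(Aside, not part of the patch.) The two helpers above mirror mask bits and index arrays, apparently because the incoming strided-slice attributes are expressed against the opposite dimension order from the one used internally. A minimal stand-alone sketch of the bit flip, with made-up values and the dims-widening step of the real helper omitted:

/* mask_flip_demo.c -- illustrative only; build with: cc mask_flip_demo.c && ./a.out */
#include <stdio.h>

static int reverse_mask_bits(int mask, int dims)
{
    int i, new_mask = 0;
    for (i = 0; i < dims; i++)
    {
        if (mask & (1 << i))
        {
            new_mask |= (1 << (dims - i - 1));   /* bit i -> bit dims-1-i */
        }
    }
    return new_mask;
}

int main(void)
{
    /* begin_mask 0b0011 (axes 0 and 1) on a 4-D shape maps to 0b1100 (axes 2 and 3). */
    printf("0x%x\n", reverse_mask_bits(0x3, 4));   /* prints 0xc */
    return 0;
}
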
_reverse_indices(params->end_dims, stop_indices[1], params->end_dims_num); + _reverse_indices(params->stride_dims, strides[1], params->stride_dims_num); + return TRUE; } @@ -678,7 +759,7 @@ static vsi_status op_optimize VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); - if( NULL == inputs[0]->t ) + if ( NULL == inputs[0]->t ) { vsi_nn_TensorReinit( self->graph, inputs[0] ); } @@ -687,7 +768,7 @@ static vsi_status op_optimize memcpy( start, (vsi_size_t*)start_dims, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); memcpy( end, (vsi_size_t*)stop_dims, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); in_view_tensor = vsi_nn_CreateViewTensor(self->graph, start, end, inputs[0]); - if( NULL == in_view_tensor ) + if ( NULL == in_view_tensor ) { VSILOGE( "Create tensor %d from view fail.", i ); status = VSI_FAILURE; @@ -697,12 +778,12 @@ static vsi_status op_optimize self->nn_param.strided_slice.lcl2_data->is_optimized = TRUE; is_same_quant_type = _is_same_quant(inputs, outputs); - if( NULL != outputs[0]->t || is_same_quant_type == FALSE) + if ( NULL != outputs[0]->t || is_same_quant_type == FALSE) { VSILOGI( "stride slice copy tensor."); // Copy old tensor values to the new address. status = copy_tensor_to_view( self, in_view_tensor, outputs[0]); - if( VSI_FAILURE == status ) + if ( VSI_FAILURE == status ) { goto OnError; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_swish.c b/src/tim/vx/internal/src/ops/vsi_nn_op_swish.c index 7900d88..2aaeb36 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_swish.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_swish.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include @@ -77,32 +76,11 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(SWISH, 1, 1) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_I32, D_I32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16) - END_IO_TYPE_DECL(SWISH) - if(!VALIDATE_OP_IO_TYPES(SWISH, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } + vsi_bool ret = FALSE; - return TRUE; + ret = vsi_nn_OpCheck(VSI_NN_OP_RSQRT, self, inputs, outputs); + + return ret; } /* op_check() */ static vsi_bool op_setup @@ -153,4 +131,3 @@ DEF_OP_REG ); __END_DECLS - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c index b2c13dd..647396f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include #include "vsi_nn_types.h" @@ -57,7 +56,6 @@ static vsi_status _tile_op_compute &inputs[0], 1, &outputs[0], 1, NULL ); - if( self->n ) { status = VSI_SUCCESS; @@ -77,17 +75,21 @@ static vsi_bool op_check vsi_nn_tile_param * p; BEGIN_IO_TYPE_DECL(TILE, 1, 1) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16) - 
IO_TYPE(D_F16, D_F16) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_I32, D_I32) - IO_TYPE(D_U32, D_U32) - IO_TYPE(D_F32, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_U32, D_U32) + IO_TYPE(D_F32, D_F32) END_IO_TYPE_DECL(TILE) - if(!VALIDATE_OP_IO_TYPES(TILE, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(TILE, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -160,4 +162,3 @@ DEF_TILE_OP( TILE, tile ); #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c b/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c index 34becd3..1923b26 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c @@ -53,7 +53,7 @@ static vsi_bool vsi_nn_upsample_optimize_shape ) { vsi_bool enable_image_2d = FALSE; - vsi_ssize_t hwLitimLen = 65536; + vsi_ssize_t hwLitimLen = GPU_TENSOR_MAX_WIDTH; if ((2 == self->nn_param.upsample.scale[0]) && (2 == self->nn_param.upsample.scale[1])) @@ -166,7 +166,6 @@ static vsi_status op_compute if( ret ) { - reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0], shapes[0], new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, @@ -311,4 +310,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index ade122c..b782511 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -444,6 +444,13 @@ static _op_param_gen_t s_op_gen[] = /* GATHER_ELEMENTS */ NULL, /* SELU */ NULL, /* CELU */ NULL, + /* MAX_POOL3D */ NULL, + /* RCP */ NULL, + /* SIGN */ NULL, + /* SOFTSIGN */ NULL, + /* CUMSUM */ NULL, + /* MAXPOOLWITHARGMAX */ NULL, + /* MOD */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c index 92dedcc..2e6b26e 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c @@ -31,7 +31,7 @@ #include "kernel/vsi_nn_kernel.h" #define DEF_DTYPE_CONVERT_NORMAL(SRC_NAME, SRC_DTYPE, DST_NAME, DST_DTYPE) \ -static inline void _convert_##SRC_NAME##_to_##DST_NAME \ +static VSI_INLINE_API void _convert_##SRC_NAME##_to_##DST_NAME \ ( \ const SRC_DTYPE * buffer, \ size_t size, \ @@ -60,7 +60,7 @@ DEF_DTYPE_CONVERT_NORMAL( float, float, uint32, uint32_t ) DEF_DTYPE_CONVERT_NORMAL( float, float, uint16, uint16_t ) #undef DEF_DTYPE_CONVERT_NORMAL -static inline void _convert_float16_to_float +static VSI_INLINE_API void _convert_float16_to_float ( const vsi_float16 * buffer, size_t size, @@ -74,7 +74,7 @@ static inline void _convert_float16_to_float } } /* _convert_float16_to_float */ -static inline void _convert_float_to_float16 +static VSI_INLINE_API void _convert_float_to_float16 ( const float * buffer, size_t size, @@ -88,7 +88,7 @@ static inline void _convert_float_to_float16 } } /* 
_convert_float_to_float16 */ -static inline void _convert_bfloat16_to_float +static VSI_INLINE_API void _convert_bfloat16_to_float ( const vsi_bfloat16 * buffer, size_t size, @@ -102,7 +102,7 @@ static inline void _convert_bfloat16_to_float } } /* _convert_bfloat16_to_float */ -static inline void _convert_float_to_bfloat16 +static VSI_INLINE_API void _convert_float_to_bfloat16 ( const float * buffer, size_t size, @@ -163,7 +163,7 @@ DEF_DTYPE_CONVERT_QUANTIZE( symm16, int16_t, vsi_rtne, SHRT_MIN, SHRT_MAX ) DEF_DTYPE_CONVERT_QUANTIZE( symm32, int32_t, vsi_rtne, INT_MIN, INT_MAX ) DEF_DTYPE_CONVERT_QUANTIZE( symm64, int64_t, vsi_rtne, LLONG_MIN, LLONG_MAX ) DEF_DTYPE_CONVERT_QUANTIZE( asymm8, uint8_t, vsi_rtne, 0, UCHAR_MAX ) -//DEF_DTYPE_CONVERT_QUANTIZE( asymm16, uint16_t, vsi_rtne, 0, USHRT_MAX ) +DEF_DTYPE_CONVERT_QUANTIZE( asymm16, uint16_t, vsi_rtne, 0, USHRT_MAX ) //DEF_DTYPE_CONVERT_QUANTIZE( asymm32, uint32_t, vsi_rtne, 0, UINT_MAX ) #undef DEF_DTYPE_CONVERT_QUANTIZE @@ -419,6 +419,9 @@ vsi_bool vsi_nn_dtype_convert_quantize_asymm_to_float case U8: return vsi_nn_dtype_convert_quantize_asymm8_to_float( (const uint8_t *)buffer, size, scale, zero_point, out_buffer ); + case U16: + return vsi_nn_dtype_convert_quantize_asymm16_to_float( + (const uint16_t*)buffer, size, scale, zero_point, out_buffer); case I8: return vsi_nn_dtype_convert_quantize_symm8_to_float( (const int8_t *)buffer, size, scale, zero_point, out_buffer ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c index 3c45846..6547f46 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c @@ -447,34 +447,66 @@ vsi_bool vsi_nn_DtypeCompare vsi_nn_dtype_t *dtype1 ) { - if(NULL == dtype0 || NULL == dtype1) + if (NULL == dtype0 || NULL == dtype1) { return FALSE; } - if(dtype0->vx_type != dtype1->vx_type || dtype0->qnt_type != dtype1->qnt_type) + if ( dtype0->vx_type != dtype1->vx_type || + dtype0->qnt_type != dtype1->qnt_type ) { return FALSE; } - if(dtype0->qnt_type == VSI_NN_QNT_TYPE_DFP) + + switch (dtype0->qnt_type) { - if(dtype0->fl != dtype1->fl) + case VSI_NN_QNT_TYPE_DFP: + if (dtype0->fl != dtype1->fl) + { + return FALSE; + } + break; + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: { - return FALSE; + const float diff = (float)1e-5; + if (dtype0->zero_point != dtype1->zero_point) + { + return FALSE; + } + if (vsi_nn_float_compare(dtype0->scale, dtype1->scale, diff) + == FALSE) + { + return FALSE; + } + + break; } - } - else if( dtype0->qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC || - dtype0->qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ) - { - const float diff = (float)1e-5; - if(dtype0->zero_point != dtype1->zero_point) + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC: { - return FALSE; - } - if(vsi_nn_float_compare(dtype0->scale, dtype1->scale, diff) == FALSE) - { - return FALSE; + const float diff = (float)1e-5; + int32_t i = 0; + int32_t scale_cnt0 = dtype0->scale_dim; + int32_t scale_cnt1 = dtype1->scale_dim; + + if (scale_cnt0 == scale_cnt1) + { + const float* src_scale_ptr = dtype0->scales; + const float* dst_scale_ptr = dtype1->scales; + for (i = 0; i < scale_cnt0; i++) + { + if (vsi_nn_float_compare(src_scale_ptr[i],dst_scale_ptr[i], diff) + == FALSE) + { + return FALSE; + } + } + } + break; } + default: + break; } return TRUE; diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c 
b/src/tim/vx/internal/src/utils/vsi_nn_util.c index bd14b39..25ffab7 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -28,7 +28,7 @@ #include #include -#ifdef _WIN32 +#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) #include #include #else @@ -112,7 +112,7 @@ char* vsi_nn_strncpy ) { char* ret = NULL; - #ifdef _MSC_VER + #if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) strncpy_s(dest, count, source, _TRUNCATE); #else strncpy(dest, source, count); @@ -128,7 +128,7 @@ char* vsi_nn_strncat ) { char* ret = NULL; - #ifdef _MSC_VER + #if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) strncat_s(dest, count, source, _TRUNCATE); ret = dest; #else @@ -143,7 +143,7 @@ char* vsi_nn_getenv ) { char* var = NULL; - #ifdef _MSC_VER + #if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) size_t var_size = 0; _dupenv_s(&var, &var_size, var_name); #else @@ -159,7 +159,7 @@ FILE* vsi_nn_fopen ) { FILE * file = NULL; - #ifdef _MSC_VER + #if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) fopen_s(&file, file_name, mode); #else file = fopen(file_name, mode); @@ -795,7 +795,7 @@ int32_t vsi_nn_Access return -1; } -#ifdef _WIN32 +#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) return _access(path, mode); #else return access(path, mode); @@ -813,7 +813,7 @@ int32_t vsi_nn_Mkdir return -1; } -#ifdef _WIN32 +#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) return _mkdir(path); #else return mkdir(path, mode); @@ -1128,59 +1128,67 @@ vsi_bool vsi_nn_is_same_quant_type( vsi_nn_tensor_t * dst ) { - vx_bool result = FALSE; + vsi_nn_dtype_t *src_dtype = NULL, *dst_dtype = NULL; - if (src->attr.dtype.vx_type == dst->attr.dtype.vx_type) + src_dtype = &src->attr.dtype; + dst_dtype = &dst->attr.dtype; + + if (src_dtype->qnt_type != dst_dtype->qnt_type) { - switch (src->attr.dtype.qnt_type) - { - case VSI_NN_QNT_TYPE_NONE: - result = TRUE; - break; + return FALSE; + } + switch (src_dtype->qnt_type) + { case VSI_NN_QNT_TYPE_DFP: - if (src->attr.dtype.fl == dst->attr.dtype.fl) + if (src_dtype->fl != dst_dtype->fl) { - result = TRUE; + return FALSE; } break; case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: - if (src->attr.dtype.scale == dst->attr.dtype.scale && - src->attr.dtype.zero_point == dst->attr.dtype.zero_point) + { + const float diff = (float)1e-5; + if (src_dtype->zero_point != dst_dtype->zero_point) { - result = TRUE; + return FALSE; + } + if (vsi_nn_float_compare(src_dtype->scale, dst_dtype->scale, diff) + == FALSE) + { + return FALSE; } break; - + } case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC: + { + const float diff = (float)1e-5; + int32_t i = 0; + int32_t scale_cnt0 = src_dtype->scale_dim; + int32_t scale_cnt1 = dst_dtype->scale_dim; + + if (scale_cnt0 == scale_cnt1) { - int32_t i = 0; - int32_t scale_cnt0 = src->attr.dtype.scale_dim; - int32_t scale_cnt1 = dst->attr.dtype.scale_dim; - - if (scale_cnt0 == scale_cnt1) + const float* src_scale_ptr = src_dtype->scales; + const float* dst_scale_ptr = dst_dtype->scales; + for (i = 0; i < scale_cnt0; i++) { - const float *src_scale_ptr = src->attr.dtype.scales; - const float *dst_scale_ptr = dst->attr.dtype.scales; - for (i = 0; i < scale_cnt0; i++) + if (vsi_nn_float_compare( + src_scale_ptr[i], dst_scale_ptr[i], diff) == FALSE) { - if (src_scale_ptr[i] != dst_scale_ptr[i]) - break; + return FALSE; } - - if (i == 
scale_cnt0)
-                result = TRUE;
             }
         }
         break;
-
+        }
         default:
             break;
-        }
     }
-    return result;
+    return TRUE;
 }
 
 vsi_bool vsi_nn_is_same_type
@@ -1220,6 +1228,67 @@ vsi_bool vsi_nn_is_broadcast_operaton
     return FALSE;
 }
 
+vsi_bool vsi_nn_is_broadcast_axes_operaton
+    (
+    vsi_nn_tensor_t ** inputs,
+    size_t input_num,
+    vsi_nn_tensor_t * output,
+    int32_t * axis,
+    int32_t axis_num
+    )
+{
+    vsi_size_t out_rank = output->attr.dim_num;
+    vsi_size_t i = 0;
+
+    if (vsi_nn_is_broadcast_operaton(inputs, input_num, output) == FALSE)
+    {
+        return FALSE;
+    }
+
+    for (i = 0; i < out_rank; i++)
+    {
+        size_t j = 0;
+        int32_t k = 0;
+        vsi_size_t src0_size = i < inputs[0]->attr.dim_num ?
+            inputs[0]->attr.size[i] : 1;
+
+        for (k = 0; k < axis_num; k++)
+        {
+            if (axis[k] == (int32_t)i)
+            {
+                for (j = 1; j < input_num; j++)
+                {
+                    vsi_size_t src_size = i < inputs[j]->attr.dim_num ?
+                        inputs[j]->attr.size[i] : 1;
+
+                    if (src0_size == src_size)
+                    {
+                        return FALSE;
+                    }
+                }
+
+                break;
+            }
+        }
+
+        if (axis[k] == (int32_t)i)
+        {
+            continue;
+        }
+
+        for (j = 1; j < input_num; j++)
+        {
+            vsi_size_t src_size = i < inputs[j]->attr.dim_num ? inputs[j]->attr.size[i] : 1;
+
+            if (src0_size != src_size)
+            {
+                return FALSE;
+            }
+        }
+    }
+    return TRUE;
+}
+
 float vsi_nn_get_tensor_scale
     (
     vsi_nn_tensor_t * tensor
diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c
index acd5b4f..cbddf2d 100644
--- a/src/tim/vx/internal/src/vsi_nn_context.c
+++ b/src/tim/vx/internal/src/vsi_nn_context.c
@@ -62,6 +62,8 @@ static vsi_status query_hardware_caps
         sizeof(vx_hardware_caps_params_ext2_t));
     context->config.support_stream_processor = paramExt.supportStreamProcessor;
     context->config.sp_exec_count = paramExt2.streamProcessorExecCount;
+    context->config.sp_vector_depth = paramExt2.streamProcessorVectorSize;
+    context->config.sp_per_core_vector_depth = context->config.sp_vector_depth / context->config.sp_exec_count;
 #endif
 #endif
diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c
index ee81ac1..535f595 100644
--- a/src/tim/vx/internal/src/vsi_nn_graph.c
+++ b/src/tim/vx/internal/src/vsi_nn_graph.c
@@ -1875,7 +1875,6 @@ final:
     return status;
 } /* vsi_nn_TrySetupCompleteSignalNode() */
 
-
 /*
  * Documented in vsi_nn_graph.h
  */
@@ -1884,7 +1883,7 @@ vsi_status vsi_nn_setup_binary_graph_inputs_outputs
     vsi_nn_graph_t* graph
     )
 {
-    uint32_t i,j;
+    uint32_t i,j,k,p;
     vsi_status status = VSI_FAILURE;
     uint32_t num_of_graph_inputs;
     uint32_t num_of_graph_real_inputs;
@@ -1911,6 +1910,33 @@ vsi_status vsi_nn_setup_binary_graph_inputs_outputs
             ;//do nothing
         }
     }
+    /*update inputs for nbg node who has crop scalar parameter as inputs*/
+    for (i = 0; i < graph->node_num; i++)
+    {
+        vsi_nn_node_t* node = vsi_nn_GetNode(graph, i);
+        uint32_t numParams = 0;
+        if (node->op == VSI_NN_OP_NBG)
+        {
+            status = vxQueryNode(
+                node->n, VX_NODE_PARAMETERS, &numParams, sizeof(numParams));
+            for (j = 0; j < numParams; j++)
+            {
+                vx_parameter param = 0;
+                vx_enum type = 0;
+                param = vxGetParameterByIndex(node->n, j);
+                status = vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum));
+                if (type == VX_TYPE_SCALAR)
+                {
+                    num_of_graph_real_inputs++;
+                }
+                if (param != NULL)
+                {
+                    vxReleaseParameter(&param);
+                    param = NULL;
+                }
+            }
+        }
+    }
     graph_inputs = (vx_reference *)malloc( num_of_graph_real_inputs * sizeof( vx_reference ) );
     CHECK_PTR_FAIL_GOTO( graph_inputs, "Create buffer fail.", final );
     for( i = 0, j = 0; i < num_of_graph_inputs; i++ )
@@ -1924,6 +1950,52 @@ vsi_status
vsi_nn_setup_binary_graph_inputs_outputs
             goto final;
         }
         graph_inputs[j++] = (vx_reference)( tensor->t );
+        for (k = 0; k < graph->node_num; k++)
+        {
+            vsi_nn_node_t* node = vsi_nn_GetNode(graph, k);
+            if (node->op == VSI_NN_OP_NBG)
+            {
+                vx_parameter param = 0;
+                vx_reference ref = 0;
+                vx_enum type = 0;
+                uint32_t scalar_index = j;
+                param = vxGetParameterByIndex(node->n, scalar_index);
+                status = vxQueryParameter(param,
+                    VX_PARAMETER_TYPE,
+                    &type,
+                    sizeof(vx_enum));
+                if (param != NULL)
+                {
+                    vxReleaseParameter(&param);
+                    param = NULL;
+                }
+                if (type != VX_TYPE_SCALAR)
+                {
+                    break;
+                }
+                for (p = scalar_index; p < scalar_index+4; p++)
+                {
+                    param = vxGetParameterByIndex(node->n, p);
+                    status = vxQueryParameter(param,
+                        VX_PARAMETER_TYPE,
+                        &type,
+                        sizeof(vx_enum));
+                    if (type == VX_TYPE_SCALAR)
+                    {
+                        vxQueryParameter(param,
+                            VX_PARAMETER_REF,
+                            &ref,
+                            sizeof(vx_reference));
+                        graph_inputs[j++] = ref;
+                        vxReleaseReference(&ref);
+                    }
+                    if (param != NULL)
+                    {
+                        vxReleaseParameter(&param);
+                    }
+                }
+            }
+        }
     }
     else
     {
diff --git a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c
index 1f46c3f..1845bc7 100644
--- a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c
+++ b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c
@@ -195,6 +195,9 @@ static _node_template s_template[] =
     /* GRU */ NULL,
     /* GRUCELL */ NULL,
     /* GRUCELL_ACTIVATION */ NULL,
+    /* CUMSUM */ NULL,
+    /* MAXPOOLWITHARGMAX */ NULL,
+    /* MOD */ NULL,
     };
 //_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c );
diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c
index 3b10e44..10f25ac 100644
--- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c
+++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c
@@ -835,15 +835,15 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam
             if (node->op == VSI_NN_OP_PRE_PROCESS && node->nn_param.pre_process.type != VSI_NN_SOURCE_FORMAT_TENSOR)
             {
-                if(node->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR)
-                {
+                //if(node->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR)
+                //{
                     /* 2 additional input tensors and 4 paramter scalar*/
-                    num_of_graph_real_inputs += 6;
-                }
-                else
-                {
-                    num_of_graph_real_inputs += 4;
-                }
+                //    num_of_graph_real_inputs += 6;
+                //}
+                //else
+                //{
+                    num_of_graph_real_inputs += 4;
+                //}
             }
         }
     }
@@ -885,10 +885,10 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam
                 break;
             }
         }
-        if (!processed && enabled)
+        if (!processed)
         {
             processed_node_id_list[processed_idx++] = node->uid;
-            if (node->op == VSI_NN_OP_PRE_PROCESS)
+            if (enabled)
             {
                 vx_node prenode = NULL;
                 vx_uint32 numParams = 0;
@@ -1028,3 +1028,82 @@ final:
     }
     return status;
 } /* vs_nn_AddBinaryGraphInputsWithCropParam() */
+
+vsi_status vsi_nn_UpdateCropParamsForBinaryGraph
+(
+    vsi_nn_graph_t* graph,
+    uint32_t enabled_crop_input_idx,
+    uint32_t start_x,
+    uint32_t start_y,
+    uint32_t crop_w,
+    uint32_t crop_h,
+    uint32_t dst_w,
+    uint32_t dst_h
+)
+{
+    uint32_t i, j;
+    uint32_t numParams = 0;
+    int32_t scalar_value[4] = {0};
+    vsi_status status = VSI_FAILURE;
+    uint32_t input_idx = enabled_crop_input_idx;
+    scalar_value[0] = (int32_t)((crop_w << 15) / dst_w);
+    scalar_value[1] = (int32_t)((crop_h << 15) / dst_h);
+    scalar_value[2] = start_x; /*rgb start_x*3, rgb start_x*4*/
+    scalar_value[3] = start_y;
+
+    for (i = 0; i < graph->node_num; i++)
+    {
+        vsi_nn_node_t* node = vsi_nn_GetNode(graph, i);
+        if (node->op == VSI_NN_OP_NBG)
+        {
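
(Aside on the scalar packing a few lines up, not part of the patch.) scalar_value[0] and scalar_value[1] look like crop-to-destination scale ratios encoded in <<15 fixed point, followed by the crop origin; a tiny stand-alone sketch with made-up sizes:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* Hypothetical crop: a 640x360 window resized to a 320x180 network input. */
    uint32_t crop_w = 640, crop_h = 360, dst_w = 320, dst_h = 180;

    /* Same packing as scalar_value[0..1] above: ratio * 2^15. */
    int32_t x_ratio = (int32_t)((crop_w << 15) / dst_w);
    int32_t y_ratio = (int32_t)((crop_h << 15) / dst_h);

    printf("%d %d\n", x_ratio, y_ratio);   /* 65536 65536, i.e. 2.0 in <<15 */
    return 0;
}
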
+ vx_parameter param = 0; + vx_enum type = 0; + vx_reference ref = 0; + uint32_t scalar_idx = 0; + uint32_t scalar_value_idx = 0; + int32_t temp_value = 0; + status = vxQueryNode(node->n, VX_NODE_PARAMETERS, &numParams, sizeof(numParams)); + for (j = 0; j < numParams; j++) + { + + param = vxGetParameterByIndex(node->n, j); + status = vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + if (type == VX_TYPE_SCALAR) + { + scalar_idx = j; + break; + } + } + while (input_idx > 0) + { + uint32_t tensor_idx = scalar_idx + 4; + for (j = tensor_idx; j < numParams; j++) + { + param = vxGetParameterByIndex(node->n, j); + status = vxQueryParameter( + param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + if (type == VX_TYPE_SCALAR) + { + scalar_idx = j; + break; + } + } + input_idx--; + } + for (j = scalar_idx; j < scalar_idx + 4; j++) + { + temp_value = scalar_value[scalar_value_idx++]; + param = vxGetParameterByIndex(node->n, j); + status = vxQueryParameter(param, VX_PARAMETER_TYPE, &type, sizeof(vx_enum)); + if (type == VX_TYPE_SCALAR) + { + status = vxQueryParameter(param, VX_PARAMETER_REF, &ref, sizeof(vx_reference)); + status = vxWriteScalarValue((vx_scalar)ref, &temp_value); + status = vxSetParameterByIndex(node->n, j, ref); + } + } + + } + } + return status; +} /* vsi_nn_UpdateCropParamsForBinaryGraph() */ diff --git a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c index 8fa073c..b3f8800 100644 --- a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c +++ b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c @@ -60,7 +60,7 @@ vsi_bool vsi_nn_rnn_find_best_kernel_size /* try NxN */ if( !multi_batch ) { - #if( !defined( _WIN32 ) ) + #if( !(defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32)) ) /* try NxN conv */ kernel_h = 8; while( input_size % (kernel_h * kernel_h) != 0 ) @@ -958,12 +958,16 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_permute { vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_tensor_t* tensor0 = NULL; - uint32_t* permute_in_perm = NULL; + uint32_t i = 0, * permute_in_perm = NULL; curr = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0); permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(curr, dim_num * sizeof(uint32_t)); - memcpy(permute_in_perm, perm, dim_num * sizeof(uint32_t)); + + for (i = 0; i < dim_num; i++) + { + permute_in_perm[i] = (uint32_t)perm[i]; + } curr->node->nn_param.permute.perm = permute_in_perm; curr->node->nn_param.permute.dim_num = (uint32_t)dim_num; curr->inputs[0] = input_tensor; diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index 18c964e..54236c0 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -428,7 +428,7 @@ static vsi_bool _init_tensor if ( TRUE == tensor->attr.is_dummy ) { tensor->t = vxCreateDummyTensor( graph->ctx->c, - (vsi_size_t)tensor->attr.dim_num, tensor->attr.size, (vsi_enum)tensor->attr.dtype.vx_type ); + (vsi_size_t)tensor->attr.dim_num, size_vxsize, (vsi_enum)tensor->attr.dtype.vx_type ); } else #endif if( TRUE == tensor->attr.is_created_from_handle )
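
(Aside on the vsi_nn_rnn_create_permute hunk above, not part of the patch.) Replacing the memcpy with an element-wise cast matters when the incoming perm array uses a wider element type than the uint32_t buffer being filled, for example a 64-bit vsi_size_t build; the actual type of the perm argument is not visible in this hunk, so the sketch below just assumes it:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
    /* Assume perm arrives as 64-bit values while the node expects uint32_t. */
    uint64_t perm[4] = { 0, 2, 1, 3 };
    uint32_t by_memcpy[4] = { 0 };
    uint32_t by_cast[4] = { 0 };
    int i;

    /* Copying dim_num * sizeof(uint32_t) bytes grabs only the first halves:
       on a little-endian host this yields 0 0 2 0. */
    memcpy(by_memcpy, perm, 4 * sizeof(uint32_t));

    /* The element-wise cast used by the patch keeps 0 2 1 3. */
    for (i = 0; i < 4; i++)
    {
        by_cast[i] = (uint32_t)perm[i];
    }

    printf("memcpy: %u %u %u %u  cast: %u %u %u %u\n",
        by_memcpy[0], by_memcpy[1], by_memcpy[2], by_memcpy[3],
        by_cast[0], by_cast[1], by_cast[2], by_cast[3]);
    return 0;
}
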