Update internal for 22Q2 release (#432)
* Update internal for 22Q2 release

  Update to internal commit-id: e96103281b08404cabb9b65306587627cfa3cb93

  Signed-off-by: yuenan.li <yuenan.li@verisilicon.com>

* Update prebuilt for 22Q2 release

  Signed-off-by: yuenan.li <yuenan.li@verisilicon.com>

Co-authored-by: yuenan.li <yuenan.li@verisilicon.com>
This commit is contained in:
parent
9f331ed5ec
commit
7d88a668e3
@@ -336,3 +336,4 @@ ASALocalRun/
# IDE
.settings/
build/
*_build/
@@ -1 +1 @@
-REL/6.4.10.2
+6.4.11
@@ -499,6 +499,8 @@ enum vx_kernel_e {
    VX_KERNEL_NN_DECONV_3D_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x31,
    VX_KERNEL_STREAM_PROCESSOR = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x32,
    VX_KERNEL_MAX_1_2, /*!< \internal Used for VX1.2 bounds checking in the conformance test. */
};
@@ -196,4 +196,45 @@ VX_DECONV_3D_API_SUPPORT is used to declare that vsi openvx driver can support d
#define VX_TENSOR_STRIDE_X_BITS_SUPPORT 1
#endif

/*
VX_REMOVE_RESHAPE_SUPPORT is used to declare whether graph optimization supports removing reshape ops; if it does, ovxlib does not need to remove reshape itself.
0: not support
1: support
*/
/*
#ifndef VX_REMOVE_RESHAPE_SUPPORT
#define VX_REMOVE_RESHAPE_SUPPORT 0
#endif
*/

/*
VX_STREAM_PROCESSOR_SUPPORT is used to declare that the vsi openvx driver supports the vxStreamProcessorNode API.
[value]
0: not support
1: support
*/
#ifndef VX_STREAM_PROCESSOR_SUPPORT
#define VX_STREAM_PROCESSOR_SUPPORT 0
#endif

/*
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL is used to declare that a tensor can be connected to a fixed DMA channel.
[value]
0: not support
1: support
*/
#ifndef VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL
#define VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL 1
#endif

/*
VX_SCALE_EXTRA_PARAMETER_SUPPORT is used to declare that RESIZE supports the align_corners and half_pixel_centers parameters.
[value]
0: not support
1: support
*/
#ifndef VX_SCALE_EXTRA_PARAMETER_SUPPORT
#define VX_SCALE_EXTRA_PARAMETER_SUPPORT 1
#endif

#endif /* __VX_KHR_COMPATIBLE_H__ */
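These compatibility macros are plain compile-time switches, so client code can gate features with the preprocessor. A minimal sketch, assuming the header is installed as VX/vx_khr_compatible.h (the function and its fallback are illustrative only):

#include <VX/vx.h>
#include <VX/vx_khr_compatible.h> /* assumed install path */

vx_status configure_resize(void)
{
#if VX_SCALE_EXTRA_PARAMETER_SUPPORT
    /* Driver honors align_corners / half_pixel_centers; the extended
     * scale parameter struct can be used directly. */
    return VX_SUCCESS;
#else
    /* Emulate the extra resize parameters on the application side. */
    return VX_ERROR_NOT_SUPPORTED;
#endif
}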
@@ -57,6 +57,12 @@ enum vx_graph_attribute_internal_type_e
    VX_GRAPH_AXI_SRAM_PRE_LOAD = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x2,
    /*! \brief Queries a graph for its running priority (read-write). Use a <tt>\ref vx_uint32</tt> parameter. */
    VX_GRAPH_PRIORITY_VALUE_VIV = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x3,
    VX_GRAPH_PSI_EXTRATOR_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x4,
    VX_GRAPH_PSI_FILLER_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x5,
    VX_GRAPH_DENOISE_POSTPROCESS_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x6,
    VX_GRAPH_DATA_COMPRESSION_RATIO = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x7,
    VX_GRAPH_ISP_EMULATION_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x8,
    VX_GRAPH_PROCESS_FPS = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x9,
};

/*! \brief Size Alignment of User Memory
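VX_GRAPH_PRIORITY_VALUE_VIV above is documented as a read-write vx_uint32, so it should be settable through the standard vxSetGraphAttribute entry point; a minimal sketch (the priority value is an arbitrary example):

#include <VX/vx.h>

/* Set a Vivante-specific running priority on a graph. */
vx_status set_graph_priority(vx_graph graph)
{
    vx_uint32 priority = 2; /* arbitrary example value */
    return vxSetGraphAttribute(graph, VX_GRAPH_PRIORITY_VALUE_VIV,
                               &priority, sizeof(priority));
}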
@@ -209,7 +215,8 @@ enum vx_nn_activation_function_e
    VX_NN_ACTIVATION_LEAKYRELU_MAX_POOLING = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x4,
    VX_NN_ACTIVATION_SWISH = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x5,
    VX_NN_ACTIVATION_HSWISH = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x6,
-   VX_NN_ACTIVATION_NONE = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x7,
+   VX_NN_ACTIVATION_CUSTOM = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x7,
+   VX_NN_ACTIVATION_NONE = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x8,
};

/*! \brief The Convolutional network type

@@ -285,6 +292,59 @@ enum vx_tensor_rank_type_e
    VX_TENSOR_RANK_SN,
};

/*! \brief The attribute of tensor.
 * \ingroup group_tensor
 * \version 0.4
 */
enum vx_tensor_priority_e
{
    /*! \brief no special requirement */
    VX_TENSOR_DEFAULT = 0,

    /*! \brief 2nd input (reference) */
    /*VX_TENSOR_2ND_INPUT_FOR = 1,*/
    VX_TENSOR_FOR_GRAPH_REFERENCE = 1,
};


/*! \brief The attribute of tensor memory.
 * \ingroup group_tensor
 * \version 0.4
 */
enum vx_tensor_memory_attribute_e
{
    /*! \brief no special requirement */
    VX_TENSOR_MEMORY_DEFAULT = 0,

    VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_0 = (0x1 << 0),
    VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_1 = (0x1 << 1),
    VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_2 = (0x1 << 2),
    VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_3 = (0x1 << 3),
    VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_4 = (0x1 << 4),
    /*
    VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_5 = (0x1 << VX_DMA5_IN_ISP_OCM_PSI),
    VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_6 = (0x1 << VX_DMA6_DDR_DECOMPRESS),
    VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_7 = (0x1 << VX_DMA7_POSTOUT_OCM_ISP),
    VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_8 = (0x1 << VX_DMA8_COMPRESS_DDR),
    VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_9 = (0x1 << VX_DMA9_ISP_PATTERN_GENERATOR),
    VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_10 = (0x1 << VX_DMA10_ISP_CHECKSUM_GENERATOR),
    */
    /*! \brief DMA transfers data to VIP and enables the circular buffer */
#if !VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL
    VX_TENSOR_MEMORY_ENABLE_CIRCULAR_BY_DMA = 0xFFFFFFFF,
#endif
};

enum vx_dma_extrator_pad_mode_e
{
    /*! \brief no special requirement */
    VX_DMA_EXTRATOR_PAD_CONST = 0,

    /*! \brief DMA extractor pads with the nearest edge */
    VX_DMA_EXTRATOR_PAD_WITH_NEAREAST_EDGE = 1,
};


/*! \brief The precision of tensor.
 * \ingroup group_tensor
 * \version 0.4
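The DMA-channel enumerators are single-bit flags, so several channels can be OR-ed into one value and applied through the VX_TENSOR_MEMORY_ATTRIBUTE tensor attribute added later in this commit. A sketch, assuming the attribute takes a vx_uint32 bitmask:

#include <VX/vx.h>

/* Pin a tensor's DMA transfers to channels 0 and 2 (illustrative choice). */
vx_status connect_tensor_dma(vx_tensor tensor)
{
    vx_uint32 channels = VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_0 |
                         VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_2;
    return vxSetTensorAttribute(tensor, VX_TENSOR_MEMORY_ATTRIBUTE,
                                &channels, sizeof(channels));
}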
@@ -601,6 +661,19 @@ VX_API_ENTRY vx_tensor VX_API_CALL vxReshapeTensor(vx_tensor tensor, vx_int32* n
 */
VX_API_ENTRY vx_status VX_API_CALL vxSetTensorAttribute(vx_tensor tensor, vx_enum attribute, const void *ptr, vx_size size);

/*! \brief Creates an opaque reference to a tensor data buffer.
 * \details The tensor is a dummy tensor that does not allocate any memory, and it cannot be reshaped or viewed.
 * Not guaranteed to exist until the <tt>vx_graph</tt> containing it has been verified.
 * \param [in] context The reference to the implementation context.
 * \param [in] number_of_dims The number of dimensions.
 * \param [in] dims Dimensions sizes in elements.
 * \param [in] data_format The <tt>\ref vx_type_e</tt> that represents the data format of the tensor data elements.
 * \return A tensor data reference or zero when an error is encountered.
 * \ingroup group_tensor
 * \version 0.3
 */
VX_API_ENTRY vx_tensor VX_API_CALL vxCreateDummyTensor(vx_context context, vx_size number_of_dims, const vx_size *dims, vx_enum data_format);


/*! \brief The type enumeration lists all NN extension types.
 * \ingroup group_cnn
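A minimal sketch of the new dummy-tensor API as declared above; the shape and data format are arbitrary examples:

#include <VX/vx.h>

/* Create a 4-D FP16 placeholder that owns no memory of its own. */
vx_tensor make_placeholder(vx_context context)
{
    vx_size dims[4] = { 224, 224, 3, 1 }; /* arbitrary example shape */
    return vxCreateDummyTensor(context, 4, dims, VX_TYPE_FLOAT16);
}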
@@ -1317,6 +1390,13 @@ typedef struct _vx_nn_scale_params_t
    vx_enum type; /*!< \brief The interpolation type; only VX_INTERPOLATION_BILINEAR is supported. */
} vx_nn_scale_params_t, * vx_nn_scale_params;

typedef struct _vx_nn_scale_params_ext_t
{
    vx_nn_scale_params_t base;
    vx_bool align_corners;
    vx_bool half_pixel_centers;
} vx_nn_scale_params_ext_t, * vx_nn_scale_params_ext;

/*! \brief [Graph] Creates a scale Layer Node.
 * \param [in] graph The reference to the parent graph.
 * \param [in] input The input tensor data to scale.
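A minimal sketch of filling the extended scale parameters; these mirror the TensorFlow-style resize flags, and only VX_INTERPOLATION_BILINEAR is supported per the comment above:

#include <VX/vx.h>
#include <string.h>

/* Bilinear resize with align_corners = true, half_pixel_centers = false. */
void init_scale_params(vx_nn_scale_params_ext_t *p)
{
    memset(p, 0, sizeof(*p));
    p->base.type = VX_INTERPOLATION_BILINEAR;
    p->align_corners = vx_true_e;
    p->half_pixel_centers = vx_false_e;
}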
@@ -2054,8 +2134,15 @@ typedef struct _vx_hardware_caps_params_ext_t
    vx_hardware_caps_params_t base;
    vx_uint32 subGroupSize; /*!< \brief shader sub-group size. */
    vx_bool supportVA40; /*!< \brief support 40-bit virtual address. */
    vx_uint32 supportStreamProcessor; /*!< \brief support stream processor. */
} vx_hardware_caps_params_ext_t;

typedef struct _vx_hardware_caps_params_ext2_t
{
    vx_hardware_caps_params_ext_t base;
    vx_uint32 streamProcessorExecCount; /*!< \brief stream processor execution count. */
} vx_hardware_caps_params_ext2_t;

/*! \brief Queries hardware caps information.
 * \param [in] context The reference to the context.
 * \param [in] hardware_caps_params <tt>\ref vx_hardware_caps_params_t</tt>.
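A sketch of querying the extended caps. The query entry point itself is truncated out of this hunk, so the name vxQueryHardwareCaps below is an assumption inferred from the doc comment, not confirmed by the diff:

#include <VX/vx.h>
#include <string.h>

/* Report whether the driver exposes a stream processor.
 * NOTE: vxQueryHardwareCaps is assumed; its declaration is not shown above. */
vx_bool has_stream_processor(vx_context context)
{
    vx_hardware_caps_params_ext2_t caps;
    memset(&caps, 0, sizeof(caps));
    if (vxQueryHardwareCaps(context, (vx_hardware_caps_params_t *)&caps,
                            sizeof(caps)) != VX_SUCCESS)
    {
        return vx_false_e;
    }
    return caps.base.supportStreamProcessor ? vx_true_e : vx_false_e;
}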
@@ -219,6 +219,15 @@ typedef struct _vx_nn_convolution_relu_pooling_params_ext4_t
    vx_bool enable_nn_tensor_add_relu; /*!< \brief Enable Relu function after tensor add. */
} vx_nn_convolution_relu_pooling_params_ext4_t, * vx_nn_convolution_relu_pooling_params_ext4;

typedef struct _vx_nn_convolution_relu_pooling_params_ext5_t
{
    vx_nn_convolution_relu_pooling_params_ext4_t ext4; /*!< \brief convolution relu pooling params <tt>\ref vx_nn_convolution_relu_pooling_params_ext_t</tt> */

    vx_object_array inputs_list;
    vx_object_array outputs_list;
    vx_spinst spinst_obj;
} vx_nn_convolution_relu_pooling_params_ext5_t, * vx_nn_convolution_relu_pooling_params_ext5;

/*! \brief [Graph] Creates a Convolutional Network Convolution and Activation (Relu) and Pooling Layer Node; this function matches the Khronos NN Extension 1.2 version.
 * \details This function implements a Convolutional Network Convolution and Activation (Relu) and Pooling layer.
 * For fixed-point data types, a fixed point calculation is performed with round and saturate according to the number of accumulator bits. The number of accumulator bits is implementation defined,

@@ -963,6 +963,40 @@ VX_API_ENTRY vx_node VX_API_CALL vxBatchGemmNode(vx_graph graph,
    vx_scalar trans_c,
    vx_tensor output);

typedef struct _vx_lut_params_s
{
    vx_enum lut_function; /*!< \brief Set VX_NN_ACTIVATION_NONE to disable the lut table, VX_NN_ACTIVATION_CUSTOM to customize the lut table, or another value to use a fixed lut table */
    vx_float32 float_values[4]; /*!< \brief Float parameters of the fixed lut table */
    vx_uint32 fvalues_count; /*!< \brief Count of float_values */
    vx_int32 int_values[4]; /*!< \brief Int parameters of the fixed lut table */
    vx_uint32 ivalues_count; /*!< \brief Count of int_values */
    vx_lut in_lut; /*!< \brief Only valid when lut_function is VX_NN_ACTIVATION_CUSTOM */
    vx_lut out_lut; /*!< \brief Only valid when lut_function is VX_NN_ACTIVATION_CUSTOM */
} vx_lut_params_s, * vx_lut_params;

/*! \brief Create a stream processor node.
 * \param [in] graph The reference to the graph.
 * \param [in] input_list The input tensor list.
 * \param [in] input_count The input tensor count.
 * \param [in] output_list The output tensor list.
 * \param [in] output_count The output tensor count.
 * \param [in] spinst_obj The stream processor instruction object. Use vxCreateSPINST() to create one.
 * \param [in] lut_params The lut parameters. Refer to vx_lut_params_s.
 * \return <tt>\ref vx_node</tt>.
 * \retval vx_node A node reference. Any possible errors preventing a successful creation
 * should be checked using <tt>\ref vxGetStatus</tt>.
 * \ingroup group_vision_function_sp
 */
VX_API_ENTRY vx_node VX_API_CALL vxStreamProcessorNode(
    vx_graph graph,
    vx_tensor* input_list,
    vx_uint32 input_count,
    vx_tensor* output_list,
    vx_uint32 output_count,
    vx_spinst spinst_obj,
    vx_lut_params lut_params
);

#ifdef __cplusplus
}
#endif
@@ -0,0 +1,332 @@
/****************************************************************************
*
* Copyright 2017 - 2021 Vivante Corporation, Santa Clara, California.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* 'Software'), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject
* to the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VIVANTE AND/OR ITS SUPPLIERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#ifndef _VX_SPINST_H_
#define _VX_SPINST_H_

#ifdef __cplusplus
extern "C" {
#endif

typedef enum _vx_sp_inst_type_e
{
    VX_SP_INST_TYPE_FADD,
    VX_SP_INST_TYPE_FMULT,
    VX_SP_INST_TYPE_MOVE,
    VX_SP_INST_TYPE_PWL,

    VX_SP_INST_TYPE_COUNT,
}
vx_sp_inst_type_e;

typedef enum _vx_sp_inst_type_fadd_e
{
    VX_SP_INST_TYPE_FADD_IDLE, // FADD-IDLE
    VX_SP_INST_TYPE_FADD_ADD,  // dst = src0 + src1
    VX_SP_INST_TYPE_FADD_SUB,  // dst = src0 - src1

    VX_SP_INST_TYPE_FADD_COUNT,
}
vx_sp_inst_type_fadd_e;

typedef enum _vx_sp_inst_type_fmult_e
{
    VX_SP_INST_TYPE_FMULT_IDLE,      /* FMULT-IDLE */
    VX_SP_INST_TYPE_FMULT_MUL,       /* dst = src0 * src1 */
    VX_SP_INST_TYPE_FMULT_MUL_CLAMP, /* dst = clamp (src0, src1, R6, R7) */

    VX_SP_INST_TYPE_FMULT_COUNT,
}
vx_sp_inst_type_fmult_e;

typedef enum _vx_sp_inst_type_move_e
{
    VX_SP_INST_TYPE_MOVE_IDLE,
    VX_SP_INST_TYPE_MOVE_MOVE, // dst = src1
    VX_SP_INST_TYPE_MOVE_SEL0, // dst = (src0 > 0) ? src1[0] : src1[1]
    VX_SP_INST_TYPE_MOVE_SEL1, // dst = (src0 > 0) ? src1 : FA-src0 // use FA's SRC0
    VX_SP_INST_TYPE_MOVE_IMMD, // dst = constant assigned immediate
    VX_SP_INST_TYPE_MOVE_ABS,  // dst = abs(src1)

    VX_SP_INST_TYPE_MOVE_COUNT,
}
vx_sp_inst_type_move_e;

typedef enum _vx_sp_inst_type_pwl_e
{
    VX_SP_INST_TYPE_PWL_IDLE,
    VX_SP_INST_TYPE_PWL_SETUP_0, /* PWL ID = 0 */
    VX_SP_INST_TYPE_PWL_SETUP_1, /* Sigmoid() */
    VX_SP_INST_TYPE_PWL_SETUP_2, /* Tanh() */

    VX_SP_INST_TYPE_PWL_COUNT,
}
vx_sp_inst_type_pwl_e;

typedef enum _vx_sp_inst_src_dst_e
{
    VX_SP_INST_SPINOUT,
    VX_SP_INST_SR1,
    VX_SP_INST_SR2,
    VX_SP_INST_SR3,
    VX_SP_INST_SR4,
    VX_SP_INST_SR5,
    VX_SP_INST_SR6, /* nn_clamp_min */
    VX_SP_INST_SR7, /* nn_clamp_max */
    VX_SP_INST_SR8,
    VX_SP_INST_SR9,
    VX_SP_INST_SR10,
    VX_SP_INST_VR11,
    VX_SP_INST_VR12,
    VX_SP_INST_VR13,
    VX_SP_INST_VR14,
    VX_SP_INST_SETUPOUT, /* Input of PWL Mult and Add: FMInA, FMInB, FAInA, FAInB */
}
vx_sp_inst_src_dst_e;

typedef struct _vx_spinst_unit_param
{
    vx_enum op; /* vx_sp_inst_type_e */

    struct
    {
        vx_enum op; /* vx_sp_inst_type_fadd/fmult/move/pwl_e */

        struct
        {
            vx_uint8 src0; /* vx_sp_inst_src_dst_e */
            vx_uint8 src1; /* vx_sp_inst_src_dst_e */
            vx_uint8 dst; /* vx_sp_inst_src_dst_e */
            vx_float32 constant;
        } var;

    } sub;

}
vx_spinst_unit_param;

/**********************************************************************************************/

typedef enum _vx_sp_attribute_e
{
    VX_SP_ATTRIBUTE_NONE,

    VX_SP_ATTRIBUTE_INPUT_TILE_MAPPING,
    VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_X,
    VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_Y,
    VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_Z,

    VX_SP_ATTRIBUTE_PROG_INIT_INSTR_NUM,
    VX_SP_ATTRIBUTE_PROG_LOOP_INSTR_NUM,
    VX_SP_ATTRIBUTE_PROG_COMPLETE_INSTR_NUM,
    VX_SP_ATTRIBUTE_PROG_ROUNDING_MODE,
    VX_SP_ATTRIBUTE_INPUT_SETUP,

    VX_SP_ATTRIBUTE_IGNORED_LEADING_OUTPUTS,
    VX_SP_ATTRIBUTE_FLUSH_CYCLE_NUM,
    VX_SP_ATTRIBUTE_IGNORED_LEADING_V11_WR,
    VX_SP_ATTRIBUTE_IGNORED_LEADING_V12_WR,
    VX_SP_ATTRIBUTE_IGNORED_LEADING_V11_RD,
    VX_SP_ATTRIBUTE_IGNORED_LEADING_V12_RD,

    VX_SP_ATTRIBUTE_CH0_POST_REDISTRIBUTE,
    VX_SP_ATTRIBUTE_CH1_POST_REDISTRIBUTE,
    VX_SP_ATTRIBUTE_V11_RESET_AT_START,
    VX_SP_ATTRIBUTE_V12_RESET_AT_START,
    VX_SP_ATTRIBUTE_V11_POP_CONFIG,
    VX_SP_ATTRIBUTE_V12_POP_CONFIG,
    VX_SP_ATTRIBUTE_ACCELERATOR_INPUT_SELECT,
    VX_SP_ATTRIBUTE_IGNORED_LEADING_ACC_OUT,
    VX_SP_ATTRIBUTE_SUM_ENGINE_RESET,
    VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL,
    VX_SP_ATTRIBUTE_SUM_ENGINE_NUM_CH_MINUS_ONE,
    VX_SP_ATTRIBUTE_SUM_ENGINE_2D_ACCUM_STORAGE,

    VX_SP_ATTRIBUTE_GENERAL_COUNT,

    VX_SP_ATTRIBUTE_CONST0, /* NN post multiplier */
    VX_SP_ATTRIBUTE_CONST1, /* NN neg pos multiplier */
    VX_SP_ATTRIBUTE_CONST2, /* NN tensor add const */
    VX_SP_ATTRIBUTE_CONST3, /* NN clamp max */
    VX_SP_ATTRIBUTE_CONST4, /* NN clamp min */

    VX_SP_ATTRIBUTE_TOTAL_COUNT,
}
vx_sp_attribute_e;

typedef enum _vx_sp_attribute_input_tile_mapping_e
{
    VX_SP_ATTRIBUTE_INPUT_TILE_MAPPING_XYMERGE,
    VX_SP_ATTRIBUTE_INPUT_TILE_MAPPING_YZMERGE,
}
vx_sp_attribute_input_tile_mapping_e;

typedef enum _vx_sp_attribute_output_collapse_e
{
    VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_DISABLED,
    VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_ENABLED,
}
vx_sp_attribute_output_collapse_e;

typedef enum _vx_sp_attribute_rounding_mode_e
{
    VX_SP_ATTRIBUTE_PROG_ROUNDING_MODE_RTNE,
    VX_SP_ATTRIBUTE_PROG_ROUNDING_MODE_STICKY,
}
vx_sp_attribute_rounding_mode_e;

typedef enum _vx_sp_attribute_input_setup_e
{
    VX_SP_ATTRIBUTE_INPUT_SETUP_SINGLE_INPUT,
    VX_SP_ATTRIBUTE_INPUT_SETUP_INTERLEAVE_TWO_INPUTS,
    VX_SP_ATTRIBUTE_INPUT_SETUP_V11,
    VX_SP_ATTRIBUTE_INPUT_SETUP_V12,
}
vx_sp_attribute_input_setup_e;

typedef enum _vx_sp_attribute_ch_post_redistribute_e
{
    VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_DISABLED,
    VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_SCALAR_GATHER,
    VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_VECTOR_GATHER,
    VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_VECTOR_SCATTER,
}
vx_sp_attribute_ch_post_redistribute_e;

typedef enum _vx_sp_attribute_v_reset_at_start_e
{
    VX_SP_ATTRIBUTE_V_RESET_AT_START_NONE,
    VX_SP_ATTRIBUTE_V_RESET_AT_START_RESET,
}
vx_sp_attribute_v_reset_at_start_e;

typedef enum _vx_sp_attribute_v_pop_config_e
{
    VX_SP_ATTRIBUTE_V_POP_CONFIG_EVERY_READ,
    VX_SP_ATTRIBUTE_V_POP_CONFIG_EVERY_ROW,
}
vx_sp_attribute_v_pop_config_e;

typedef enum _vx_sp_attribute_accelerator_input_select_e
{
    VX_SP_ATTRIBUTE_ACCELERATOR_INPUT_SELECT_FROM_OUTPUT,
    VX_SP_ATTRIBUTE_ACCELERATOR_INPUT_SELECT_FROM_ACCLERATOR,
}
vx_sp_attribute_accelerator_input_select_e;

typedef enum _vx_sp_attribute_sum_engine_reset_e
{
    VX_SP_ATTRIBUTE_SUM_ENGINE_RESET_NONE,
    VX_SP_ATTRIBUTE_SUM_ENGINE_RESET_RESET,
}
vx_sp_attribute_sum_engine_reset_e;

typedef enum _vx_sp_attribute_sum_engine_control_e
{
    VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL_ACCUM_INTERNAL,
    VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL_ACCUM_1D,
    VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL_ACCUM_2D,
}
vx_sp_attribute_sum_engine_control_e;

typedef enum _vx_sp_attribute_sum_engine_num_ch_minus_one_e
{
    VX_SP_ATTRIBUTE_SUM_ENGINE_NUM_CH_MINUS_ONE_ONE_CH,
    VX_SP_ATTRIBUTE_SUM_ENGINE_NUM_CH_MINUS_ONE_TWO_CH,
}
vx_sp_attribute_sum_engine_num_ch_minus_one_e;

typedef enum _vx_sp_attribute_sum_engine_2d_accum_storage_e
{
    VX_SP_ATTRIBUTE_SUM_ENGINE_2D_ACCUM_STORAGE_SAME,
    VX_SP_ATTRIBUTE_SUM_ENGINE_2D_ACCUM_STORAGE_DIFFERENT,
}
vx_sp_attribute_sum_engine_2d_accum_storage_e;

/**********************************************************************************************/

/*! \brief Creates an opaque reference to a spinst object.
 * \param [in] context The reference to the implementation context.
 * \return A spinst data reference.
 * Any possible errors preventing a successful creation should be checked using <tt>\ref vxGetStatus</tt>.
 * \ingroup group_object_spinst
 */
VX_API_ENTRY vx_spinst VX_API_CALL vxCreateSPINST(
    vx_context context
);

/*! \brief Releases a reference to a spinst object.
 * The object may not be garbage collected until its total reference count is zero.
 * \param [in] spinst_obj The pointer to the spinst object to release.
 * \post After returning from this function the reference is zeroed.
 * \return A <tt>\ref vx_status_e</tt> enumeration.
 * \retval VX_SUCCESS No errors; all other values indicate failure.
 * \retval * An error occurred. See <tt>\ref vx_status_e</tt>.
 * \ingroup group_object_spinst
 */
VX_API_ENTRY vx_status VX_API_CALL vxReleaseSPINST(
    vx_spinst *spinst_obj
);

/*! \brief Adds an instruction to a spinst object.
 * \param [in] spinst_obj The reference to the spinst object.
 * \param [in] inst_unit_array The units of one instruction. Use a <tt>\ref vx_spinst_unit_param</tt>.
 * \param [in] inst_unit_count The count of instruction units.
 * \return A <tt>\ref vx_status_e</tt> enumeration.
 * \retval VX_SUCCESS No errors.
 * \retval VX_ERROR_INVALID_REFERENCE If spinst_obj is not a <tt>\ref vx_spinst</tt>.
 * \retval VX_ERROR_INVALID_PARAMETERS If any of the parameters is incorrect.
 * \retval VX_ERROR_NO_MEMORY If allocation of internal instruction memory fails.
 * \ingroup group_object_spinst
 */
VX_API_ENTRY vx_status VX_API_CALL vxAddOneInstToSPINST(
    vx_spinst spinst_obj,
    vx_spinst_unit_param* inst_unit_array,
    vx_uint8 inst_unit_count
);

/*! \brief Sets various attributes of a spinst object.
 * \param [in] spinst_obj The reference to the vx_spinst object to set.
 * \param [in] attribute The attribute to set. Use a <tt>\ref vx_sp_attribute_e</tt>.
 * \param [in] value The value of the attribute.
 * \return A <tt>\ref vx_status_e</tt> enumeration.
 * \retval VX_SUCCESS No errors.
 * \retval VX_ERROR_INVALID_REFERENCE If spinst_obj is not a <tt>\ref vx_spinst</tt>.
 * \retval VX_ERROR_INVALID_PARAMETERS If the attribute is incorrect.
 * \ingroup group_object_spinst
 */
VX_API_ENTRY vx_status VX_API_CALL vxSetAttributeToSPINST(
    vx_spinst spinst_obj,
    vx_enum attribute,
    vx_uint32 value
);

#ifdef __cplusplus
}
#endif

#endif
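Read together with vxStreamProcessorNode earlier in this commit, the intended flow appears to be: create a vx_spinst, add instructions, set attributes, then hand it to the node. A minimal sketch under that assumption; the single FADD instruction, register choices, and attribute value are illustrative only, and error checks plus vxReleaseSPINST are omitted for brevity:

#include <string.h>
#include <VX/vx.h>

/* Build a one-instruction stream processor program: SpinOut = SR1 + SR2. */
vx_node build_sp_node(vx_context ctx, vx_graph graph,
                      vx_tensor *in, vx_uint32 in_count,
                      vx_tensor *out, vx_uint32 out_count)
{
    vx_spinst sp = vxCreateSPINST(ctx);
    vx_spinst_unit_param unit;
    vx_lut_params_s lut;

    memset(&unit, 0, sizeof(unit));
    unit.op = VX_SP_INST_TYPE_FADD;
    unit.sub.op = VX_SP_INST_TYPE_FADD_ADD;
    unit.sub.var.src0 = VX_SP_INST_SR1;
    unit.sub.var.src1 = VX_SP_INST_SR2;
    unit.sub.var.dst = VX_SP_INST_SPINOUT;
    vxAddOneInstToSPINST(sp, &unit, 1);

    /* One instruction in the loop body; the value is illustrative. */
    vxSetAttributeToSPINST(sp, VX_SP_ATTRIBUTE_PROG_LOOP_INSTR_NUM, 1);

    memset(&lut, 0, sizeof(lut));
    lut.lut_function = VX_NN_ACTIVATION_NONE; /* disable the LUT */
    return vxStreamProcessorNode(graph, in, in_count, out, out_count, sp, &lut);
}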
@@ -342,6 +342,10 @@ typedef struct _vx_tensorpatch_addressing_t * vx_trensor_addressing;
 */
typedef struct _vx_weights_biases_parameter_s * vx_weights_biases_parameter;

/*! \brief The object for stream processor
 * \ingroup group_spinst
 */
typedef struct _vx_spinst_s * vx_spinst;

/*! \brief A Boolean value.
 * This allows 0 to be FALSE, as it is in C, and any non-zero to be TRUE.

@@ -470,6 +474,7 @@ enum vx_type_e {
    /* \todo add new object types here */
    VX_TYPE_BFLOAT16 = 0x81A, /*!< \brief A <tt>\ref vx_bfloat16</tt>. */

    VX_TYPE_SPINST = 0x81B, /*!< \brief A <tt>\ref vx_spinst</tt>. */
    VX_TYPE_INT4 = 0x81C, /*!< \brief A signed 4-bit tensor. */
    VX_TYPE_UINT4 = 0x81D, /*!< \brief An unsigned 4-bit tensor. */
};

@@ -1021,6 +1026,8 @@ enum vx_node_attribute_e {
    VX_NODE_ATTRIBUTE_CONST_TENSOR_CACHE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x9,

    VX_NODE_ATTRIBUTE_FOR_HW_QUALITY = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0xA,
};

/*! \brief The parameter attributes list

@@ -1290,6 +1297,9 @@ enum vx_tensor_attribute_e
    VX_TENSOR_LIFETIME = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x5,
    /*! \brief the value status of tensor. */
    VX_TENSOR_VALUE = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x6,
    /* XiaoMi project */
    VX_TENSOR_INPUT_FOR_REFERENCE = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x7,
    VX_TENSOR_MEMORY_ATTRIBUTE = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x8,
};

/*! \brief The meta valid rectangle attributes.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1 +0,0 @@
-libOpenVX.so.1.3.0

Binary file not shown.

@@ -1 +0,0 @@
-libOpenVX.so.1.3.0

Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -172,3 +172,10 @@ DEF_OP(PRE_PROCESS_RGB888_PLANAR)
DEF_OP(GATHER_ELEMENTS)
DEF_OP(SELU)
DEF_OP(CELU)
DEF_OP(MAX_POOL3D)
DEF_OP(RCP)
DEF_OP(SIGN)
DEF_OP(SOFTSIGN)
DEF_OP(CUMSUM)
DEF_OP(MAXPOOLWITHARGMAX)
DEF_OP(MOD)
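This op list reads like a classic X-macro table: each consumer defines DEF_OP before including the list and gets one expansion per operator. The actual consumer is not shown in this diff, so the pattern below is only an assumed illustration:

/* Hypothetical consumer (illustrative): expand DEF_OP(X) to VSI_NN_OP_X. */
#define DEF_OP(NAME) VSI_NN_OP_##NAME,
enum example_op_enum
{
    DEF_OP(CUMSUM)
    DEF_OP(MOD)
    /* ... one entry per DEF_OP line in the list above ... */
};
#undef DEF_OP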
@@ -25,10 +25,13 @@
#ifndef _VSI_NN_GPU_CONFIG_H
#define _VSI_NN_GPU_CONFIG_H

-#define GPU_TENSOR_MAX_WIDTH (65536)
+#ifdef VSI_40BIT_VA_SUPPORT
+#define GPU_TENSOR_MAX_WIDTH (1 << 30)
+#else
+#define GPU_TENSOR_MAX_WIDTH (1 << 16)
+#endif
#define GPU_MAX_MULTIPLIER_NUM (65535)
#define GPU_MAX_POST_SHIFT_BITS (31)
#define GPU_TENSOR_DIM_2 (2)

#endif
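The replacement is value-preserving on default builds, since (1 << 16) = 65536 matches the removed literal, while 40-bit VA builds raise the limit to (1 << 30) = 1073741824. An illustrative compile-time check (the negative-array-size trick is a portable C static assert):

/* Illustrative compile-time check of the width limits. */
#ifdef VSI_40BIT_VA_SUPPORT
typedef char gpu_width_expanded[(1 << 30) == 1073741824 ? 1 : -1];
#else
typedef char gpu_width_unchanged[(1 << 16) == 65536 ? 1 : -1];
#endif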
@@ -156,6 +156,8 @@ typedef struct
        vsi_nn_kernel_quant_asymm_t asymm;
        vsi_nn_kernel_quant_asymm_perchannel_t asymm_v;
    };
    float scale;
    int32_t zero_point;
} vsi_nn_kernel_tensor_attr_t;

typedef struct
@@ -411,7 +413,7 @@ vsi_status vsi_nn_kernel_node_pass_param
    size_t num
    );

-static inline void vsi_nn_kernel_node_release
+static VSI_INLINE_API void vsi_nn_kernel_node_release
    (
    vsi_nn_kernel_node_t * node
    )

@@ -422,7 +424,7 @@ static inline void vsi_nn_kernel_node_release
    }
}

-static inline void vsi_nn_kernel_node_pack_io
+static VSI_INLINE_API void vsi_nn_kernel_node_pack_io
    (
    vsi_nn_kernel_node_param_t * params,
    size_t param_num,

@@ -476,7 +478,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector
    );

/** Map data type to gpu internal dtype. */
-static inline vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype
+static VSI_INLINE_API vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype
    (
    vsi_nn_type_e dtype
    )

@@ -516,7 +518,7 @@ static inline vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype
    return I8;
} /* vsi_nn_kernel_map_dtype() */

-static inline vsi_nn_type_e vsi_nn_dtype_map_kernel
+static VSI_INLINE_API vsi_nn_type_e vsi_nn_dtype_map_kernel
    (
    vsi_nn_kernel_dtype_e dtype
    )

@@ -556,7 +558,7 @@ static inline vsi_nn_type_e vsi_nn_dtype_map_kernel
    return VSI_NN_TYPE_INT8;
} /* vsi_nn_kernel_map_dtype() */

-static inline size_t vsi_nn_kernel_dtype_get_bytes
+static VSI_INLINE_API size_t vsi_nn_kernel_dtype_get_bytes
    (
    vsi_nn_kernel_dtype_e dtype
    )

@@ -585,7 +587,7 @@ static inline size_t vsi_nn_kernel_dtype_get_bytes
    return 0;
} /* vsi_nn_kernel_dtype_get_bytes() */

-static inline vsi_size_t vsi_nn_kernel_dtype_get_bits
+static VSI_INLINE_API vsi_size_t vsi_nn_kernel_dtype_get_bits
    (
    vsi_nn_kernel_dtype_e dtype
    )

@@ -617,7 +619,7 @@ static inline vsi_size_t vsi_nn_kernel_dtype_get_bits
    return 0;
} /* vsi_nn_kernel_dtype_get_bits() */

-static inline vsi_nn_kernel_quant_type_e vsi_nn_kernel_map_quant_type
+static VSI_INLINE_API vsi_nn_kernel_quant_type_e vsi_nn_kernel_map_quant_type
    ( vsi_nn_qnt_type_e quant_type )
{
    switch( quant_type )

@@ -658,7 +660,7 @@ vsi_nn_kernel_scalar_t vsi_nn_kernel_scalar_create
    const void * data
    );

-static inline void vsi_nn_kernel_scalar_release
+static VSI_INLINE_API void vsi_nn_kernel_scalar_release
    ( vsi_nn_kernel_scalar_t * scalar )
{
    if( scalar && *scalar )

@@ -803,7 +805,7 @@ vsi_status vsi_nn_kernel_tensor_write
    size_t size
    );

-static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_size
+static VSI_INLINE_API vsi_size_t vsi_nn_kernel_tensor_attr_get_size
    ( const vsi_nn_kernel_tensor_attr_t * attr )
{
    if( !attr )

@@ -813,7 +815,7 @@ static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_size
    return vsi_nn_shape_get_size( attr->shape->data, (vsi_size_t)attr->shape->size );
} /* vsi_nn_kernel_tensor_attr_get_size() */

-static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_bytes
+static VSI_INLINE_API vsi_size_t vsi_nn_kernel_tensor_attr_get_bytes
    ( const vsi_nn_kernel_tensor_attr_t * attr )
{
    vsi_size_t i = 0;

@@ -851,7 +853,7 @@ static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_bytes
    return bytes;
} /* vsi_nn_kernel_tensor_attr_get_bytes() */

-static inline void vsi_nn_kernel_tensor_attr_get_stride
+static VSI_INLINE_API void vsi_nn_kernel_tensor_attr_get_stride
    ( const vsi_nn_kernel_tensor_attr_t * attr, vsi_size_t * out_stride)
{
    vsi_size_t type_bits;

@@ -902,7 +904,7 @@ static inline void vsi_nn_kernel_tensor_attr_get_stride
    }
} /* vsi_nn_kernel_tensor_attr_get_size() */

-static inline vsi_bool vsi_nn_kernel_tensor_attr_is_quantized
+static VSI_INLINE_API vsi_bool vsi_nn_kernel_tensor_attr_is_quantized
    ( const vsi_nn_kernel_tensor_attr_t * attr )
{
    return ( attr && attr->quant > VSI_NN_KERNEL_QUANT_NONE

@@ -1072,7 +1074,7 @@ OVXLIB_API vsi_status vsi_nn_KernelGpuConfig
    const gpu_param_t * gpu_param
    );

-static inline const char* vsi_nn_kernel_type_str
+static VSI_INLINE_API const char* vsi_nn_kernel_type_str
    (
    vsi_nn_kernel_type_e type
    )

@@ -1095,7 +1097,7 @@ static inline const char* vsi_nn_kernel_type_str
    return "None";
} /* vsi_nn_kernel_type_str() */

-static inline vsi_status vsi_nn_kernel_unpack_4bit_data
+static VSI_INLINE_API vsi_status vsi_nn_kernel_unpack_4bit_data
    (
    const vsi_nn_kernel_tensor_attr_t * attr,
    uint8_t * src,

@@ -1162,7 +1164,7 @@ static inline vsi_status vsi_nn_kernel_unpack_4bit_data
    return status;
}

-static inline vsi_status vsi_nn_kernel_pack_4bit_data
+static VSI_INLINE_API vsi_status vsi_nn_kernel_pack_4bit_data
    (
    const vsi_nn_kernel_tensor_attr_t * attr,
    uint8_t * src,

@@ -46,6 +46,8 @@ typedef int32_t vsi_nn_kernel_lut_act_e; enum
    VSI_NN_KERNEL_LUT_CLIP = 12,
    VSI_NN_KERNEL_LUT_SQUARE = 13,
    VSI_NN_KERNEL_LUT_CELU = 14,
    VSI_NN_KERNEL_LUT_RCP = 15,
    VSI_NN_KERNEL_LUT_SOFTSIGN = 16,
};

#define VSI_NN_KERNEL_LUT_MAX_SIZE (1024)
@@ -30,11 +30,20 @@
extern "C" {
#endif

typedef struct _vsi_nn_crop_lcl_data
{
    vx_int32 begin_dims[VSI_NN_MAX_DIM_NUM];
    vx_int32 end_dims[VSI_NN_MAX_DIM_NUM];
    vx_int32 stride_dims[VSI_NN_MAX_DIM_NUM];
} vsi_nn_crop_lcl_data;

typedef struct _vsi_nn_crop_param
{
    int32_t axis;
    uint32_t dims;
    uint32_t offset[VSI_NN_MAX_DIM_NUM];

    vsi_nn_crop_lcl_data *lcl_data;
} vsi_nn_crop_param;

#ifdef __cplusplus

@@ -0,0 +1,45 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CUMSUM_H
#define _VSI_NN_OP_CUMSUM_H

#include "vsi_nn_types.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct _vsi_nn_cumsum_param
{
    int32_t axis;
    vsi_bool exclusive;
    vsi_bool reverse;
} vsi_nn_cumsum_param;

#ifdef __cplusplus
}
#endif

#endif
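A sketch of configuring the new cumsum op from ovxlib; vsi_nn_AddNode and the nn_param union member are assumed from the surrounding ovxlib sources rather than shown in this hunk:

#include "vsi_nn_pub.h"

/* Add a cumulative-sum node along axis 0, inclusive, forward order. */
vsi_nn_node_t *add_cumsum(vsi_nn_graph_t *graph)
{
    vsi_nn_node_t *node = vsi_nn_AddNode(graph, VSI_NN_OP_CUMSUM,
                                         1 /* inputs */, 1 /* outputs */, NULL);
    if (node)
    {
        node->nn_param.cumsum.axis = 0;
        node->nn_param.cumsum.exclusive = FALSE;
        node->nn_param.cumsum.reverse = FALSE;
    }
    return node;
}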
@@ -0,0 +1,55 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#ifndef _VSI_NN_OP_MAX_POOL3D_H
#define _VSI_NN_OP_MAX_POOL3D_H

#include "vsi_nn_types.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct _vsi_nn_max_pool3d_param
{
    struct _max_pool3d_local_data_t* local;
    // Add parameters here

    /* round_type is used to calculate the output shape */
    vsi_nn_round_type_e round_type;
    uint32_t ksize[3];
    uint32_t stride[3];
    /* Pad left, right, top, bottom */
    uint32_t pad[6];
    /* Pad type default value shall be AUTO */
    vsi_nn_pad_e pad_type;
} vsi_nn_max_pool3d_param;
_compiler_assert(offsetof(vsi_nn_max_pool3d_param, local) == 0, \
    vsi_nn_max_pool3d_h );

#ifdef __cplusplus
}
#endif

#endif

@@ -0,0 +1,44 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#ifndef _VSI_NN_OP_MOD_H
#define _VSI_NN_OP_MOD_H

#include "vsi_nn_types.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct _vsi_nn_mod_param
{
    int32_t fmod;
} vsi_nn_mod_param;

#ifdef __cplusplus
}
#endif

#endif

@@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#ifndef _VSI_NN_OP_RCP_H
#define _VSI_NN_OP_RCP_H

#include "vsi_nn_types.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct _vsi_nn_rcp_param
{
    struct _rcp_local_data_t* local;
    // Add parameters here
} vsi_nn_rcp_param;
_compiler_assert(offsetof(vsi_nn_rcp_param, local) == 0, \
    vsi_nn_rcp_h );

#ifdef __cplusplus
}
#endif

#endif

@@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#ifndef _VSI_NN_OP_SIGN_H
#define _VSI_NN_OP_SIGN_H

#include "vsi_nn_types.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct _vsi_nn_sign_param
{
    struct _sign_local_data_t* local;
    // Add parameters here
} vsi_nn_sign_param;
_compiler_assert(offsetof(vsi_nn_sign_param, local) == 0, \
    vsi_nn_sign_h );

#ifdef __cplusplus
}
#endif

#endif

@@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#ifndef _VSI_NN_OP_SOFTSIGN_H
#define _VSI_NN_OP_SOFTSIGN_H

#include "vsi_nn_types.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct _vsi_nn_softsign_param
{
    struct _softsign_local_data_t* local;
    // Add parameters here
} vsi_nn_softsign_param;
_compiler_assert(offsetof(vsi_nn_softsign_param, local) == 0, \
    vsi_nn_softsign_h );

#ifdef __cplusplus
}
#endif

#endif
@@ -32,7 +32,7 @@
extern "C" {
#endif

-static inline vsi_bool type_is_integer
+static VSI_INLINE_API vsi_bool type_is_integer
    (
    const vsi_nn_type_e type
    )

@@ -60,7 +60,7 @@ static inline vsi_bool type_is_integer
    return ret;
} /* type_is_integer() */

-static inline vsi_bool type_is_signed
+static VSI_INLINE_API vsi_bool type_is_signed
    (
    const vsi_nn_type_e type
    )

@@ -86,7 +86,7 @@ static inline vsi_bool type_is_signed
    return ret;
} /* type_is_signed() */

-static inline uint32_t type_get_bytes
+static VSI_INLINE_API uint32_t type_get_bytes
    (
    const vsi_nn_type_e type
    )

@@ -115,7 +115,7 @@ static inline uint32_t type_get_bytes
    }
} /* type_get_bytes() */

-static inline uint32_t type_get_bits
+static VSI_INLINE_API uint32_t type_get_bits
    (
    const vsi_nn_type_e type
    )

@@ -147,7 +147,7 @@ static inline uint32_t type_get_bits
    }
} /* type_get_bits() */

-static inline void type_get_range
+static VSI_INLINE_API void type_get_range
    (
    vsi_nn_type_e type,
    double * max_range,

@@ -186,7 +186,24 @@ static inline void type_get_range
    }
} /* type_get_range() */

-static inline int32_t fp32_to_affine
+static VSI_INLINE_API vsi_bool fp32_is_inf
+    (
+    float val
+    )
+{
+    uint32_t u_value = *(uint32_t*)&val;
+
+    if ((u_value & (uint32_t)VSI_NN_INT32_MAX) == (uint32_t)VSI_NN_FLOAT32_INF)
+    {
+        return TRUE;
+    }
+    else
+    {
+        return FALSE;
+    }
+}
+
+static VSI_INLINE_API int32_t fp32_to_affine
    (
    const float in,
    const float scale,

@@ -200,10 +217,17 @@ static inline int32_t fp32_to_affine
    type_get_range( type, &max_range, &min_range );
    data = (int32_t)(vsi_rint( in / scale ) + zero_point );
    data = vsi_nn_max( (int32_t)min_range, vsi_nn_min( (int32_t)max_range , data ) );

+    if (fp32_is_inf(in) != 0)
+    {
+        uint32_t sign = (*(uint32_t*)&in) >> 31;
+        data = sign == 1 ? (int32_t)min_range : (int32_t)max_range;
+    }

    return data;
} /* fp32_to_affine() */

-static inline float affine_to_fp32
+static VSI_INLINE_API float affine_to_fp32
    (
    const int32_t val,
    const float scale,
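The hunk above routes infinities to the range endpoints instead of letting the cast of an infinite float invoke undefined behavior. A worked pass through the affine formula data = clamp(rint(in / scale) + zero_point): with scale = 0.1, zero_point = 3, and in = 0.5, rint(5.0) + 3 = 8; with in = +inf and, say, an int8 range of [-128, 127], the new branch returns 127 directly, and -inf returns -128.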
@@ -216,7 +240,7 @@ static inline float affine_to_fp32
    return data;
} /* affine_to_fp32() */

-static inline int32_t fp32_to_dfp
+static VSI_INLINE_API int32_t fp32_to_dfp
    (
    const float in,
    const int8_t fl,

@@ -237,10 +261,17 @@ static inline int32_t fp32_to_dfp
    }
    data = vsi_nn_min( data, (int32_t)max_range );
    data = vsi_nn_max( data, (int32_t)min_range );

+    if (fp32_is_inf(in) != 0)
+    {
+        uint32_t sign = (*(uint32_t*)&in) >> 31;
+        data = sign == 1 ? (int32_t)min_range : (int32_t)max_range;
+    }

    return data;
} /* fp32_to_dfp() */

-static inline float dfp_to_fp32
+static VSI_INLINE_API float dfp_to_fp32
    (
    const int32_t val,
    const int8_t fl,

@@ -259,7 +290,7 @@ static inline float dfp_to_fp32
    return result;
} /* dfp_to_fp32() */

-static inline vsi_status integer_convert
+static VSI_INLINE_API vsi_status integer_convert
    (
    const void * src,
    vsi_nn_type_e src_type,

@@ -303,7 +334,7 @@ typedef union
    float f;
} _fp32_t;

-static inline float fp16_to_fp32
+static VSI_INLINE_API float fp16_to_fp32
    (
    int16_t in
    )

@@ -323,7 +354,7 @@ static inline float fp16_to_fp32
    return o.f;
} /* fp16_to_fp32() */

-static inline float bfp16_to_fp32
+static VSI_INLINE_API float bfp16_to_fp32
    (
    int16_t in
    )

@@ -344,7 +375,7 @@ static inline float bfp16_to_fp32
    return t3 == 0 ? 0 : out;
} /* bfp16_to_fp32() */

-static inline uint16_t fp32_to_fp16
+static VSI_INLINE_API uint16_t fp32_to_fp16
    (
    float in
    )

@@ -370,7 +401,7 @@ static inline uint16_t fp32_to_fp16
    return (uint16_t) fp16;
} /* fp32_to_fp16() */

-static inline uint16_t fp32_to_bfp16
+static VSI_INLINE_API uint16_t fp32_to_bfp16
    (
    float in
    )

@@ -381,7 +412,7 @@ static inline uint16_t fp32_to_bfp16
    return (uint16_t) t1;
} /* fp32_to_bfp16() */

-static inline uint16_t fp32_to_bfp16_rtne
+static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne
    (
    float in
    )

@@ -409,7 +440,7 @@ static inline uint16_t fp32_to_bfp16_rtne
    return out;
} /* fp32_to_bfp16_rtne */

-static inline vsi_status dtype_to_float32
+static VSI_INLINE_API vsi_status dtype_to_float32
    (
    uint8_t *src,
    float *dst,

@@ -461,7 +492,7 @@ static inline vsi_status dtype_to_float32
    return VSI_SUCCESS;
}

-static inline vsi_status float32_to_dtype
+static VSI_INLINE_API vsi_status float32_to_dtype
    (
    float src,
    uint8_t *dst,

@@ -42,6 +42,8 @@ extern "C" {
#define vsi_clamp(x, min, max) vsi_nn_clamp(x, min, max)
#define vsi_rtne(x) vsi_rint(x)

+#define VSI_NN_INT32_MAX (0x7FFFFFFF)

#define VSI_NN_FLOAT32_INF (0x7F800000)
#define VSI_NN_FLOAT32_NAN (0x7FC00000)
#define VSI_NN_FLOAT64_INF (0x7FF0000000000000)
@@ -53,14 +55,14 @@ extern "C" {
    size_t size; \
    TYPE data[0]; \
} vsi_##NAME##_array_t; \
-static inline vsi_##NAME##_array_t * vsi_##NAME##_array_create( size_t size ) { \
+static VSI_INLINE_API vsi_##NAME##_array_t * vsi_##NAME##_array_create( size_t size ) { \
    vsi_##NAME##_array_t * array = (vsi_##NAME##_array_t *)malloc( \
        sizeof(vsi_##NAME##_array_t) + sizeof(TYPE) * size ); \
    if (array == NULL) return NULL; \
    array->size = size; \
    return array; \
} \
-static inline void vsi_##NAME##_array_release( vsi_##NAME##_array_t ** array ) \
+static VSI_INLINE_API void vsi_##NAME##_array_release( vsi_##NAME##_array_t ** array ) \
{ \
    if( array && *array ) { \
        free( *array ); \

@@ -167,7 +169,7 @@ void vsi_nn_random_uniform_transform
    uint32_t len
    );

-static inline double copy_sign
+static VSI_INLINE_API double copy_sign
    (
    double number,
    double sign

@@ -177,7 +179,7 @@ static inline double copy_sign
    return (sign > 0) ? value : (-value);
} /* copy_sign() */

-static inline float simple_round
+static VSI_INLINE_API float simple_round
    (
    float x
    )

@@ -185,7 +187,7 @@ static inline float simple_round
    return (float) copy_sign(floorf(fabsf(x) + 0.5f), x);
} /* simple_round() */

-static inline double vsi_rint
+static VSI_INLINE_API double vsi_rint
    (
    double x
    )

@@ -65,7 +65,7 @@ extern "C" {
#define VSI_NN_DO_JOIN(X, Y) VSI_NN_DO_JOIN2(X,Y)
#define VSI_NN_DO_JOIN2(X, Y) X##Y

-#if defined(_MSC_VER)
+#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
#define VSI_NN_DEPRECATED(symbol, hints) \
    __declspec(deprecated(VSI_NN_STRINGIZE(hints))) symbol

@@ -381,7 +381,7 @@ int32_t vsi_nn_partition
 * @param[in] num Number of tensors.
 * @param[out] out_tensors Ordered tensors
 * */
-static inline void vsi_nn_reorder_tensor
+static VSI_INLINE_API void vsi_nn_reorder_tensor
    (
    vsi_nn_tensor_t** tensors,
    const int32_t* order,

@@ -417,6 +417,15 @@ vsi_bool vsi_nn_is_broadcast_operaton
    vsi_nn_tensor_t * output
    );

vsi_bool vsi_nn_is_broadcast_axes_operaton
    (
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t * output,
    int32_t * axis,
    int32_t axis_num
    );

float vsi_nn_get_tensor_scale
    (
    vsi_nn_tensor_t * tensor

@@ -66,6 +66,8 @@ typedef struct _vsi_nn_hw_config_t
    uint32_t use_40bits_va;
    uint32_t support_stream_processor;
    uint32_t sp_exec_count;
    uint32_t sp_vector_depth;
    uint32_t sp_per_core_vector_depth;
} vsi_nn_hw_config_t;

typedef struct _vsi_nn_runtime_option_t

@@ -35,7 +35,7 @@
    struct f##_t_{ ~f##_t_(void) { f(); }}; static f##_t_ f##_; \
    static void f(void)

-#elif defined(_MSC_VER)
+#elif (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
#pragma section(".CRT$XCU", read)
#define _INITIALIZER2(f, p) \
    static void f(void); \

@@ -27,7 +27,7 @@
#include "vsi_nn_types.h"
#include "vsi_nn_prv.h"

-static inline vsi_bool vsi_nn_feature_conv_max_kernel_size()
+static VSI_INLINE_API vsi_bool vsi_nn_feature_conv_max_kernel_size()
{
    return 11;
}

@@ -31,7 +31,7 @@
extern "C"{
#endif

-#ifdef _MSC_VER
+#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
#define snprintf(buffer, count, format, ...) \
    _snprintf_s(buffer, count, _TRUNCATE, format, ##__VA_ARGS__)
#define vsnprintf(buffer, count, format, args) \

@@ -190,6 +190,12 @@
#include "ops/vsi_nn_op_gather_elements.h"
#include "ops/vsi_nn_op_selu.h"
#include "ops/vsi_nn_op_celu.h"
#include "ops/vsi_nn_op_max_pool3d.h"
#include "ops/vsi_nn_op_rcp.h"
#include "ops/vsi_nn_op_sign.h"
#include "ops/vsi_nn_op_softsign.h"
#include "ops/vsi_nn_op_cumsum.h"
#include "ops/vsi_nn_op_mod.h"
/* custom node header defines */
#include "custom/vsi_nn_custom_node_type.h"

@@ -365,6 +371,12 @@ typedef union _vsi_nn_nn_param
    vsi_nn_gather_elements_param gather_elements;
    vsi_nn_selu_param selu;
    vsi_nn_celu_param celu;
    vsi_nn_max_pool3d_param max_pool3d;
    vsi_nn_rcp_param rcp;
    vsi_nn_sign_param sign;
    vsi_nn_softsign_param softsign;
    vsi_nn_cumsum_param cumsum;
    vsi_nn_mod_param mod;
    void* client_param;

    /* custom node data struct define */

@@ -243,6 +243,18 @@ OVXLIB_API vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam
    uint32_t enable_nodes_count
    );

OVXLIB_API vsi_status vsi_nn_UpdateCropParamsForBinaryGraph
    (
    vsi_nn_graph_t* graph,
    uint32_t enabled_crop_input_idx,
    uint32_t start_x,
    uint32_t start_y,
    uint32_t crop_w,
    uint32_t crop_h,
    uint32_t dst_w,
    uint32_t dst_h
    );

#ifdef __cplusplus
}
#endif
|
||||
|
|
|
|||
|
|
@@ -26,7 +26,7 @@
#define _VSI_NN_PUB_H

#if !defined(OVXLIB_API)
#if defined(_WIN32)
#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
#define OVXLIB_API __declspec(dllimport)
#else
#define OVXLIB_API __attribute__((visibility("default")))
@@ -33,11 +33,13 @@
extern "C"{
#endif

#ifdef _WIN32
#define inline __inline
#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
#define VSI_INLINE_API __inline
#else
#define VSI_INLINE_API inline
#endif

#if (defined(_MSC_VER) || defined(__MINGW32))
#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
#define SIZE_T_SPECIFIER "Iu"
#define SSIZE_T_SPECIFIER "Id"
#ifdef VSI_40BIT_VA_SUPPORT
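An illustrative aside (added commentary, not part of the commit): these specifier macros exist because MSVC historically spells the size_t format as "Iu" while C99 toolchains use "zu". A minimal sketch, assuming the header above is included; the non-Windows branch is outside this hunk, so its exact value is an assumption:

    #include <stdio.h>

    /* Prints a size_t portably: SIZE_T_SPECIFIER is "Iu" on the MSVC family
       and presumably "zu" elsewhere (that branch is not visible here). */
    static void print_size(size_t n)
    {
        printf("element size: %" SIZE_T_SPECIFIER " bytes\n", n);
    }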
@@ -59,7 +61,7 @@ extern "C"{
#endif
#endif

#if defined(_MSC_VER)
#if (defined(_MSC_VER))
#include <BaseTsd.h>
typedef SSIZE_T ssize_t;
#else
@@ -33,7 +33,7 @@ extern "C"{

#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 1
#define VSI_NN_VERSION_PATCH 43
#define VSI_NN_VERSION_PATCH 50
#define VSI_NN_VERSION \
    (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
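Added note (not part of the commit): with this bump the packed value is VSI_NN_VERSION = 1 * 10000 + 1 * 100 + 50 = 10150, so downstream code can gate on the new APIs with a single integer comparison such as `#if VSI_NN_VERSION >= 10150`.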
@@ -188,7 +188,7 @@ static vsi_status _query_kernel
    input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    if (input_dtype == I8)
    if (input_dtype == I8 || input_dtype == I16)
    {
        input_dtype = I32;
    }

@@ -269,7 +269,6 @@ static vsi_nn_kernel_node_t _setup
            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );
            CHECK_STATUS_FAIL_GOTO( status, OnError );

        }
    }

@@ -285,4 +284,3 @@ OnError:
__END_DECLS

REGISTER_BACKEND_CL( argmax, _setup )
@@ -188,6 +188,11 @@ static vsi_status _query_kernel
    input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    if (input_dtype == I8 || input_dtype == I16)
    {
        input_dtype = I32;
    }

    if (output_dtype == I16)
    {
        output_dtype = I32;

@@ -264,7 +269,6 @@ static vsi_nn_kernel_node_t _setup
            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );
            CHECK_STATUS_FAIL_GOTO( status, OnError );

        }
    }
@@ -0,0 +1,365 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */

#define KERNEL_SOURCE_1 "cumsum"
#define KERNEL_SOURCE_2 "cumsum_2d"

// Add kernel hashtable here
#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \
    ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))

#define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \
    { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \
    CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
    KERNEL_SOURCE_1 },

#define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \
    { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \
    CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \
    KERNEL_SOURCE_2 },

static const struct {
    uint32_t key;
    char* function_name;
    const char* source_name;
} cumsum_map[] =
{
    HASH_CUMSUM_KERNELS(0, U8, U8)
    HASH_CUMSUM_KERNELS(0, F32, F32)
    HASH_CUMSUM_KERNELS(1, U8, U8)
    HASH_CUMSUM_KERNELS(1, F32, F32)
    HASH_CUMSUM_KERNELS(2, U8, U8)
    HASH_CUMSUM_KERNELS(2, F32, F32)
    HASH_CUMSUM_KERNELS_2D(0, U8, U8)
    HASH_CUMSUM_KERNELS_2D(0, F32, F32)
    HASH_CUMSUM_KERNELS_2D(1, U8, U8)
    HASH_CUMSUM_KERNELS_2D(1, F32, F32)
};
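A short note on the lookup scheme (added commentary, not part of the source): HASH_CUMSUM_HASH_KEY packs the axis, the input and output vsi_nn_kernel_dtype_e values, and the 2D flag into disjoint bit fields of one uint32_t, so a kernel variant is found by a single integer compare against the table above. A standalone sketch of the same packing; the example values are illustrative only, the real dtype enum values live in the kernel headers:

    #include <stdint.h>

    /* Mirrors ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)).
       The dtype arguments are small enum values, so the fields cannot collide. */
    static uint32_t cumsum_hash_key(uint32_t axis, uint32_t in_dtype,
                                    uint32_t out_dtype, uint32_t image_2d)
    {
        return (axis << 20) | (in_dtype << 12) | (out_dtype << 4) | image_2d;
    }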

/*
 * Kernel params
 */
static vx_param_description_t _cumsum_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _CUMSUM_PARAM_NUM _cnt_of_array( _cumsum_kernel_param_def )

/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_cumsum_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };

    vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
    vsi_size_array_t * input_shape = NULL;
    int32_t axis = 0;
    int32_t width = 0;
    int32_t height = 0;
    int32_t channel = 0;
    int32_t w = 1;
    int32_t h = 1;
    int32_t c = 1;
    uint32_t dim = 1;

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis);
    CHECK_STATUS_FAIL_GOTO(status, final );

    input_shape = attr[0]->shape;
    dim = (uint32_t)input_shape->size;
    width = (int32_t)(input_shape->data[0]);
    height = (int32_t)(input_shape->data[1]);
    channel = (int32_t)(dim > 2 ? input_shape->data[2] : 1);

    if (axis == 0)
    {
        w = 1;
        h = height;
        c = channel;
    }
    else if (axis == 1)
    {
        w = width;
        h = 1;
        c = channel;
    }
    else if (axis == 2)
    {
        w = width;
        h = height;
        c = 1;
    }

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;

    gpu_param.global_size[0] = w;
    gpu_param.global_size[1] = h;
    gpu_param.global_size[2] = c;

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );
    CHECK_STATUS_FAIL_GOTO(status, final);

final:
    if (attr[0])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
        attr[0] = NULL;
    }
    return status;
} /* _cumsum_initializer() */

/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs,
    int32_t axis,
    int32_t is_2d
    /* Add extra params */
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e input0_dtype = U8;
    vsi_nn_kernel_dtype_e output_dtype = U8;
    uint32_t key = 0;
    int i = 0;

    input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    if (input0_dtype == U32)
    {
        input0_dtype = U8;
    }

    if (input0_dtype == F16)
    {
        input0_dtype = F32;
    }

    if (output_dtype == U32)
    {
        output_dtype = U8;
    }

    if (output_dtype == F16)
    {
        output_dtype = F32;
    }

    key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d);

    for ( i = 0; i < _cnt_of_array(cumsum_map); i ++ )
    {
        if ( cumsum_map[i].key == key )
        {
            break;
        }
    }

    if ( i < _cnt_of_array(cumsum_map) )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", cumsum_map[i].function_name );
        kernel->info.parameters = _cumsum_kernel_param_def;
        kernel->info.numParams = _cnt_of_array( _cumsum_kernel_param_def );
        kernel->info.initialize = _cumsum_initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
                "eltwise_ops_helper",
                cumsum_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
                cumsum_map[i].source_name );
        status = VSI_SUCCESS;
    }
    return status;
} /* _query_kernel() */


static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_CUMSUM_PARAM_NUM] = {NULL};
    vsi_nn_kernel_node_t node = NULL;
    vsi_size_t shapes[1][VSI_NN_MAX_DIM_NUM] = {{0}};
    vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
    int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
    int32_t exclusive = vsi_nn_kernel_param_get_int32( params, "exclusive" );
    int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
    int32_t axis_new = 0;
    int32_t is_2d = 0;
    uint32_t rs_dim = 2;
    int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]);
    float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
    float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
    float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
    float in_out_scale = input_scale * output_scale;
    float in_out_zp_scale = in_out_scale * input_zp;
    int32_t width = 0;
    int32_t height = 0;
    int32_t channel = 1;
    int32_t i = 0;

    vsi_nn_kernel_optimize_softmax_shape(
            inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
            shapes[0], &rs_dim, &axis_new);
    if (rs_dim > 3)
    {
        return NULL;
    }

    width = (int32_t)shapes[0][0];
    height = (int32_t)shapes[0][1];

    if (rs_dim == 2)
    {
        is_2d = 1;
    }
    else
    {
        channel = (int32_t)shapes[0][2];
    }

    reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
            inputs[0], shapes[0], (vsi_size_t)rs_dim );
    reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
            outputs[0], shapes[0], (vsi_size_t)rs_dim );

    if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
                outputs[0]->attr.dim_num ) )
    {
        return NULL;
    }

    status = _query_kernel( kernel, inputs, outputs, axis_new, is_2d );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            uint32_t index = 2;

            /* Pass parameters to node. */
            vsi_nn_kernel_node_pack_io( node_params, _CUMSUM_PARAM_NUM,
                    reshape_tensors, 1, &reshape_tensors[1], 1 );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_new );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &exclusive );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &channel );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_out_scale );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_out_zp_scale );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _CUMSUM_PARAM_NUM );
            vsi_nn_kernel_scalar_release( &node_params[2] );
            vsi_nn_kernel_scalar_release( &node_params[3] );
            vsi_nn_kernel_scalar_release( &node_params[4] );
            vsi_nn_kernel_scalar_release( &node_params[5] );
            vsi_nn_kernel_scalar_release( &node_params[6] );
            vsi_nn_kernel_scalar_release( &node_params[7] );
            vsi_nn_kernel_scalar_release( &node_params[8] );
            vsi_nn_kernel_scalar_release( &node_params[9] );
            vsi_nn_kernel_scalar_release( &node_params[10] );
            vsi_nn_kernel_scalar_release( &node_params[11] );
        }
    }

    for (i = 0; i < 2; i++)
    {
        vsi_safe_release_tensor(reshape_tensors[i]);
    }

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CL( cumsum, _setup )
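Added commentary on the quantization parameters in the _setup above (not part of the source): with real = (q - zp) * S on both sides, folding the two scales gives in_out_scale = S_in / S_out (output_scale above already holds the reciprocal of S_out), and in_out_zp_scale pre-multiplies the input zero point by that factor. How the CL kernel applies them is an assumption based on the parameter list, but the dequantize/requantize identity they encode is exact; a sketch with made-up values for a prefix sum of n elements:

    float q_sum = 123.0f;   /* integer sum of n quantized inputs (made-up value) */
    int   n     = 7;
    float real_sum = (q_sum - (float)n * input_zp) * input_scale;
    float q_out    = real_sum * output_scale + output_zp;  /* output_scale = 1 / So */
    /* identically: q_out == q_sum * in_out_scale - (float)n * in_out_zp_scale + output_zp */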
@@ -53,6 +53,9 @@ typedef enum
    UNARY_HGELU,
    UNARY_SELU,
    UNARY_CELU,
    UNARY_RCP,
    UNARY_SIGN,
    UNARY_SOFTSIGN,
} unary_type_e;

/*

@@ -94,6 +97,13 @@ typedef enum
#define HGELU_OPERATION hard_gelu
#define SELU_OPERATION selu
#define CELU_OPERATION celu
#define RCP_OPERATION rcp
#define SIGN_OPERATION sign
#define SOFTSIGN_OPERATION softsign

#define ADD_UNARY_SH_KERNELS(name, src_type, dst_type) \
    TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, src_type, dst_type) \
    TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, src_type, dst_type)

static const struct {
    uint32_t key;
@@ -101,61 +111,39 @@ static const struct {
    const char* source_name;
} kernel_map[] =
{
    TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F32, F32)
    TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F32, F32)
    TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F32, F32)
    TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F32, F32)
    TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F32, F32)
    TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32)
    TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F32, F32)
    TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F32, F32)
    TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F32, F32)
    TENSOR_UNARY_KERNELS_3D(HGELU_OPERATION, UNARY_HGELU, F32, F32)
    TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F32, F32)
    TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F32, F32)
    ADD_UNARY_SH_KERNELS(SIN, F32, F32)
    ADD_UNARY_SH_KERNELS(COS, F32, F32)
    ADD_UNARY_SH_KERNELS(EXP, F32, F32)
    ADD_UNARY_SH_KERNELS(LOG, F32, F32)
    ADD_UNARY_SH_KERNELS(NEG, F32, F32)
    ADD_UNARY_SH_KERNELS(HSIGMOID, F32, F32)
    ADD_UNARY_SH_KERNELS(MISH, F32, F32)
    ADD_UNARY_SH_KERNELS(ROUND, F32, F32)
    ADD_UNARY_SH_KERNELS(GELU, F32, F32)
    ADD_UNARY_SH_KERNELS(HGELU, F32, F32)
    ADD_UNARY_SH_KERNELS(SELU, F32, F32)
    ADD_UNARY_SH_KERNELS(CELU, F32, F32)
    ADD_UNARY_SH_KERNELS(RCP, F32, F32)
    ADD_UNARY_SH_KERNELS(SIGN, F32, F32)
    ADD_UNARY_SH_KERNELS(SOFTSIGN, F32, F32)

    TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F32, F32)
    TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F32, F32)
    TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F32, F32)
    TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F32, F32)
    TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F32, F32)
    TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32)
    TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F32, F32)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F32, F32)
    TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F32, F32)
    TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F32, F32)
    TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F32, F32)
    TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F32, F32)
    ADD_UNARY_SH_KERNELS(SIN, U8, U8)
    ADD_UNARY_SH_KERNELS(COS, U8, U8)
    ADD_UNARY_SH_KERNELS(EXP, U8, U8)
    ADD_UNARY_SH_KERNELS(LOG, U8, U8)
    ADD_UNARY_SH_KERNELS(NEG, U8, U8)
    ADD_UNARY_SH_KERNELS(HSIGMOID, U8, U8)
    ADD_UNARY_SH_KERNELS(MISH, U8, U8)
    ADD_UNARY_SH_KERNELS(ROUND, U8, U8)
    ADD_UNARY_SH_KERNELS(GELU, U8, U8)
    ADD_UNARY_SH_KERNELS(HGELU, U8, U8)
    ADD_UNARY_SH_KERNELS(SELU, U8, U8)
    ADD_UNARY_SH_KERNELS(CELU, U8, U8)
    ADD_UNARY_SH_KERNELS(RCP, U8, U8)
    ADD_UNARY_SH_KERNELS(SIGN, U8, U8)
    ADD_UNARY_SH_KERNELS(SOFTSIGN, U8, U8)

    TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, U8, U8)
    TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, U8, U8)
    TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, U8, U8)
    TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, U8, U8)
    TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, U8, U8)
    TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
    TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, U8, U8)
    TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, U8, U8)
    TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, U8, U8)
    TENSOR_UNARY_KERNELS_3D(HGELU_OPERATION, UNARY_HGELU, U8, U8)
    TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, U8, U8)
    TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, U8, U8)

    TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8)
    TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8)
    TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8)
    TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8)
    TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8)
    TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
    TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8)
    TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8)
    TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8)
    TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, U8, U8)
    TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, U8, U8)

    TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I32, I32)

    TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I32, I32)
    ADD_UNARY_SH_KERNELS(NEG, I32, I32)
};

#undef SIN_OPERATION
@@ -170,6 +158,9 @@ static const struct {
#undef HGELU_OPERATION
#undef SELU_OPERATION
#undef CELU_OPERATION
#undef RCP_OPERATION
#undef SIGN_OPERATION
#undef SOFTSIGN_OPERATION
/*
 * Kernel params
 */

@@ -458,4 +449,8 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_gelu, UNARY_HGELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( selu, UNARY_SELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( celu, UNARY_CELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( rcp, UNARY_RCP )
REGISTER_ELTWISE_UNARY_BACKEND_CL( sign, UNARY_SIGN )
REGISTER_ELTWISE_UNARY_BACKEND_CL( softsign, UNARY_SOFTSIGN )

__END_DECLS
@@ -123,7 +123,7 @@ static vsi_status cal_gather_tensor_reshape_size
    uint32_t i = 0;
    vsi_size_t elementCnt = 1;
    vsi_size_t outerCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH

    for (i = 0; i < dims_num - batch_dims; ++i)
    {

@@ -365,4 +365,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( gather, _setup )
@@ -111,7 +111,7 @@ static vsi_status cal_gather_nd_tensor_reshape_size
    vsi_size_t *input_size = inputs[0]->attr.size;
    uint32_t i = 0;
    vsi_size_t elementCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH

    newDim[0] = 0;
    for(i = 0; i < dims_num; ++i)

@@ -336,4 +336,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( gather_nd, _setup )
@@ -22,7 +22,6 @@
 *
 *****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@@ -44,21 +43,20 @@ __BEGIN_DECLS
 */
typedef enum
{
    INTERNAL_KERNEL_MEAN_VARI,
    INTERNAL_KERNEL_SUMS,
    INTERNAL_KERNEL_NORM,
} _internal_kernel_e;

#define KERNEL_SOURCE_1 "instance_normalization_u8"
#define KERNEL_SOURCE_2 "instance_normalization_f16"
#define KERNEL_SOURCE_2 "instance_normalization_f32"
#define KERNEL_SOURCE_3 "instance_normalization_i32"
#define KERNEL_SOURCE_4 "instance_normalization_f32"

// Add kernel hashtable here
#define HASH_INSTANCENORM_MEAN_VARI_KERNEL_NAME(SRC0_TYPE) \
    CVIVANTE_NAMESPACE("cl.instance_norm_meanvari_"#SRC0_TYPE)
#define HASH_INSTANCENORM_SUMS_KERNEL_NAME(SRC0_TYPE) \
    CVIVANTE_NAMESPACE("cl.instance_norm_sums_"#SRC0_TYPE)

#define HASH_INSTANCENORM_MEAN_VARI_KERNEL_2D_NAME(SRC0_TYPE) \
    CVIVANTE_NAMESPACE("cl.instance_norm_meanvari_"#SRC0_TYPE"_2D")
#define HASH_INSTANCENORM_SUMS_KERNEL_2D_NAME(SRC0_TYPE) \
    CVIVANTE_NAMESPACE("cl.instance_norm_sums_"#SRC0_TYPE"_2D")

#define HASH_INSTANCENORM_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("cl.instance_norm_"#SRC0_TYPE"to"#DST_TYPE)
@@ -68,17 +66,17 @@ typedef enum

// Add kernel hashtable here
// mean vari
#define HASH_INSTANCENORM_MEAN_VARI_KEY(_input0_type, _output_type, _reshape_flag) \
#define HASH_INSTANCENORM_SUMS_KEY(_input0_type, _output_type, _reshape_flag) \
    ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8))

#define TENSOR_INSTANCENORM_MEAN_VARI_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_INSTANCENORM_MEAN_VARI_KEY(IN0_TYPE, OUT_TYPE, 0), \
    HASH_INSTANCENORM_MEAN_VARI_KERNEL_NAME(IN0_TYPE), \
#define TENSOR_INSTANCENORM_SUMS_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_INSTANCENORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 0), \
    HASH_INSTANCENORM_SUMS_KERNEL_NAME(IN0_TYPE), \
    SOURCE },

#define TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_INSTANCENORM_MEAN_VARI_KEY(IN0_TYPE, OUT_TYPE, 1), \
    HASH_INSTANCENORM_MEAN_VARI_KERNEL_2D_NAME(IN0_TYPE), \
#define TENSOR_INSTANCENORM_SUMS_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_INSTANCENORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 1), \
    HASH_INSTANCENORM_SUMS_KERNEL_2D_NAME(IN0_TYPE), \
    SOURCE },

// normalization
@@ -102,17 +100,15 @@ typedef struct
    const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _instancenorm_mean_vari_kernel_map[] =
static const _kernel_map_type _instancenorm_sums_kernel_map[] =
{
    // Register kernel here
    TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( U8, F32, KERNEL_SOURCE_1 )
    TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( U8, F32, KERNEL_SOURCE_1 )
    TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( F16, F32, KERNEL_SOURCE_2 )
    TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( F16, F32, KERNEL_SOURCE_2 )
    TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( I32, F32, KERNEL_SOURCE_3 )
    TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 )
    TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( F32, F32, KERNEL_SOURCE_4 )
    TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( F32, F32, KERNEL_SOURCE_4 )
    TENSOR_INSTANCENORM_SUMS_KERNELS( U8, F32, KERNEL_SOURCE_1 )
    TENSOR_INSTANCENORM_SUMS_KERNELS_2D( U8, F32, KERNEL_SOURCE_1 )
    TENSOR_INSTANCENORM_SUMS_KERNELS( F32, F32, KERNEL_SOURCE_2 )
    TENSOR_INSTANCENORM_SUMS_KERNELS_2D( F32, F32, KERNEL_SOURCE_2 )
    TENSOR_INSTANCENORM_SUMS_KERNELS( I32, F32, KERNEL_SOURCE_3 )
    TENSOR_INSTANCENORM_SUMS_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 )
};

static const _kernel_map_type _instancenorm_kernel_map[] =
@@ -123,22 +119,19 @@ static const _kernel_map_type _instancenorm_kernel_map[] =
    TENSOR_INSTANCENORM_KERNELS( U8, F16, KERNEL_SOURCE_1 )
    TENSOR_INSTANCENORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_1 )

    TENSOR_INSTANCENORM_KERNELS( F16, F16, KERNEL_SOURCE_2 )
    TENSOR_INSTANCENORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_2 )
    TENSOR_INSTANCENORM_KERNELS( F32, F32, KERNEL_SOURCE_2 )
    TENSOR_INSTANCENORM_KERNELS_2D( F32, F32, KERNEL_SOURCE_2 )

    TENSOR_INSTANCENORM_KERNELS( I32, I32, KERNEL_SOURCE_3 )
    TENSOR_INSTANCENORM_KERNELS_2D( I32, I32, KERNEL_SOURCE_3 )
    TENSOR_INSTANCENORM_KERNELS( I32, F32, KERNEL_SOURCE_3 )
    TENSOR_INSTANCENORM_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 )

    TENSOR_INSTANCENORM_KERNELS( F32, F32, KERNEL_SOURCE_4 )
    TENSOR_INSTANCENORM_KERNELS_2D( F32, F32, KERNEL_SOURCE_4 )
};

/*
 * Kernel params
 */
static vx_param_description_t _instancenorm_mean_vari_kernel_param_def[] =
static vx_param_description_t _instancenorm_sums_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
@@ -146,12 +139,9 @@ static vx_param_description_t _instancenorm_mean_vari_kernel_param_def[] =
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _INSTANCENORM_MEAN_VARI_PARAM_NUM _cnt_of_array( _instancenorm_mean_vari_kernel_param_def )
#define _INSTANCENORM_SUMS_PARAM_NUM _cnt_of_array( _instancenorm_sums_kernel_param_def )

static vx_param_description_t _instancenorm_kernel_param_def[] =
{

@@ -168,10 +158,6 @@ static vx_param_description_t _instancenorm_kernel_param_def[] =
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _INSTANCENORM_PARAM_NUM _cnt_of_array( _instancenorm_kernel_param_def )

@@ -179,7 +165,7 @@ static vx_param_description_t _instancenorm_kernel_param_def[] =
/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,

@@ -244,7 +230,7 @@ final:
        attr[1] = NULL;
    }
    return status;
} /* _instance_normalization_mean_vari_initializer() */
} /* _instance_normalization_sums_initializer() */

DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
    (

@@ -334,12 +320,12 @@ static vsi_status _query_kernel

    switch( kernel_id )
    {
        case INTERNAL_KERNEL_MEAN_VARI:
            initializer = _instancenorm_mean_vari_initializer;
            kernel_map = _instancenorm_mean_vari_kernel_map;
            kernel_map_size = _cnt_of_array( _instancenorm_mean_vari_kernel_map );
            param_def = _instancenorm_mean_vari_kernel_param_def;
            param_size = _INSTANCENORM_MEAN_VARI_PARAM_NUM;
        case INTERNAL_KERNEL_SUMS:
            initializer = _instancenorm_sums_initializer;
            kernel_map = _instancenorm_sums_kernel_map;
            kernel_map_size = _cnt_of_array( _instancenorm_sums_kernel_map );
            param_def = _instancenorm_sums_kernel_param_def;
            param_size = _INSTANCENORM_SUMS_PARAM_NUM;
            break;
        case INTERNAL_KERNEL_NORM:
            initializer = _instancenorm_initializer;

@@ -392,9 +378,9 @@ static vsi_nn_kernel_node_t _setup
    )
{
#define INTERNAL_KERNEL_SIZE (1)
#define MEAN_VARI_INDEX (0)
#define SUMS_INDEX (0)
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t mean_vari_node_params[_INSTANCENORM_MEAN_VARI_PARAM_NUM] = { NULL };
    vsi_nn_kernel_node_param_t sums_node_params[_INSTANCENORM_SUMS_PARAM_NUM] = { NULL };
    vsi_nn_kernel_node_param_t node_params[_INSTANCENORM_PARAM_NUM] = { NULL };
    vsi_nn_kernel_node_t node = NULL;
    vsi_nn_kernel_dtype_e in0_dtype = U8;

@@ -407,18 +393,17 @@ static vsi_nn_kernel_node_t _setup
    uint32_t hashkey = 0;
    int32_t i = 0;
    uint32_t rank = outputs[0]->attr.dim_num;
    float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
    float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
    float eps = vsi_nn_kernel_param_get_float32( params, "eps" ) /
                (input_scale * input_scale);
    size_t width = inputs[0]->attr.size[0];
    size_t height = inputs[0]->attr.size[1];
    int32_t reshape_flg = outputs[0]->attr.size[1] * outputs[0]->attr.size[2] < GPU_TENSOR_MAX_WIDTH
                          && rank > 2;
    int32_t group_num = (int32_t)(width + 15) / 16;
    int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]);
    float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
    int32_t output_zp = vsi_nn_get_tensor_zero_point(outputs[0]);
    float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
    float in_fl_scale = 1.0f, out_fl_scale = 1.0;
    float dim_ratio = (float)1.0 / (float)(width * height);
    float inv_multiplier = (float)1.0 / (float)(width * height);

    if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
                outputs[0]->attr.dim_num ) )
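A note on the eps change above (added commentary, not part of the source): the sums kernel appears to accumulate raw quantized values, so the variance it yields is in squared quantized units. Dividing eps by input_scale * input_scale moves the epsilon into that same domain; for float input the scale is 1.0f, so behavior there is unchanged. In short:

    /* Variance computed on quantized values: var_q == var_real / (S * S).
       Scaling eps by 1/(S * S) keeps (var_q + eps_q) == (var_real + eps) / (S * S). */
    float eps_q = eps_real / (input_scale * input_scale);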
@@ -443,15 +428,21 @@ static vsi_nn_kernel_node_t _setup
        attr.size[2] = 1;
        attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
        attr.dim_num = 4;
    tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr );
    tensors[SUMS_INDEX] = vsi_nn_CreateTensor( graph, &attr );

    in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
    in0_dtype = in0_dtype == F16 ? F32 : in0_dtype;
    in0_dtype = in0_dtype == I8 ? I32 : in0_dtype;
    in0_dtype = in0_dtype == I16 ? I32 : in0_dtype;
    out_dtype = out_dtype == F16 ? F32 : out_dtype;
    out_dtype = out_dtype == I8 ? I32 : out_dtype;
    out_dtype = out_dtype == I16 ? I32 : out_dtype;

    hashkeys[MEAN_VARI_INDEX]= HASH_INSTANCENORM_MEAN_VARI_KEY( in0_dtype, F32, reshape_flg );
    hashkeys[SUMS_INDEX]= HASH_INSTANCENORM_SUMS_KEY( in0_dtype, F32, reshape_flg );
    hashkey = HASH_INSTANCENORM_KEY( in0_dtype, out_dtype, reshape_flg );

    status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI );
    status = _query_kernel( ikernels[SUMS_INDEX], hashkeys[SUMS_INDEX], INTERNAL_KERNEL_SUMS );
    if ( VSI_SUCCESS != status )
    {
        goto final;
@@ -497,37 +488,31 @@ static vsi_nn_kernel_node_t _setup
    }
    // Mean Vari
    {
        node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] );
        node = vsi_nn_kernel_create_node( graph, ikernels[SUMS_INDEX] );
        if (node)
        {
            uint32_t index = 0;
            if (reshape_flg)
            {
                mean_vari_node_params[index++] = rs_input;
                sums_node_params[index++] = rs_input;
            }
            else
            {
                mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t;
                sums_node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t;
            }
            mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t;
            mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
            mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg );
            mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp );
            mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
            mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_fl_scale );
            mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
            mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
            sums_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUMS_INDEX]->t;
            sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
            sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg );
            sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
            sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );

            status = vsi_nn_kernel_node_pass_param( node, mean_vari_node_params,
                    _INSTANCENORM_MEAN_VARI_PARAM_NUM );
            status = vsi_nn_kernel_node_pass_param( node, sums_node_params,
                    _INSTANCENORM_SUMS_PARAM_NUM );
            CHECK_STATUS(status);
            vsi_nn_kernel_scalar_release( &mean_vari_node_params[2] );
            vsi_nn_kernel_scalar_release( &mean_vari_node_params[3] );
            vsi_nn_kernel_scalar_release( &mean_vari_node_params[4] );
            vsi_nn_kernel_scalar_release( &mean_vari_node_params[5] );
            vsi_nn_kernel_scalar_release( &mean_vari_node_params[6] );
            vsi_nn_kernel_scalar_release( &mean_vari_node_params[7] );
            vsi_nn_kernel_scalar_release( &mean_vari_node_params[8] );
            vsi_nn_kernel_scalar_release( &sums_node_params[2] );
            vsi_nn_kernel_scalar_release( &sums_node_params[3] );
            vsi_nn_kernel_scalar_release( &sums_node_params[4] );
            vsi_nn_kernel_scalar_release( &sums_node_params[5] );
            vsi_nn_kernel_node_release( &node );
        }
    }
@@ -562,7 +547,7 @@ static vsi_nn_kernel_node_t _setup
            {
                node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t;
            }
            node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t;
            node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUMS_INDEX]->t;
            if (reshape_flg)
            {
                node_params[index++] = rs_output;
@@ -573,15 +558,11 @@ static vsi_nn_kernel_node_t _setup
            }
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_fl_scale );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &output_zp );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &out_fl_scale );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &dim_ratio );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inv_multiplier );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &group_num );

            status = vsi_nn_kernel_node_pass_param( node, node_params,
@@ -595,10 +576,6 @@ static vsi_nn_kernel_node_t _setup
            vsi_nn_kernel_scalar_release( &node_params[10] );
            vsi_nn_kernel_scalar_release( &node_params[11] );
            vsi_nn_kernel_scalar_release( &node_params[12] );
            vsi_nn_kernel_scalar_release( &node_params[13] );
            vsi_nn_kernel_scalar_release( &node_params[14] );
            vsi_nn_kernel_scalar_release( &node_params[15] );
            vsi_nn_kernel_scalar_release( &node_params[16] );
        }
    }
@@ -0,0 +1,312 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */

#define KERNEL_SOURCE_1 "maxpoolwithargmax"
#define KERNEL_SOURCE_2 "maxpoolwithargmax_2d"

// Add kernel hashtable here
#define MAXPOOLWITHARGMAX_HASH_KEY( IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1, _image_2d) \
    (( IN_DTYPE << 24 ) | ( OUT_DTYPE0 << 20) | ( OUT_DTYPE1 << 12) | (_image_2d))

#define HASH_MAXPOOLWITHARGMAX_KERNELS( IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1) \
    { MAXPOOLWITHARGMAX_HASH_KEY(IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1, 0), \
    CVIVANTE_NAMESPACE("cl.maxpoolwithargmax_"#IN_DTYPE"to"#OUT_DTYPE0"_"#OUT_DTYPE1), \
    KERNEL_SOURCE_1 },

#define HASH_MAXPOOLWITHARGMAX_KERNELS_2D( IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1) \
    { MAXPOOLWITHARGMAX_HASH_KEY(IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1, 1), \
    CVIVANTE_NAMESPACE("cl.maxpoolwithargmax_"#IN_DTYPE"to"#OUT_DTYPE0"_"#OUT_DTYPE1"_2D"), \
    KERNEL_SOURCE_2 },

static const struct {
    uint32_t key;
    char* function_name;
    const char* source_name;
} maxpoolwithargmax_map[] =
{
    HASH_MAXPOOLWITHARGMAX_KERNELS(F32, F32, I32)
    HASH_MAXPOOLWITHARGMAX_KERNELS(BF16, BF16, I32)
    HASH_MAXPOOLWITHARGMAX_KERNELS(U32, U32, I32)
    HASH_MAXPOOLWITHARGMAX_KERNELS(I32, I32, I32)
    HASH_MAXPOOLWITHARGMAX_KERNELS_2D(F32, F32, I32)
    HASH_MAXPOOLWITHARGMAX_KERNELS_2D(BF16, BF16, I32)
    HASH_MAXPOOLWITHARGMAX_KERNELS_2D(U32, U32, I32)
    HASH_MAXPOOLWITHARGMAX_KERNELS_2D(I32, I32, I32)
};

/*
 * Kernel params
 */
static vx_param_description_t _maxpoolwithargmax_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _MAXPOOLWITHARGMAX_PARAM_NUM _cnt_of_array( _maxpoolwithargmax_kernel_param_def )

/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_maxpoolwithargmax_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };

    vx_status status = VX_FAILURE;
    vx_tensor output = (vx_tensor)param[1];
    vsi_nn_kernel_tensor_attr_t * attr_out = NULL;
    vsi_size_array_t * out_shape = NULL;

    attr_out = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
    CHECK_PTR_FAIL_GOTO( attr_out, "vsi_nn_kernel_tensor_attr_create fail.", final );

    out_shape = attr_out->shape;

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;
    gpu_param.global_size[0] = gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1)
            / gpu_param.global_scale[0], 4);
    gpu_param.global_size[1] = out_shape->data[1];
    gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
    if (attr_out)
    {
        vsi_nn_kernel_tensor_attr_release(&attr_out);
    }

    return status;
} /* _maxpoolwithargmax_initializer() */

/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs,
    int32_t is_2d
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e input_dtype = U8;
    vsi_nn_kernel_dtype_e output0_dtype = U8;
    vsi_nn_kernel_dtype_e output1_dtype = I32;
    uint32_t key = 0;
    int32_t i = 0;

    input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    output0_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
    output1_dtype = vsi_nn_kernel_map_dtype( outputs[1]->attr.dtype.vx_type );

    if (input_dtype == U8)
    {
        input_dtype = U32;
    }

    if (input_dtype == I8 || input_dtype == I16)
    {
        input_dtype = I32;
    }

    if (input_dtype == F16)
    {
        input_dtype = F32;
    }

    if (output0_dtype == U8)
    {
        output0_dtype = U32;
    }

    if (output0_dtype == I8 || output0_dtype == I16)
    {
        output0_dtype = I32;
    }

    if (output0_dtype == F16)
    {
        output0_dtype = F32;
    }

    key = MAXPOOLWITHARGMAX_HASH_KEY( input_dtype, output0_dtype, output1_dtype, is_2d);

    for ( i = 0; i < _cnt_of_array(maxpoolwithargmax_map); i ++ )
    {
        if ( maxpoolwithargmax_map[i].key == key )
        {
            break;
        }
    }

    if ( i < _cnt_of_array(maxpoolwithargmax_map) )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", maxpoolwithargmax_map[i].function_name );
        kernel->info.parameters = _maxpoolwithargmax_kernel_param_def;
        kernel->info.numParams = _cnt_of_array( _maxpoolwithargmax_kernel_param_def );
        kernel->info.initialize = _maxpoolwithargmax_initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
                "eltwise_ops_helper",
                maxpoolwithargmax_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
                maxpoolwithargmax_map[i].source_name );
        status = VSI_SUCCESS;
    }

    return status;
} /* _query_kernel() */


static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_MAXPOOLWITHARGMAX_PARAM_NUM];
    vsi_nn_kernel_node_t node = NULL;
    int32_t ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x");
    int32_t ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y");
    int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x");
    int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y");
    int32_t pad_x = vsi_nn_kernel_param_get_int32(params, "pad_left");
    int32_t pad_y = vsi_nn_kernel_param_get_int32(params, "pad_top");
    int32_t image_2d = inputs[0]->attr.dim_num == 2 ? 1 : 0;
    int32_t width = (int32_t)inputs[0]->attr.size[0];
    int32_t height = (int32_t)inputs[0]->attr.size[1];
    float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
    float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
    float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
    float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
    float scale_value = 1.0f;
    float tail_value = 0.0f;

    if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
                inputs[0]->attr.dim_num )
        || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
                outputs[0]->attr.dim_num )
        || !vsi_nn_kernel_gpu_check_shape( outputs[1]->attr.size,
                outputs[1]->attr.dim_num ))
    {
        return NULL;
    }

    scale_value = inputScale / outputScale;
    tail_value = outputTail - inputTail * inputScale / outputScale;

    status = _query_kernel( kernel, inputs, outputs, image_2d );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            uint32_t index = 3;
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _MAXPOOLWITHARGMAX_PARAM_NUM,
                    inputs, input_num, outputs, output_num );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_x );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_y );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_value );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &tail_value );
            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _MAXPOOLWITHARGMAX_PARAM_NUM );
            vsi_nn_kernel_scalar_release( &node_params[3] );
            vsi_nn_kernel_scalar_release( &node_params[4] );
            vsi_nn_kernel_scalar_release( &node_params[5] );
            vsi_nn_kernel_scalar_release( &node_params[6] );
            vsi_nn_kernel_scalar_release( &node_params[7] );
            vsi_nn_kernel_scalar_release( &node_params[8] );
            vsi_nn_kernel_scalar_release( &node_params[9] );
            vsi_nn_kernel_scalar_release( &node_params[10] );
            vsi_nn_kernel_scalar_release( &node_params[11] );
            vsi_nn_kernel_scalar_release( &node_params[12] );
        }
    }

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CL( maxpoolwithargmax, _setup )
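Added commentary on scale_value and tail_value in the _setup above (not part of the source): max pooling only selects values, so requantization reduces to one affine map. From real = (q - zp) * S on both sides, q_out = q_in * (S_in / S_out) + (zp_out - zp_in * S_in / S_out), which is exactly scale_value and tail_value. A small numeric check with made-up parameters:

    /* Assume S_in = 0.5, zp_in = 10, S_out = 0.25, zp_out = 4 (illustrative only). */
    float scale_value = 0.5f / 0.25f;          /* 2.0f */
    float tail_value  = 4.0f - 10.0f * 2.0f;   /* -16.0f */
    /* q_in = 30 represents real 10.0; q_out = 30 * 2 - 16 = 44, and (44 - 4) * 0.25 == 10.0 */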
@@ -0,0 +1,303 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"

__BEGIN_DECLS

#define MOD_KERNEL_SOURCE_NAME "mod"

#define MOD_HASH_KEY(_input0_type, _input1_type, _output_type, _image_2d) \
    ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d))


#define MOD_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
    { MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \
    CVIVANTE_NAMESPACE("cl.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE), \
    MOD_KERNEL_SOURCE_NAME},

#define MOD_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
    { MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \
    CVIVANTE_NAMESPACE("cl.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE"_2D"), \
    MOD_KERNEL_SOURCE_NAME },

typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _mod_kernel_map[] =
{

    // Register kernel here
    MOD_KERNELS( F32, F32, F32 )
    MOD_KERNELS( I32, I32, I32 )
    MOD_KERNELS( I32, I32, U8 )
    MOD_KERNELS( U8, U8, U8 )
    MOD_KERNELS( U8, I32, U8 )

    MOD_KERNELS_2D( F32, F32, F32 )
    MOD_KERNELS_2D( I32, I32, I32 )
    MOD_KERNELS_2D( I32, I32, U8 )
    MOD_KERNELS_2D( U8, U8, U8 )
    MOD_KERNELS_2D( U8, I32, U8 )
};

/*
 * Kernel params
 */
static vx_param_description_t _mod_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
|
||||
#define _MOD_PARAM_NUM _cnt_of_array( _mod_kernel_param_def )
|
||||
#define MOD_QUANT_PARAM_NUM _cnt_of_array( _mod_kernel_param_def )
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_mod_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
gpu_param_t gpu_param = {
|
||||
3,
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0}
|
||||
};
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vx_tensor output = (vx_tensor)param[2];
|
||||
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
|
||||
vsi_size_array_t *output_shape = NULL;
|
||||
|
||||
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
|
||||
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
|
||||
|
||||
output_shape = output_attr->shape;
|
||||
|
||||
gpu_param.dim = output_shape->size < 3 ? 2 : 3;
|
||||
gpu_param.global_scale[0] = 1;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
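    /* Launch one work-item per output element; gpu_align_p2 below rounds
     * the x dimension up to a multiple of 4. */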
    gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1)
            / gpu_param.global_scale[0], 4);
    gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1)
            / gpu_param.global_scale[1];
    gpu_param.global_size[2] = output_shape->size > 2 ? output_shape->data[2] : 1;

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );

final:
    if (output_attr)
    {
        vsi_nn_kernel_tensor_attr_release(&output_attr);
    }

    return status;
} /* _mod_initializer() */

/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs,
    vsi_bool image_2d
    /* Add extra params */
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e in0_dtype;
    vsi_nn_kernel_dtype_e in1_dtype;
    vsi_nn_kernel_dtype_e out_dtype;
    const _kernel_map_type * kernel_map = _mod_kernel_map;
    size_t kernel_map_size = _cnt_of_array( _mod_kernel_map );
    vx_param_description_t * param_def = _mod_kernel_param_def;
    size_t param_def_size = _cnt_of_array( _mod_kernel_param_def );
    vx_kernel_initialize_f initializer = _mod_initializer;

    uint32_t key = 0;
    uint32_t i = 0;

    in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    if (F16 == in0_dtype)
    {
        in0_dtype = F32;
    }
    else if (I16 == in0_dtype || I8 == in0_dtype)
    {
        in0_dtype = I32;
    }

    if (F16 == in1_dtype)
    {
        in1_dtype = F32;
    }
    else if (I16 == in1_dtype || I8 == in1_dtype)
    {
        in1_dtype = I32;
    }

    if (F16 == out_dtype)
    {
        out_dtype = F32;
    }
    else if (I16 == out_dtype || I8 == out_dtype)
    {
        out_dtype = I32;
    }

    key = MOD_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d );

    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
    {
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }
    if ( i < (uint32_t)kernel_map_size )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = param_def;
        kernel->info.numParams = (uint32_t)param_def_size;
        kernel->info.initialize = initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
                "eltwise_ops_helper",
                kernel_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
                kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }
    return status;
} /* _query_kernel() */


static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_MOD_PARAM_NUM] = {NULL};
    vsi_nn_kernel_node_t node = NULL;
    vsi_bool image_2d = FALSE;
    float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
    float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
    float input0Scale = vsi_nn_get_tensor_scale(inputs[0]);
    float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
    float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
    float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
    int32_t isfmod = vsi_nn_kernel_param_get_int32(params, "isfmod");

    outputScale = 1.0f / outputScale;
    input0Tail = -(input0Tail * input0Scale);
    input1Tail = -(input1Tail * input1Scale);

    if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
                outputs[0]->attr.dim_num ) )
    {
        return NULL;
    }

    image_2d = (outputs[0]->attr.dim_num == 2);

    status = _query_kernel( kernel, inputs, outputs, image_2d );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            size_t node_params_num = MOD_QUANT_PARAM_NUM;
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _MOD_PARAM_NUM,
                    inputs, input_num, outputs, output_num );
            node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &isfmod );
            node_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale );
            node_params[5] = vsi_nn_kernel_scalar_create( graph, F32, &input0Tail );
            node_params[6] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale );
            node_params[7] = vsi_nn_kernel_scalar_create( graph, F32, &input1Tail );
            node_params[8] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
            node_params[9] = vsi_nn_kernel_scalar_create( graph, F32, &outputTail );
            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
            VSI_ASSERT( status == VSI_SUCCESS );
            vsi_nn_kernel_scalar_release( &node_params[3] );
            vsi_nn_kernel_scalar_release( &node_params[4] );
            vsi_nn_kernel_scalar_release( &node_params[5] );
            vsi_nn_kernel_scalar_release( &node_params[6] );
            vsi_nn_kernel_scalar_release( &node_params[7] );
            vsi_nn_kernel_scalar_release( &node_params[8] );
            vsi_nn_kernel_scalar_release( &node_params[9] );
        }
    }

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CL( mod, _setup )


@@ -48,7 +48,7 @@ __BEGIN_DECLS

#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE ) \
        { ROI_ALIGN_HASH_KEY( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, 0 ), \
        CVIVANTE_NAMESPACE("cl.roi_align_"STR(IN0_DTYPE)"to"STR(OUT_DTYPE)), \
        CVIVANTE_NAMESPACE("cl.roi_align_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \
        _ROI_ALIGN_KERNEL_SOURCE(IN0_DTYPE) }

typedef struct

@@ -61,6 +61,7 @@ typedef struct
static const _kernel_map_type _roi_align_kernel_map[] =
{
    PACK_KERNEL_MAP(F32, F32, I32, F32),
    PACK_KERNEL_MAP(U8, U16, I32, U8),
};


@@ -82,20 +83,28 @@ static vx_param_description_t _roi_align_kernel_param_def[] =
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ROI_ALIGN_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def )

#define SCALAR_SPATIAL_X_SCALE (4)
#define SCALAR_SPATIAL_Y_SCALE (5)
#define SCALAR_INPUT_WIDTH (6)
#define SCALAR_INPUT_HEIGHT (7)
#define SCALAR_RCP_OF_OUTPUT_WIDTH (8)
#define SCALAR_RCP_OF_OUTPUT_HEIGHT (9)
#define SCALAR_SAMPLING_X_RATIO (10)
#define SCALAR_SAMPLING_Y_RATIO (11)
#define SCALAR_DEPTH (12)
#define SCALAR_INPUT_SCALE (4)
#define SCALAR_INPUT_TAIL (5)
#define SCALAR_OUTPUT_SCALE (6)
#define SCALAR_OUTPUT_ZP (7)
#define SCALAR_SPATIAL_X_SCALE (8)
#define SCALAR_SPATIAL_Y_SCALE (9)
#define SCALAR_INPUT_WIDTH (10)
#define SCALAR_INPUT_HEIGHT (11)
#define SCALAR_RCP_OF_OUTPUT_WIDTH (12)
#define SCALAR_RCP_OF_OUTPUT_HEIGHT (13)
#define SCALAR_SAMPLING_X_RATIO (14)
#define SCALAR_SAMPLING_Y_RATIO (15)
#define SCALAR_DEPTH (16)
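/* Note: the four quantization scalars (input scale/tail, output scale/zp)
 * are inserted ahead of the original parameters, which is why every index
 * from SCALAR_SPATIAL_X_SCALE onward shifts up by 4 in this change. */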

#define ROI_ALIGN_PARAM_NUM 13
#define ROI_ALIGN_PARAM_NUM 17
#define ROI_ALIGN_QUANT_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def )

/*

@@ -185,6 +194,7 @@ static vsi_status _query_kernel

    in0_dtype = in0_dtype == F16 ? F32 : in0_dtype;
    in1_dtype = in1_dtype == F16 ? F32 : in1_dtype;
    out_dtype = out_dtype == F16 ? F32 : out_dtype;

    key = ROI_ALIGN_HASH_KEY( in0_dtype, in1_dtype, in2_dtype, out_dtype, image_2d );


@@ -241,8 +251,14 @@ static vsi_nn_kernel_node_t _setup
    float height_ratio = vsi_nn_kernel_param_get_float32( params, "height_ratio" );
    int32_t width_sample_num = vsi_nn_kernel_param_get_int32( params, "width_sample_num" );
    int32_t height_sample_num = vsi_nn_kernel_param_get_int32( params, "height_sample_num" );
    float width_scale = 1.0f / width_ratio;
    float height_scale = 1.0f / height_ratio;
    float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
    float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
    float input_tail = -(input_zp * input_scale);
    float roi_scale = vsi_nn_get_tensor_scale(inputs[1]);
    float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
    float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
    float width_scale = roi_scale / width_ratio;
    float height_scale = roi_scale / height_ratio;
    float in_width = (float)(inputs[0]->attr.size[0]);
    float in_height = (float)(inputs[0]->attr.size[1]);
    float rcp_of_out_width = 1.0f / (float)(outputs[0]->attr.size[0]);


@@ -287,6 +303,10 @@ static vsi_nn_kernel_node_t _setup
            vsi_nn_kernel_node_pack_io( node_params, _ROI_ALIGN_PARAM_NUM,
                    reshape_tensors, input_num, &reshape_tensors[_INPUT_NUM], output_num );

            node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
            node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input_tail );
            node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
            node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
            node_params[SCALAR_SPATIAL_X_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &width_scale );
            node_params[SCALAR_SPATIAL_Y_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &height_scale );
            node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &in_width );


@@ -299,6 +319,10 @@ static vsi_nn_kernel_node_t _setup

            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_SPATIAL_X_SCALE] );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_SPATIAL_Y_SCALE] );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] );


@@ -115,7 +115,7 @@ static vsi_status cal_scatter_nd_tensor_reshape_size
    return status;
}

#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH

    newDim[0] = 0;
    for(i = 0; i < dims_num; ++i)


@@ -333,4 +333,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( scatter_nd, _setup )


@@ -108,7 +108,7 @@ static vsi_status cal_scatter_nd_update_tensor_reshape_size
    return status;
}

#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH

    newDim[0] = 0;
    for(i = 0; i < dims_num; ++i)


@@ -373,4 +373,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( scatter_nd_update, _setup )


@@ -22,7 +22,6 @@
 *
 *****************************************************************************/

#include <stdint.h>
#include <stdlib.h>
#include <string.h>


@@ -49,6 +48,13 @@ __BEGIN_DECLS
        CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
        _TOPK_KERNEL_SOURCE }

#define TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
        ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) )
#define PACK_ODD_EVEN_SORT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
        { TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
        CVIVANTE_NAMESPACE("cl.topk_odd_even_sort_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
        "topk_odd_even_sort" }

typedef struct
{
    uint32_t key;


@@ -84,6 +90,14 @@ static const _kernel_map_type _topk_kernel_map[] =
    PACK_KERNEL_MAP( I32, I32, 6 ),
};

static const _kernel_map_type _topk_odd_even_sort_kernel_map[] =
{
    // Register kernel here
    PACK_ODD_EVEN_SORT_KERNEL_MAP( F32, F32 ),
    PACK_ODD_EVEN_SORT_KERNEL_MAP( U32, U32 ),
    PACK_ODD_EVEN_SORT_KERNEL_MAP( I32, I32 ),
};

/*
 * Kernel params
 */

@@ -99,6 +113,19 @@ static vx_param_description_t _topk_kernel_param_def[]
#define _TOPK_PARAM_NUM _cnt_of_array( _topk_kernel_param_def )
#define SCALAR_INPUT_NUM_STAGES (3)
#define SCALAR_INPUT_WIDTH (4)

static vx_param_description_t _topk_odd_even_sort_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _TOPK_ODD_EVEN_SORT_PARAM_NUM _cnt_of_array( _topk_odd_even_sort_kernel_param_def )
#define SCALAR_INPUT_SIZE (5)
/*
 * Kernel initializer
 */

@@ -140,9 +167,47 @@ DEF_KERNEL_INITIALIZER(_topk_initializer)
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
    SAFE_FREE_TENSOR_ATTR(input_attr);
#undef SAFE_FREE_TENSOR_ATTR
    return status;
} /* _topk_initializer() */

DEF_KERNEL_INITIALIZER(_topk_odd_even_sort_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        2,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };
    vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
    vsi_size_array_t * in_shape = NULL;

    input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );

    in_shape = input_attr->shape;

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.local_size[0] = 32;
    gpu_param.local_size[1] = 1;
    gpu_param.global_size[0] = 32;
    gpu_param.global_size[1] = in_shape->data[1];
    status = vsi_nn_kernel_gpu_config( node, &gpu_param );

final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
    SAFE_FREE_TENSOR_ATTR(input_attr);
#undef SAFE_FREE_TENSOR_ATTR
    return status;
} /* _topk_odd_even_sort_initializer() */

/*
 * Query kernel

@@ -215,6 +280,72 @@ static vsi_status _query_kernel
    return status;
} /* _query_kernel() */

static vsi_status _query_odd_even_sort_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e in_dtype;
    vsi_nn_kernel_dtype_e out_dtype;
    const _kernel_map_type * kernel_map = _topk_odd_even_sort_kernel_map;
    size_t kernel_map_size = _cnt_of_array( _topk_odd_even_sort_kernel_map );
    vx_param_description_t * param_def = _topk_odd_even_sort_kernel_param_def;
    vx_kernel_initialize_f initializer = _topk_odd_even_sort_initializer;
#define _PACK_SELECT_KEY( in_type, out_type ) \
        ( (in_type) | (out_type << 8) )
    uint32_t key = 0;
    uint32_t i;

    in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
    {
        case _PACK_SELECT_KEY(F32, F32):
        case _PACK_SELECT_KEY(F16, F16):
            key = TOPK_ODD_EVEN_SORT_HASH_KEY( F32, F32 );
            break;
        case _PACK_SELECT_KEY(U32, U32):
        case _PACK_SELECT_KEY(U16, U16):
        case _PACK_SELECT_KEY(U8, U8):
            key = TOPK_ODD_EVEN_SORT_HASH_KEY( U32, U32 );
            break;
        case _PACK_SELECT_KEY(I32, I32):
        case _PACK_SELECT_KEY(I16, I16):
        case _PACK_SELECT_KEY(I8, I8):
            key = TOPK_ODD_EVEN_SORT_HASH_KEY( I32, I32 );
            break;
        default:
            break;
    }
#undef _PACK_SELECT_KEY

    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
    {
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }
    if ( i < (uint32_t)kernel_map_size )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = param_def;
        kernel->info.numParams = _cnt_of_array( _topk_odd_even_sort_kernel_param_def );
        kernel->info.initialize = initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
                kernel_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
                kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }
    return status;
} /* _query_odd_even_sort_kernel() */

static vsi_nn_kernel_node_t _setup
    (

@@ -228,16 +359,19 @@ static vsi_nn_kernel_node_t _setup
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_TOPK_PARAM_NUM];
    vsi_nn_kernel_node_param_t node_params[_TOPK_ODD_EVEN_SORT_PARAM_NUM];
    vsi_nn_kernel_node_t node = NULL;
    vsi_size_t block_size = inputs[0]->attr.size[0];
    vsi_size_t block_num = 1;
    uint32_t i = 0;
    vsi_nn_tensor_t* rs_tensors[3] = { NULL };
    vsi_nn_tensor_t* rs_tensors[5] = { NULL };
    vsi_nn_tensor_attr_t attr;
    vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }};
    int32_t width = (int32_t)block_size;
    int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k");
    int32_t num_stages = (int32_t)ceil(log10(block_size / 2.0f) / log10(2.0f));
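    /* num_stages is the pass count of the staged sort kernel,
     * ceil(log2(block_size / 2)); rows of up to 128 elements fit in fewer
     * than 7 stages, anything longer falls back to the odd-even sort
     * path selected below. */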
    vsi_bool is_odd_even_sort = FALSE;
    size_t param_num = _TOPK_PARAM_NUM;

    for (i = 1; i < inputs[0]->attr.dim_num; i ++)
    {

@@ -257,26 +391,58 @@ static vsi_nn_kernel_node_t _setup

    rs_tensors[0] = vsi_nn_reshape_tensor( graph,
            inputs[0], shape[0], 2 );
    rs_tensors[1] = vsi_nn_reshape_tensor( graph,
            outputs[0], shape[1], 2 );
    rs_tensors[2] = vsi_nn_reshape_tensor( graph,
            outputs[1], shape[1], 2 );

    status = _query_kernel( kernel, inputs, outputs, num_stages );
    if (num_stages < 7)
    {
        status = _query_kernel( kernel, inputs, outputs, num_stages );

        rs_tensors[1] = vsi_nn_reshape_tensor( graph,
                outputs[0], shape[1], 2 );
        rs_tensors[2] = vsi_nn_reshape_tensor( graph,
                outputs[1], shape[1], 2 );
    }
    else
    {
        status = _query_odd_even_sort_kernel( kernel, inputs, outputs );
        is_odd_even_sort = TRUE;
        param_num = _TOPK_ODD_EVEN_SORT_PARAM_NUM;

        memcpy( &attr, &(rs_tensors[0]->attr), sizeof(vsi_nn_tensor_attr_t) );
        rs_tensors[1] = vsi_nn_CreateTensor( graph, &attr );
        attr.dtype.vx_type = VSI_NN_TYPE_INT32;
        attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
        rs_tensors[2] = vsi_nn_CreateTensor( graph, &attr );

        rs_tensors[3] = vsi_nn_reshape_tensor( graph,
                outputs[0], shape[1], 2 );
        rs_tensors[4] = vsi_nn_reshape_tensor( graph,
                outputs[1], shape[1], 2 );

        input_num = 3;
    }
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _TOPK_PARAM_NUM,
                    rs_tensors, input_num, &rs_tensors[1], output_num );
            vsi_nn_kernel_node_pack_io( node_params, param_num,
                    rs_tensors, input_num, &rs_tensors[input_num], output_num );
            /* Pass parameters to node. */
            node_params[SCALAR_INPUT_NUM_STAGES] = vsi_nn_kernel_scalar_create(
                    graph, I32, &num_stages );
            node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create(
                    graph, I32, &width );
            status = vsi_nn_kernel_node_pass_param( node, node_params, _TOPK_PARAM_NUM );
            if (is_odd_even_sort)
            {
                node_params[SCALAR_INPUT_SIZE] = vsi_nn_kernel_scalar_create(
                        graph, I32, &width );
            }
            else
            {
                node_params[SCALAR_INPUT_NUM_STAGES] = vsi_nn_kernel_scalar_create(
                        graph, I32, &num_stages );
                node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create(
                        graph, I32, &width );
            }

            status = vsi_nn_kernel_node_pass_param( node, node_params, param_num );
            CHECK_STATUS_FAIL_GOTO( status, final );
        }
    }


@@ -284,13 +450,25 @@ final:
    vsi_safe_release_tensor(rs_tensors[0]);
    vsi_safe_release_tensor(rs_tensors[1]);
    vsi_safe_release_tensor(rs_tensors[2]);
    if (node_params[SCALAR_INPUT_NUM_STAGES])
    vsi_safe_release_tensor(rs_tensors[3]);
    vsi_safe_release_tensor(rs_tensors[4]);
    if (is_odd_even_sort)
    {
        vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_NUM_STAGES] );
        if (node_params[SCALAR_INPUT_SIZE])
        {
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SIZE] );
        }
    }
    if (node_params[SCALAR_INPUT_WIDTH])
    else
    {
        vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] );
        if (node_params[SCALAR_INPUT_NUM_STAGES])
        {
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_NUM_STAGES] );
        }
        if (node_params[SCALAR_INPUT_WIDTH])
        {
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] );
        }
    }

    return node;


@@ -0,0 +1,260 @@
/****************************************************************************
 *
 * Copyright (c) 2019 Vivante Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
#define _CPU_ARG_NUM (3)
#define _CPU_INPUT_NUM (1)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.cumsum")

DEF_KERNEL_EXECUTOR(_cumsum_exec)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VX_FAILURE;
    vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
    float * buffer[2] = { NULL };
    size_t out_elements = 0;
    vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
    int32_t i = 0;
    int32_t axisSize = 1, innerSize = 1, outerSize = 1;
    int32_t axis = 0, exclusive = 0, reverse = 0;

    tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
    tensors[1] = (vsi_nn_kernel_tensor_t)param[1];

    attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
    attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

    out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &exclusive);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &reverse);
    CHECK_STATUS_FAIL_GOTO(status, final );

    buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
    CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );

    buffer[1] = (float *)malloc( out_elements * sizeof(float) );
    CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final );
    memset( buffer[1], 0, out_elements * sizeof(float) );

    {
        int32_t dims_num = (int32_t)attr[1]->shape->size;
        int32_t inner = 0;
        int32_t outer = 0;

        for(i = 0; i < axis; ++i)
        {
            innerSize *= (int32_t)attr[0]->shape->data[i];
        }

        axisSize = (int32_t)attr[0]->shape->data[i++];

        for(; i < dims_num; ++i)
        {
            outerSize *= (int32_t)attr[0]->shape->data[i];
        }

        for ( outer = 0; outer < outerSize; ++outer)
        {
            for ( inner = 0; inner < innerSize; ++inner)
            {
                float sum = .0f;
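                /* The four branches below compute cumsum over an example
                 * row [1, 2, 3] as:
                 *   plain:               [1, 3, 6]
                 *   exclusive:           [0, 1, 3]
                 *   reverse:             [6, 5, 3]
                 *   exclusive + reverse: [5, 3, 0]
                 */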

                if (exclusive && reverse)
                {
                    int32_t idx_out = (outer * axisSize + axisSize - 1) * innerSize + inner;
                    buffer[1][idx_out] = sum;
                    for (i = axisSize - 1; i > 0; i--)
                    {
                        int32_t idx = (outer * axisSize + i) * innerSize + inner;
                        float value = buffer[0][idx];
                        idx_out = (outer * axisSize + i - 1) * innerSize + inner;
                        sum += value;
                        buffer[1][idx_out] = sum;
                    }
                }
                else if (exclusive)
                {
                    int32_t idx_out = outer * axisSize * innerSize + inner;
                    buffer[1][idx_out] = sum;
                    for (i = 0; i < axisSize - 1; ++i)
                    {
                        int32_t idx = (outer * axisSize + i) * innerSize + inner;
                        float value = buffer[0][idx];
                        idx_out = (outer * axisSize + i + 1) * innerSize + inner;
                        sum += value;
                        buffer[1][idx_out] = sum;
                    }
                }
                else if (reverse)
                {
                    for (i = axisSize - 1; i >= 0; i--)
                    {
                        int32_t idx = (outer * axisSize + i) * innerSize + inner;
                        float value = buffer[0][idx];
                        sum += value;
                        buffer[1][idx] = sum;
                    }
                }
                else
                {
                    for (i = 0; i < axisSize; ++i)
                    {
                        // i * innerSize + inner + outer * innerSize * axisSize
                        int32_t idx = (outer * axisSize + i) * innerSize + inner;
                        float value = buffer[0][idx];
                        sum += value;
                        buffer[1][idx] = sum;
                    }
                }
            }
        }
    }

    status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
            buffer[1], out_elements );
    CHECK_STATUS_FAIL_GOTO( status, final );

final:
    for ( i = 0; i < 2; i ++ )
    {
        if ( buffer[i] )
        {
            free( buffer[i] );
        }
    }
    for ( i = 0; i < _CPU_IO_NUM; i ++ )
    {
        if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
    }
    return status;
} /* _cumsum_exec() */
/*
 * Kernel params
 */
static vx_param_description_t _cumsum_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _CUMSUM_PARAM_NUM _cnt_of_array( _cumsum_kernel_param_def )

static vsi_status _query_kernel
    (
    vsi_nn_tensor_t* const* const inputs,
    vsi_nn_tensor_t* const* const outputs,
    vsi_nn_kernel_t* kernel
    )
{
    snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
    kernel->info.function = _cumsum_exec;
    kernel->info.parameters = _cumsum_kernel_param_def;
    kernel->info.numParams = _CUMSUM_PARAM_NUM;

    return VSI_SUCCESS;
} /* _query_kernel() */

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VX_FAILURE;
    vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
    vsi_nn_kernel_node_t node = NULL;

    status = _query_kernel( inputs, outputs, kernel );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            uint32_t index = 2;
            int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
            int32_t exclusive = vsi_nn_kernel_param_get_int32( params, "exclusive" );
            int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );

            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
                    inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );

            backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis );
            backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &exclusive );
            backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
            CHECK_STATUS( status );
            vsi_nn_kernel_scalar_release( &backend_params[2] );
            vsi_nn_kernel_scalar_release( &backend_params[3] );
            vsi_nn_kernel_scalar_release( &backend_params[4] );
        }
        else
        {
            status = VSI_FAILURE;
        }
    }
    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CPU( cumsum, _setup )

@@ -50,6 +50,9 @@ typedef enum
    UNARY_HGELU,
    UNARY_SELU,
    UNARY_CELU,
    UNARY_RCP,
    UNARY_SIGN,
    UNARY_SOFTSIGN,
} unary_type_e;


@@ -145,6 +148,21 @@ static float celu_eval(float x, float alpha)
    return positive + negative;
}

static float rcp_eval(float x)
{
    return 1 / x;
}

static float sign_eval(float x)
{
    return x > 0 ? 1.0f : x < 0 ? -1.0f : 0;
}

static float softsign_eval(float x)
{
    return x / (1.0f + vsi_abs(x));
}
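/* rcp, sign and softsign are the scalar reference implementations for the
 * new unary ops; softsign maps any finite input into (-1, 1), a cheaper
 * sigmoid-like saturation. */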

DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
    (
    vsi_nn_kernel_node_t node,

@@ -227,6 +245,15 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
            case UNARY_CELU:
                data = celu_eval(data, alpha);
                break;
            case UNARY_RCP:
                data = rcp_eval(data);
                break;
            case UNARY_SIGN:
                data = sign_eval(data);
                break;
            case UNARY_SOFTSIGN:
                data = softsign_eval(data);
                break;
            default:
                break;
        }

@@ -360,4 +387,7 @@ REGISTER_ELTWISE_UNARY_BACKEND_CPU( round, UNARY_ROUND )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_gelu, UNARY_HGELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( selu, UNARY_SELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( celu, UNARY_CELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( rcp, UNARY_RCP )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( sign, UNARY_SIGN )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( softsign, UNARY_SOFTSIGN )


@@ -0,0 +1,284 @@
/****************************************************************************
 *
 * Copyright (c) 2020 Vivante Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
#define _CPU_ARG_NUM (8)
#define _CPU_INPUT_NUM (1)
#define _CPU_OUTPUT_NUM (2)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.maxpoolwithargmax")

#define FP32_MIN (-3.4e38f)
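/* Seed value for the pooling max; effectively -FLT_MAX for fp32 data. */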

/*
 * Kernel params
 */
static vx_param_description_t _maxpoolwithargmax_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
    // Add kernel parameters here
};
#define _MAXPOOLWITHARGMAX_PARAM_NUM _cnt_of_array( _maxpoolwithargmax_kernel_param_def )

/*
 * Kernel function
 */
DEF_KERNEL_EXECUTOR(_maxpoolwithargmax_exec)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VX_FAILURE;
    vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
    float * buffer[_CPU_IO_NUM] = { NULL };
    size_t out_elements = 0;
    vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
    int32_t ksize_x = 0, ksize_y = 0, stride_x = 0, stride_y = 0;
    int32_t pad_left = 0, pad_right = 0, pad_top = 0, pad_bottom = 0;
    int32_t i = 0;

    tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
    tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
    tensors[2] = (vsi_nn_kernel_tensor_t)param[2];

    attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
    attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
    attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
    CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );

    out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &ksize_x);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &ksize_y);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &stride_x);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &stride_y);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &pad_left);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &pad_right);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &pad_top);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &pad_bottom);
    CHECK_STATUS_FAIL_GOTO(status, final );

    buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
    CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );

    buffer[1] = (float *)malloc( out_elements * sizeof(float) );
    CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final );
    memset( buffer[1], 0, out_elements * sizeof(float) );

    buffer[2] = (float *)malloc( out_elements * sizeof(float) );
    CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final );
    memset( buffer[2], 0, out_elements * sizeof(float) );

    {
        int32_t dims_num = (int32_t)attr[1]->shape->size;
        int32_t batch = dims_num > 3 ? (int32_t)attr[1]->shape->data[3] : 1;
        int32_t depth = dims_num > 2 ? (int32_t)attr[1]->shape->data[2] : 1;
        int32_t height_o = (int32_t)attr[1]->shape->data[1];
        int32_t width_o = (int32_t)attr[1]->shape->data[0];
        int32_t width = (int32_t)attr[0]->shape->data[0];
        int32_t height = (int32_t)attr[0]->shape->data[1];
        int32_t b = 0, d = 0, j = 0;
        int32_t output_base = 0;
        int32_t input_base = 0;

        for (b = 0; b < batch; b++)
        {
            for (d = 0; d < depth; d++)
            {
                output_base = b * depth * height_o * width_o + d * height_o * width_o;
                input_base = b * depth * height * width + d * height * width;
                for (j = 0; j < height_o; j++)
                {
                    for (i = 0; i < width_o; i++)
                    {
                        int32_t hstart = j * stride_y - pad_top;
                        int32_t wstart = i * stride_x - pad_left;
                        int32_t hend = vsi_nn_min(hstart + ksize_y, height);
                        int32_t wend = vsi_nn_min(wstart + ksize_x, width);
                        int32_t pool_index = output_base + j * width_o + i;
                        int32_t h = 0, w = 0;
                        int32_t index_max = 0;
                        float value_max = (float)FP32_MIN;

                        hstart = vsi_nn_max(hstart, 0);
                        wstart = vsi_nn_max(wstart, 0);
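                        /* Clamp the window to the image so padded border
                         * positions never contribute a value; hend/wend were
                         * already clipped to the input extent above. */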

                        for (h = hstart; h < hend; ++ h)
                        {
                            for (w = wstart; w < wend; ++ w)
                            {
                                int32_t index = input_base + h * width + w;
                                float data = buffer[0][index];

                                if (data > value_max)
                                {
                                    value_max = data;
                                    index_max = index;
                                }
                            }
                        }
                        buffer[1][pool_index] = value_max;
                        buffer[2][pool_index] = (float)index_max;
                    }
                }
            }
        }
    }

    status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
            buffer[1], out_elements );
    status |= vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2],
            buffer[2], out_elements );
    CHECK_STATUS_FAIL_GOTO( status, final );

final:
    for ( i = 0; i < _CPU_IO_NUM; i ++ )
    {
        if ( buffer[i] )
        {
            free( buffer[i] );
        }
    }
    for ( i = 0; i < _CPU_IO_NUM; i ++ )
    {
        if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
    }
    return status;
} /* _maxpoolwithargmax_exec() */

/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs
    /* Add extra params */
    )
{
    vsi_status status = VSI_FAILURE;
    snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
    kernel->info.function = _maxpoolwithargmax_exec;
    kernel->info.parameters = _maxpoolwithargmax_kernel_param_def;
    kernel->info.numParams = _MAXPOOLWITHARGMAX_PARAM_NUM;
    status = VSI_SUCCESS;
    return status;
} /* _query_kernel() */

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_MAXPOOLWITHARGMAX_PARAM_NUM];
    vsi_nn_kernel_node_t node = NULL;

    int32_t ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x");
    int32_t ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y");
    int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x");
    int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y");
    int32_t pad_left = vsi_nn_kernel_param_get_int32(params, "pad_left");
    int32_t pad_right = vsi_nn_kernel_param_get_int32(params, "pad_right");
    int32_t pad_top = vsi_nn_kernel_param_get_int32(params, "pad_top");
    int32_t pad_bottom = vsi_nn_kernel_param_get_int32(params, "pad_bottom");

    status = _query_kernel( kernel, inputs, outputs );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            int32_t index = 3;
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _MAXPOOLWITHARGMAX_PARAM_NUM,
                    inputs, input_num, outputs, output_num );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_left );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_right );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_top );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_bottom );
            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _MAXPOOLWITHARGMAX_PARAM_NUM );
            VSI_ASSERT( status == VSI_SUCCESS );
            vsi_nn_kernel_scalar_release( &node_params[3] );
            vsi_nn_kernel_scalar_release( &node_params[4] );
            vsi_nn_kernel_scalar_release( &node_params[5] );
            vsi_nn_kernel_scalar_release( &node_params[6] );
            vsi_nn_kernel_scalar_release( &node_params[7] );
            vsi_nn_kernel_scalar_release( &node_params[8] );
            vsi_nn_kernel_scalar_release( &node_params[9] );
            vsi_nn_kernel_scalar_release( &node_params[10] );
        }
    }
    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CPU( maxpoolwithargmax, _setup )

@@ -0,0 +1,247 @@
/****************************************************************************
 *
 * Copyright (c) 2020 Vivante Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
#define _INPUT_NUM (2)
#define _OUTPUT_NUM (1)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.mod")

/*
 * Kernel params
 */
static vx_param_description_t _mod_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _MOD_PARAM_NUM _cnt_of_array( _mod_kernel_param_def )

static vsi_ssize_t _expand_offset
    (
    vsi_ssize_t index,
    vsi_size_t * shape, vsi_size_t rank,
    vsi_size_t * strides, vsi_size_t * out_shape
    )
{
    vsi_size_t i;
    vsi_ssize_t offset = 0;

    for( i = 0; i < rank && index; i ++ )
    {
        if( shape[i] == out_shape[i] )
        {
            offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] );
        }
        index /= out_shape[i];
    }
    return offset;
}
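/* _expand_offset maps a flat output index back into a (possibly broadcast)
 * input: dimensions where the input matches the output advance by the input
 * stride, while size-1 input dimensions contribute nothing. E.g. for input
 * shape {4, 1} against output {4, 3}, index 7 (x = 3, y = 1) resolves to
 * input offset 3, re-reading row 0. */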

/*
 * Kernel function
 */
DEF_KERNEL_EXECUTOR(_compute)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    int32_t isfmod = 0;
    vsi_nn_kernel_dtype_e input0_dtype = F16;
    vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
    vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
    float* f32_in_buffer[_INPUT_NUM] = {NULL};
    float* f32_out_buffer[_OUTPUT_NUM] = {NULL};
    vsi_nn_kernel_tensor_attr_t* in_attr[_INPUT_NUM] = {NULL};
    vsi_nn_kernel_tensor_attr_t* out_attr[_OUTPUT_NUM] = {NULL};
    vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
    vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
    vsi_size_t out_elements[_OUTPUT_NUM] = {0};
    vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
    uint32_t i;

    /* prepare data */
    vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &isfmod);
    for (i = 0; i < _INPUT_NUM; i++) {
        input[i] = (vsi_nn_kernel_tensor_t)param[i];
        in_attr[i] = vsi_nn_kernel_tensor_attr_create(input[i]);
        vsi_nn_kernel_tensor_attr_get_stride(in_attr[i], in_stride_size[i]);
        f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer(input[i], in_attr[i], TRUE);
        CHECK_PTR_FAIL_GOTO(f32_in_buffer[i], "Create input0 buffer fail.", final);
    }

    input0_dtype = in_attr[0]->dtype;
    if (input0_dtype == F16 || input0_dtype == F32 || input0_dtype == BF16) {
        isfmod = 1;
    }

    for (i = 0; i < _OUTPUT_NUM; i++)
    {
        output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
        out_attr[i] = vsi_nn_kernel_tensor_attr_create(output[i]);
        vsi_nn_kernel_tensor_attr_get_stride(out_attr[i], out_stride_size[i]);
        out_elements[i] = vsi_nn_kernel_tensor_attr_get_size(out_attr[i]);
        out_bytes[i] = out_elements[i] * sizeof(float);
        f32_out_buffer[i] = (float*)malloc(out_bytes[i]);
        CHECK_PTR_FAIL_GOTO(f32_out_buffer[i], "Create output buffer fail.", final);
        memset(f32_out_buffer[i], 0, out_bytes[i]);
    }

    for (i = 0; i < out_elements[0]; i++)
    {
        vsi_ssize_t in0_offset = 0;
        vsi_ssize_t in1_offset = 0;
        float in0 = 0;
        float in1 = 0;

        in0_offset = _expand_offset( i, in_attr[0]->shape->data, (vsi_size_t)in_attr[0]->shape->size,
                in_stride_size[0], out_attr[0]->shape->data );
        in1_offset = _expand_offset( i, in_attr[1]->shape->data, (vsi_size_t)in_attr[1]->shape->size,
                in_stride_size[1], out_attr[0]->shape->data );
        in0 = f32_in_buffer[0][in0_offset];
        in1 = f32_in_buffer[1][in1_offset];
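        /* fmod keeps the sign of the dividend (C truncated remainder);
         * the floor form keeps the sign of the divisor, e.g. for
         * in0 = -7, in1 = 3: fmod yields -1, the floor form yields 2. */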
        if (isfmod)
        {
            f32_out_buffer[0][i] = (float)fmod(in0, in1);
        }
        else
        {
            f32_out_buffer[0][i] = in0 - in1 * (float)floor(in0 / in1);
        }
    }

    /* save data */
    for (i = 0; i < _OUTPUT_NUM; i++) {
        status = vsi_nn_kernel_tensor_write_from_float(
            output[i], out_attr[i], f32_out_buffer[i], out_elements[i]);
        CHECK_STATUS_FAIL_GOTO(status, final);
    }

final:
    for (i = 0; i < _INPUT_NUM; i++) {
        if (f32_in_buffer[i]) {
            free(f32_in_buffer[i]);
            f32_in_buffer[i] = NULL;
        }

        if (in_attr[i]) {
            vsi_nn_kernel_tensor_attr_release(&in_attr[i]);
        }
    }

    for (i = 0; i < _OUTPUT_NUM; i++) {
        if (f32_out_buffer[i]) {
            free(f32_out_buffer[i]);
            f32_out_buffer[i] = NULL;
        }

        if (out_attr[i]) {
            vsi_nn_kernel_tensor_attr_release(&out_attr[i]);
        }
    }

    return status;
} /* _compute() */


/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs
    )
{
    snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
    kernel->info.function = _compute;
    kernel->info.parameters = _mod_kernel_param_def;
    kernel->info.numParams = _cnt_of_array( _mod_kernel_param_def );

    return VSI_SUCCESS;
} /* _query_kernel() */


static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_MOD_PARAM_NUM];
    vsi_nn_kernel_node_t node = NULL;
    int32_t isfmod = vsi_nn_kernel_param_get_int32(params, "isfmod");

    status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _MOD_PARAM_NUM,
                    inputs, input_num, outputs, output_num );
            node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &isfmod );
            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _MOD_PARAM_NUM );
            vsi_nn_kernel_scalar_release( &node_params[3] );
        }
    }
    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CPU( mod, _setup )

@@ -55,8 +55,8 @@ __BEGIN_DECLS
static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},

@@ -90,12 +90,16 @@ DEF_KERNEL_EXECUTOR(_compute)
    uint32_t i = 0;
    int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0;
    float mean[3] = {0}, scale = 1;
    vsi_bool is_rgb888 = tensors[1] == NULL;

    for (i = 0; i < _CPU_IO_NUM; i++)
    {
        tensors[i] = (vsi_nn_kernel_tensor_t)param[i];
        attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] );
        CHECK_PTR_FAIL_GOTO( attr[i], "Create tensor attr buffer fail.", final );
        if (tensors[i])
        {
            attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] );
            CHECK_PTR_FAIL_GOTO( attr[i], "Create tensor attr buffer fail.", final );
        }
    }

    out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] );

@@ -113,8 +117,11 @@ DEF_KERNEL_EXECUTOR(_compute)

    for (i = 0; i < 3; i++)
    {
        buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE );
        CHECK_PTR_FAIL_GOTO( buffer[i], "Create input0 buffer fail.", final );
        if (tensors[i])
        {
            buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE );
            CHECK_PTR_FAIL_GOTO( buffer[i], "Create input0 buffer fail.", final );
        }

        buffer[i + 3] = (float *)malloc( out_elements * sizeof(float) );
        CHECK_PTR_FAIL_GOTO( buffer[i + 3], "Create output buffer fail.", final );

@@ -125,12 +132,17 @@ DEF_KERNEL_EXECUTOR(_compute)
    int32_t line1[2], line2[2];
    int32_t dx = 0, dy = 0, idx = 0;
    int32_t src_width = (int32_t)attr[0]->shape->data[0];
    int32_t src_height = (int32_t)attr[0]->shape->data[1];
    int32_t dst_width = (int32_t)attr[3]->shape->data[0];
    int32_t dst_height = (int32_t)attr[3]->shape->data[1];
    uint8_t result = 0;
    int32_t offset = 0;
    int32_t index = 0;

    for ( idx = 0; idx < 3; idx ++)
    {
        offset = is_rgb888 ? idx * src_width * src_height : 0;
        index = is_rgb888 ? 0 : idx;
|
||||
for ( dy = 0; dy < (int32_t)dst_height; dy ++)
|
||||
{
|
||||
for ( dx = 0; dx < (int32_t)dst_width; dx ++)
|
||||
|
|
@ -170,10 +182,10 @@ DEF_KERNEL_EXECUTOR(_compute)
|
|||
sy += yOffset;
|
||||
source_index = (sx + sy * src_width);
|
||||
|
||||
line1[0] = (int32_t)buffer[idx][source_index];
|
||||
line1[1] = (int32_t)buffer[idx][source_index + 1];
|
||||
line2[0] = (int32_t)buffer[idx][source_index + src_width];
|
||||
line2[1] = (int32_t)buffer[idx][source_index + src_width + 1];
|
||||
line1[0] = (int32_t)buffer[index][source_index + offset];
|
||||
line1[1] = (int32_t)buffer[index][source_index + 1 + offset];
|
||||
line2[0] = (int32_t)buffer[index][source_index + src_width + offset];
|
||||
line2[1] = (int32_t)buffer[index][source_index + src_width + 1 + offset];
|
||||
|
||||
temp1 = fx * (line1[1] - line1[0]) + (line1[0] << 10);
|
||||
temp2 = fx * (line2[1] - line2[0]) + (line2[0] << 10);
|
||||
|
|
@ -184,10 +196,10 @@ DEF_KERNEL_EXECUTOR(_compute)
|
|||
}
|
||||
else
|
||||
{
|
||||
int32_t offset = xOffset + yOffset * src_width;
|
||||
source_index = dx + dy * src_width + offset;
|
||||
finalVal = (buffer[0][source_index] - mean[idx]) * scale;
|
||||
buffer[1][output_index] = finalVal;
|
||||
int32_t ofset = xOffset + yOffset * src_width;
|
||||
source_index = dx + dy * src_width + ofset + offset;
|
||||
finalVal = (buffer[index][source_index] - mean[idx]) * scale;
|
||||
buffer[idx + 3][output_index] = finalVal;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
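
For reference, the << 10 arithmetic in the hunks above is 10-bit fixed-point bilinear blending. A sketch under the assumption that fx and fy are fractional weights scaled into [0, 1024); rounding offsets are omitted:

#include <stdint.h>

/* Horizontal blends produce Q10 values; the vertical blend then yields Q20,
 * so one >> 20 recovers the integer pixel. */
static uint8_t bilinear_q10(int32_t p00, int32_t p01,
                            int32_t p10, int32_t p11,
                            int32_t fx, int32_t fy)
{
    int32_t top    = fx * (p01 - p00) + (p00 << 10);
    int32_t bottom = fx * (p11 - p10) + (p10 << 10);
    int32_t out    = fy * (bottom - top) + (top << 10);
    return (uint8_t)(out >> 20);
}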

@ -209,16 +209,15 @@ DEF_KERNEL_EXECUTOR(_compute)
    for (n = 0; n < num_rois; n++)
    {
        uint32_t batchId = (uint32_t)f32_in_buffer[2][n];
        float scale = (in_attr[1]->dtype == U16) ? 0.125f : 1.0f;
        float qx1 = f32_in_buffer[1][n * kRoiDim];
        float qy1 = f32_in_buffer[1][n * kRoiDim + 1];
        float qx2 = f32_in_buffer[1][n * kRoiDim + 2];
        float qy2 = f32_in_buffer[1][n * kRoiDim + 3];

        float x1 = qx1 * scale;
        float x2 = qx2 * scale;
        float y1 = qy1 * scale;
        float y2 = qy2 * scale;
        float x1 = qx1;
        float x2 = qx2;
        float y1 = qy1;
        float y2 = qy2;
        float roi_anchor_x = x1 * width_scale;
        float roi_anchor_y = y1 * height_scale;
        float roi_dims_x = vsi_nn_max((x2 - x1) * width_scale, 1.0f);
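
Context for the removed multiplies above, as a sketch of the old behavior only: U16 ROI boxes carry three fractional bits, so scale was simply the dequantization factor 1/8.

/* Illustration only: what `x1 = qx1 * scale` computed before this change. */
static float dequant_roi_coord(float q, int roi_dtype_is_u16)
{
    const float scale = roi_dtype_is_u16 ? 0.125f : 1.0f;  /* 0.125 == 2^-3 */
    return q * scale;
}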

@ -0,0 +1,770 @@
/****************************************************************************
*
*    Copyright (c) 2019 Vivante Corporation
*
*    Permission is hereby granted, free of charge, to any person obtaining a
*    copy of this software and associated documentation files (the "Software"),
*    to deal in the Software without restriction, including without limitation
*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
*    and/or sell copies of the Software, and to permit persons to whom the
*    Software is furnished to do so, subject to the following conditions:
*
*    The above copyright notice and this permission notice shall be included in
*    all copies or substantial portions of the Software.
*
*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
*    DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */

#define KERNEL_SOURCE_1 "cumsum"
#define KERNEL_SOURCE_2 "cumsum_2d"
#define KERNEL_SOURCE_3 "cumsum_bf16"
#define KERNEL_SOURCE_4 "cumsum_f16_u8"

// Add kernel hashtable here
#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \
    ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))

#define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
    { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \
        CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
        SOURCE },

#define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
    { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \
        CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \
        SOURCE },
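
A quick sketch of how HASH_CUMSUM_HASH_KEY packs its fields; the dtype values below are illustrative placeholders, not the real vsi_nn_kernel_dtype_e constants.

/* axis -> bits 20+, input dtype -> bits 12..19, output dtype -> bits 4..11,
 * bit 0 -> 2D-image variant. */
uint32_t key = HASH_CUMSUM_HASH_KEY(1, 4, 4, 1);
/* == (1 << 20) | (4 << 12) | (4 << 4) | 1 == 0x00104041 */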

static const struct {
    uint32_t key;
    char* function_name;
    const char* source_name;
} cumsum_map[] =
{
    HASH_CUMSUM_KERNELS(0, U8,   U8,   KERNEL_SOURCE_1)
    HASH_CUMSUM_KERNELS(0, I8,   I8,   KERNEL_SOURCE_1)
    HASH_CUMSUM_KERNELS(0, I16,  I16,  KERNEL_SOURCE_1)
    HASH_CUMSUM_KERNELS(0, F16,  F16,  KERNEL_SOURCE_1)
    HASH_CUMSUM_KERNELS(0, BF16, BF16, KERNEL_SOURCE_3)
    HASH_CUMSUM_KERNELS(1, U8,   U8,   KERNEL_SOURCE_1)
    HASH_CUMSUM_KERNELS(1, I8,   I8,   KERNEL_SOURCE_1)
    HASH_CUMSUM_KERNELS(1, I16,  I16,  KERNEL_SOURCE_1)
    HASH_CUMSUM_KERNELS(1, F16,  F16,  KERNEL_SOURCE_1)
    HASH_CUMSUM_KERNELS(1, BF16, BF16, KERNEL_SOURCE_3)
    HASH_CUMSUM_KERNELS(2, U8,   U8,   KERNEL_SOURCE_1)
    HASH_CUMSUM_KERNELS(2, I8,   I8,   KERNEL_SOURCE_1)
    HASH_CUMSUM_KERNELS(2, I16,  I16,  KERNEL_SOURCE_1)
    HASH_CUMSUM_KERNELS(2, F16,  F16,  KERNEL_SOURCE_1)
    HASH_CUMSUM_KERNELS(2, BF16, BF16, KERNEL_SOURCE_3)
    HASH_CUMSUM_KERNELS_2D(0, U8,   U8,   KERNEL_SOURCE_2)
    HASH_CUMSUM_KERNELS_2D(0, I8,   I8,   KERNEL_SOURCE_2)
    HASH_CUMSUM_KERNELS_2D(0, I16,  I16,  KERNEL_SOURCE_2)
    HASH_CUMSUM_KERNELS_2D(0, F16,  F16,  KERNEL_SOURCE_2)
    HASH_CUMSUM_KERNELS_2D(0, BF16, BF16, KERNEL_SOURCE_3)
    HASH_CUMSUM_KERNELS_2D(1, U8,   U8,   KERNEL_SOURCE_2)
    HASH_CUMSUM_KERNELS_2D(1, I8,   I8,   KERNEL_SOURCE_2)
    HASH_CUMSUM_KERNELS_2D(1, I16,  I16,  KERNEL_SOURCE_2)
    HASH_CUMSUM_KERNELS_2D(1, F16,  F16,  KERNEL_SOURCE_2)
    HASH_CUMSUM_KERNELS_2D(1, BF16, BF16, KERNEL_SOURCE_3)
    HASH_CUMSUM_KERNELS(0, F16, U8,  KERNEL_SOURCE_4)
    HASH_CUMSUM_KERNELS(0, F16, I8,  KERNEL_SOURCE_4)
    HASH_CUMSUM_KERNELS(0, F16, I16, KERNEL_SOURCE_4)
    HASH_CUMSUM_KERNELS(1, F16, U8,  KERNEL_SOURCE_4)
    HASH_CUMSUM_KERNELS(1, F16, I8,  KERNEL_SOURCE_4)
    HASH_CUMSUM_KERNELS(1, F16, I16, KERNEL_SOURCE_4)
    HASH_CUMSUM_KERNELS(2, F16, U8,  KERNEL_SOURCE_4)
    HASH_CUMSUM_KERNELS(2, F16, I8,  KERNEL_SOURCE_4)
    HASH_CUMSUM_KERNELS(2, F16, I16, KERNEL_SOURCE_4)
    HASH_CUMSUM_KERNELS_2D(0, F16, U8,  KERNEL_SOURCE_4)
    HASH_CUMSUM_KERNELS_2D(0, F16, I8,  KERNEL_SOURCE_4)
    HASH_CUMSUM_KERNELS_2D(0, F16, I16, KERNEL_SOURCE_4)
    HASH_CUMSUM_KERNELS_2D(1, F16, U8,  KERNEL_SOURCE_4)
    HASH_CUMSUM_KERNELS_2D(1, F16, I8,  KERNEL_SOURCE_4)
    HASH_CUMSUM_KERNELS_2D(1, F16, I16, KERNEL_SOURCE_4)
};

/*
 * Kernel params
 */
static vx_param_description_t _cumsum_kernel_param_def[] =
{
    {VX_INPUT,  VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _CUMSUM_PARAM_NUM  _cnt_of_array( _cumsum_kernel_param_def )

/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_cumsum_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t shaderParam = {
        3,          // workdim
        {0, 0, 0},  // globalWorkOffset: controls the start location to be processed in the image
        {0, 0, 0},  // globalWorkScale: how many pixels can be processed by a single thread
        {0, 0, 0},  // localWorkSize: local group size in threads
        {0, 0, 0}}; // globalWorkSize: image size in threads

    int32_t axis = 0;
    int32_t width = 0;
    int32_t height = 0;
    int32_t channel = 0;
    int32_t w = 1;
    int32_t h = 1;
    int32_t c = 1;
    uint32_t dim = 1;
    vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
    vsi_size_array_t * input_shape = NULL;
    int32_t input_zp = 0;
    float input_scale = 1.0f;
    float output_zp = 0;
    float output_scale = 1.0f;
    float in_out_zp_scale = 1.0f;
    float in_out_scale = 1.0f;

    uint32_t pack_key = 0;

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis);
    CHECK_STATUS_FAIL_GOTO(status, OnError );

    if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        if (attr[0]->dfp.fl > 0)
        {
            input_scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
        }
        else
        {
            input_scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
        }
    }
    else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        input_scale = attr[0]->asymm.scale;
        input_zp = attr[0]->asymm.zero_point;
    }

    if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        if (attr[1]->dfp.fl > 0)
        {
            output_scale = (float)((int64_t)1 << attr[1]->dfp.fl);
        }
        else
        {
            output_scale = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl));
        }
    }
    else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        output_scale = 1.0f / attr[1]->asymm.scale;
        output_zp = (float)attr[1]->asymm.zero_point;
    }

    in_out_scale = input_scale * output_scale;
    in_out_zp_scale = (float)in_out_scale * input_zp;
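
A numeric sketch of the DFP scaling above, with hypothetical fractional lengths: a dynamic fixed-point tensor with fractional length fl represents value = raw * 2^-fl, and the output side uses the reciprocal so requantization collapses into the single in_out_scale multiply.

#include <stdio.h>

int main(void)
{
    int fl_in = 7, fl_out = 5;                        /* hypothetical values      */
    float input_scale  = 1.0f / (float)(1 << fl_in);  /* 2^-7                     */
    float output_scale = (float)(1 << fl_out);        /* 2^+5                     */
    float in_out_scale = input_scale * output_scale;  /* 2^(5-7) = 0.25           */
    printf("in_out_scale = %g\n", in_out_scale);      /* raw_out = raw_in * 0.25  */
    return 0;
}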

    input_shape = attr[0]->shape;
    dim = (uint32_t)input_shape->size;
    width = (int32_t)(input_shape->data[0]);
    height = (int32_t)(input_shape->data[1]);
    channel = (int32_t)(dim > 2 ? input_shape->data[2] : 1);

    if (axis == 0)
    {
        w = 1;
        h = height;
        c = channel;
    }
    else if (axis == 1)
    {
        w = width;
        h = 1;
        c = channel;
    }
    else if (axis == 2)
    {
        w = width;
        h = height;
        c = 1;
    }

    shaderParam.global_scale[0] = 8;
    if ((attr[0]->dtype == U8 || attr[0]->dtype == I8)
        && (axis > 0))
    {
        shaderParam.global_scale[0] = 16;
    }
    shaderParam.global_scale[1] = 1;
    shaderParam.global_scale[2] = 1;
    shaderParam.global_size[0] = (w + shaderParam.global_scale[0] - 1) / shaderParam.global_scale[0];
    shaderParam.global_size[1] = h;
    shaderParam.global_size[2] = c;
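
The accumulated axis is handled serially (its extent is forced to 1 above) while the other two axes are parallelized; the first global size is a ceiling division so partial vectors are still covered. Sketch:

/* e.g. w = 100, scale = 8  ->  (100 + 7) / 8 == 13 work-items */
static int ceil_div(int w, int scale)
{
    return (w + scale - 1) / scale;
}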

    status = vsi_nn_kernel_gpu_config( node, &shaderParam );
    CHECK_STATUS_FAIL_GOTO(status, OnError);

#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, AXIS, DIM) \
    (IN0_TYPE | (OUT_TYPE << 8) | (AXIS << 16) | (DIM << 24))

    pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, axis, dim);

    {
        uint16_t M0 = 0;
        int32_t postShift = 0;
        uint32_t multAndoutZP0[2] = {0};
        gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{
            0xdddddddd, // TCfg
            0x44444444, // ASelt
            0x13121110, 0x17161514, // ABin
            0x11111111, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00002600, // AccumType, ConstantType, and PostShift
            0x00000000, 0x00000000, 0x00000000, 0x00000000,
            0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };

        gpu_dp_inst_t uniAccSumVertF16toF16_2x8 = {{
            0x55555555, // TCfg
            0x44444444, // ASelt
            0x33221100, 0x77665544, // ABin
            0xaaaaaaaa, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000600, // AccumType, ConstantType, and PostShift
            0x00010001, 0x00010001, 0x00010001, 0x00010001,
            0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniAccSumVertU8toI32A_4x4 = {{
            0x0d0d0d0d, // TCfg
            0x04040404, // ASelt
            0x00110000, 0x00330022, // ABin
            0x02020202, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00002600, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000000, 0x00000001, 0x00000000,
            0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniAccSumVertU8toI32B_4x4 = {{
            0x0d0d0d0d, // TCfg
            0x04040404, // ASelt
            0x00150004, 0x00370026, // ABin
            0x02020202, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00002600, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000000, 0x00000001, 0x00000000,
            0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniAccSumVertU8toI32C_4x4 = {{
            0x0d0d0d0d, // TCfg
            0x04040404, // ASelt
            0x00190008, 0x003b002a, // ABin
            0x02020202, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00002600, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000000, 0x00000001, 0x00000000,
            0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniAccSumVertU8toI32D_4x4 = {{
            0x0d0d0d0d, // TCfg
            0x04040404, // ASelt
            0x001d000c, 0x003f002e, // ABin
            0x02020202, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00002600, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000000, 0x00000001, 0x00000000,
            0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };

        gpu_dp_inst_t uniSumHorzF16toF16A_4x4 = {{
            0x55150501, // TCfg
            0x00000000, // ASelt
            0x00100000, 0x32100210, // ABin
            0xaa2a0a02, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000100, // AccumType, ConstantType, and PostShift
            0x00003c00, 0x00000000, 0x3c003c00, 0x00000000,
            0x3c003c00, 0x00003c00, 0x3c003c00, 0x3c003c00 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniSumHorzF16toF16B_4x4 = {{
            0x55150501, // TCfg
            0x00000000, // ASelt
            0x00540004, 0x76540654, // ABin
            0xaa2a0a02, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000100, // AccumType, ConstantType, and PostShift
            0x00003c00, 0x00000000, 0x3c003c00, 0x00000000,
            0x3c003c00, 0x00003c00, 0x3c003c00, 0x3c003c00 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniSumHorzF16toF16C_2x8 = {{
            0x55551111, // TCfg
            0x00000000, // ASelt
            0x03020100, 0x37363534, // ABin
            0xaaaa2222, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000100, // AccumType, ConstantType, and PostShift
            0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
            0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniAccSumHorzF16toF16_2x8 = {{
            0x55555555, // TCfg
            0x44444444, // ASelt
            0x73727170, 0x77767574, // ABin
            0xaaaaaaaa, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000100, // AccumType, ConstantType, and PostShift
            0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00,
            0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00 // Constant
        }, GPU_DP_TYPE_16 };

        gpu_dp_inst_t uniSumHorzU8toI16A_4x4 = {{
            0x55150501, // TCfg
            0x00000000, // ASelt
            0x00100000, 0x32100210, // ABin
            0xaa2a0a02, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000700, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000000, 0x00010001, 0x00000000,
            0x00010001, 0x00000001, 0x00010001, 0x00010001 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniSumHorzU8toI16B_8x4 = {{
            0x05550155, 0x55551555, // TCfg
            0x00418820, 0x41882000, 0x8820000a, 0x20018a41, 0x398a4188, // BinSelect
            0x00000700, // AccumType, ConstantType, and PostShift
            0x01010101, 0x00000001, 0x01010101, 0x00000101,
            0x01010101, 0x00010101, 0x01010101, 0x01010101 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniSubZpI16toI16_2x8 = {{
            0x99999999, // TCfg
            0x44444444, // ASelt
            0x03020100, 0x07060504, // ABin
            0xaaaaaaaa, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000600, // AccumType, ConstantType, and PostShift
            0x00010001, 0x00020001, 0x00030001, 0x00040001,
            0x00050001, 0x00060001, 0x00070001, 0x00080001 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniAccSumHorzI16toI32A_4x4 = {{
            0x0d0d0d0d, // TCfg
            0x04040404, // ASelt
            0x00310030, 0x00330032, // ABin
            0x02020202, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00002600, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000000, 0x00000001, 0x00000000,
            0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };
        gpu_dp_inst_t uniAccSumHorzI16toI32B_4x4 = {{
            0x0d0d0d0d, // TCfg
            0x04040404, // ASelt
            0x00350034, 0x00370036, // ABin
            0x02020202, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00002600, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000000, 0x00000001, 0x00000000,
            0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };

        gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
            0x33333333, // TCfg
            0x11110000, // ASelt
            0x03020100, 0x03020100, // ABin
            0x00000000, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00002400, // AccumType, ConstantType, and PostShift
            0x00000000, 0x00000000, 0x00000000, 0x00000000,
            0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
        }, GPU_DP_TYPE_16 };

        gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
            0x11111111, // TCfg
            0x01010101, // ASelt
            0x01050004, 0x03070206, // ABin
            0x22222222, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000600, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000001, 0x00000001, 0x00000001,
            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
            0x11111111, // TCfg
            0x01010101, // ASelt
            0x05050404, 0x07070606, // ABin
            0x22222222, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000600, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000001, 0x00000001, 0x00000001,
            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniExtractOddData_2x8 = {{
            0x11111111, // TCfg
            0x11110000, // ASelt
            0x07050301, 0x07050301, // ABin
            0x22222222, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000600, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000001, 0x00000001, 0x00000001,
            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
        }, GPU_DP_TYPE_16};

        gpu_dp_inst_t uniSetZeroF16_2x8 = {{
            0x11111111, // TCfg
            0x00000000, // ASelt
            0x03020100, 0x07060504, // ABin
            0x22222222, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000100, // AccumType, ConstantType, and PostShift
            0x00000000, 0x00000000, 0x00000000, 0x00000000,
            0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
        }, GPU_DP_TYPE_16};

        gpu_quantize_multiplier_16bit( (double)input_scale * output_scale, &M0, &postShift);
        multAndoutZP0[0] = (uint32_t)(M0);
        multAndoutZP0[1] = (uint32_t)((attr[1]->asymm.zero_point << postShift) - input_zp * M0);
        gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift );
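
A self-contained sketch of the 16-bit requantization set up above; the quantizer below is a hypothetical stand-in for the driver's gpu_quantize_multiplier_16bit. The float ratio is split into a uint16 multiplier plus a post-shift, and multAndoutZP0[1] folds both zero points into one multiply-add.

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in: normalize ratio into [2^14, 2^15) as M0 / 2^postShift. */
static void quantize_multiplier_16bit(double ratio, uint16_t *M0, int32_t *postShift)
{
    int32_t shift = 0;
    while (ratio < (double)(1 << 14) && shift < 31) { ratio *= 2.0; shift++; }
    *M0 = (uint16_t)lround(ratio);
    *postShift = shift;
}

int main(void)
{
    uint16_t M0 = 0; int32_t postShift = 0;
    quantize_multiplier_16bit(0.25, &M0, &postShift);   /* input_scale * output_scale */
    int32_t in = 200, in_zp = 128, out_zp = 10;
    /* out = ((in - in_zp) * M0 >> postShift) + out_zp, which is what
     * multAndoutZP0[1] = (out_zp << postShift) - in_zp * M0 encodes. */
    int32_t out = (((in - in_zp) * M0) >> postShift) + out_zp;
    printf("M0=%u postShift=%d out=%d\n", M0, postShift, out);  /* out == 28 */
    return 0;
}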

        status = vsi_nn_kernel_gpu_add_param(node, "width", &width);
        status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
        CHECK_STATUS_FAIL_GOTO(status, OnError );

        switch( pack_key )
        {
        case _PACK_SELECT_KEY( U8,  U8,  2, 3):
        case _PACK_SELECT_KEY( I8,  I8,  2, 3):
        case _PACK_SELECT_KEY( I16, I16, 2, 3):
        case _PACK_SELECT_KEY( F16, F16, 2, 3):
            {
                status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel);
                status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp);
                status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
                status |= vsi_nn_kernel_gpu_add_param(node, "in_out_scale", &in_out_scale);
                status |= vsi_nn_kernel_gpu_add_param(node, "in_out_zp_scale", &in_out_zp_scale);
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniAccSumVertF16toF16_2x8", &uniAccSumVertF16toF16_2x8 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniAccSumVertU8toI32A_4x4", &uniAccSumVertU8toI32A_4x4 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniAccSumVertU8toI32B_4x4", &uniAccSumVertU8toI32B_4x4 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniAccSumVertU8toI32C_4x4", &uniAccSumVertU8toI32C_4x4 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniAccSumVertU8toI32D_4x4", &uniAccSumVertU8toI32D_4x4 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniSumHorzF16toF16A_4x4", &uniSumHorzF16toF16A_4x4 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniSumHorzF16toF16B_4x4", &uniSumHorzF16toF16B_4x4 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniSumHorzF16toF16C_2x8", &uniSumHorzF16toF16C_2x8 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniAccSumHorzF16toF16_2x8", &uniAccSumHorzF16toF16_2x8 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniSumHorzU8toI16A_4x4", &uniSumHorzU8toI16A_4x4 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniSumHorzU8toI16B_8x4", &uniSumHorzU8toI16B_8x4 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniSubZpI16toI16_2x8", &uniSubZpI16toI16_2x8 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniAccSumHorzI16toI32A_4x4", &uniAccSumHorzI16toI32A_4x4 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniAccSumHorzI16toI32B_4x4", &uniAccSumHorzI16toI32B_4x4 );
                status |= vsi_nn_kernel_gpu_add_param(
                    node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8);
                CHECK_STATUS_FAIL_GOTO(status, OnError );
            }
            break;
        case _PACK_SELECT_KEY( U8,  U8,  0, 2):
        case _PACK_SELECT_KEY( U8,  U8,  1, 2):
        case _PACK_SELECT_KEY( U8,  U8,  0, 3):
        case _PACK_SELECT_KEY( U8,  U8,  1, 3):
        case _PACK_SELECT_KEY( I8,  I8,  0, 2):
        case _PACK_SELECT_KEY( I8,  I8,  1, 2):
        case _PACK_SELECT_KEY( I8,  I8,  0, 3):
        case _PACK_SELECT_KEY( I8,  I8,  1, 3):
        case _PACK_SELECT_KEY( I16, I16, 0, 2):
        case _PACK_SELECT_KEY( I16, I16, 1, 2):
        case _PACK_SELECT_KEY( I16, I16, 0, 3):
        case _PACK_SELECT_KEY( I16, I16, 1, 3):
        case _PACK_SELECT_KEY( F16, F16, 0, 2):
        case _PACK_SELECT_KEY( F16, F16, 1, 2):
        case _PACK_SELECT_KEY( F16, F16, 0, 3):
        case _PACK_SELECT_KEY( F16, F16, 1, 3):
            {
                status = vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp);
                status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
                status |= vsi_nn_kernel_gpu_add_param(node, "in_out_scale", &in_out_scale);
                status |= vsi_nn_kernel_gpu_add_param(node, "in_out_zp_scale", &in_out_zp_scale);
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniAccSumVertF16toF16_2x8", &uniAccSumVertF16toF16_2x8 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniAccSumVertU8toI32A_4x4", &uniAccSumVertU8toI32A_4x4 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniAccSumVertU8toI32B_4x4", &uniAccSumVertU8toI32B_4x4 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniAccSumVertU8toI32C_4x4", &uniAccSumVertU8toI32C_4x4 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniAccSumVertU8toI32D_4x4", &uniAccSumVertU8toI32D_4x4 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniSumHorzF16toF16A_4x4", &uniSumHorzF16toF16A_4x4 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniSumHorzF16toF16B_4x4", &uniSumHorzF16toF16B_4x4 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniSumHorzF16toF16C_2x8", &uniSumHorzF16toF16C_2x8 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniAccSumHorzF16toF16_2x8", &uniAccSumHorzF16toF16_2x8 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniSumHorzU8toI16A_4x4", &uniSumHorzU8toI16A_4x4 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniSumHorzU8toI16B_8x4", &uniSumHorzU8toI16B_8x4 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniSubZpI16toI16_2x8", &uniSubZpI16toI16_2x8 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniAccSumHorzI16toI32A_4x4", &uniAccSumHorzI16toI32A_4x4 );
                status |= vsi_nn_kernel_gpu_add_param( node,
                    "uniAccSumHorzI16toI32B_4x4", &uniAccSumHorzI16toI32B_4x4 );
                status |= vsi_nn_kernel_gpu_add_param(
                    node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8);
                CHECK_STATUS_FAIL_GOTO(status, OnError );
            }
            break;
        case _PACK_SELECT_KEY( BF16, BF16, 0, 2):
        case _PACK_SELECT_KEY( BF16, BF16, 1, 2):
        case _PACK_SELECT_KEY( BF16, BF16, 0, 3):
        case _PACK_SELECT_KEY( BF16, BF16, 1, 3):
        case _PACK_SELECT_KEY( BF16, BF16, 2, 3):
            {
                status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel);
                status |= vsi_nn_kernel_gpu_add_param(
                    node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
                status |= vsi_nn_kernel_gpu_add_param(
                    node, "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8);
                status |= vsi_nn_kernel_gpu_add_param(
                    node, "uniExtractOddData_2x8", &uniExtractOddData_2x8);
                CHECK_STATUS_FAIL_GOTO(status, OnError );
            }
            break;
        case _PACK_SELECT_KEY( F16, U8,  0, 2):
        case _PACK_SELECT_KEY( F16, U8,  1, 2):
        case _PACK_SELECT_KEY( F16, U8,  0, 3):
        case _PACK_SELECT_KEY( F16, U8,  1, 3):
        case _PACK_SELECT_KEY( F16, U8,  2, 3):
        case _PACK_SELECT_KEY( F16, I8,  0, 2):
        case _PACK_SELECT_KEY( F16, I8,  1, 2):
        case _PACK_SELECT_KEY( F16, I8,  0, 3):
        case _PACK_SELECT_KEY( F16, I8,  1, 3):
        case _PACK_SELECT_KEY( F16, I8,  2, 3):
        case _PACK_SELECT_KEY( F16, I16, 0, 2):
        case _PACK_SELECT_KEY( F16, I16, 1, 2):
        case _PACK_SELECT_KEY( F16, I16, 0, 3):
        case _PACK_SELECT_KEY( F16, I16, 1, 3):
        case _PACK_SELECT_KEY( F16, I16, 2, 3):
            {
                status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel);
                status |= vsi_nn_kernel_gpu_add_param(
                    node, "uniAccSumVertF16toF16_2x8", &uniAccSumVertF16toF16_2x8);
                status |= vsi_nn_kernel_gpu_add_param(
                    node, "uniSumHorzF16toF16A_4x4", &uniSumHorzF16toF16A_4x4);
                status |= vsi_nn_kernel_gpu_add_param(
                    node, "uniSumHorzF16toF16B_4x4", &uniSumHorzF16toF16B_4x4);
                status |= vsi_nn_kernel_gpu_add_param(
                    node, "uniSumHorzF16toF16C_2x8", &uniSumHorzF16toF16C_2x8);
                status |= vsi_nn_kernel_gpu_add_param(
                    node, "uniAccSumHorzF16toF16_2x8", &uniAccSumHorzF16toF16_2x8);
                status |= vsi_nn_kernel_gpu_add_param(
                    node, "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8);
                status |= vsi_nn_kernel_gpu_add_param(
                    node, "multAndoutZP0", &multAndoutZP0);
                status |= vsi_nn_kernel_gpu_add_param(
                    node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8);
                CHECK_STATUS_FAIL_GOTO(status, OnError );
            }
            break;
        default:
            break;
        }
    }
#undef _PACK_SELECT_KEY

OnError:
    if (attr[0])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
        attr[0] = NULL;
    }
    if (attr[1])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[1] );
        attr[1] = NULL;
    }

    return status;
}

/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_tensor_t* const* const inputs,
    vsi_nn_tensor_t* const* const outputs,
    vsi_nn_kernel_t* kernel,
    const vsi_nn_kernel_param_t * params,
    int32_t axis,
    int32_t is_2d
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e input0_dtype = U8;
    vsi_nn_kernel_dtype_e output_dtype = U8;
    uint32_t key = 0;
    int i = 0;

    input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d);

    for( i = 0; i < _cnt_of_array(cumsum_map); i ++ )
    {
        if ( cumsum_map[i].key == key )
        {
            break;
        }
    }
    if ( i < _cnt_of_array(cumsum_map) )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", cumsum_map[i].function_name );
        kernel->info.parameters = _cumsum_kernel_param_def;
        kernel->info.numParams = _cnt_of_array( _cumsum_kernel_param_def );
        kernel->info.initialize = _cumsum_initializer;

        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
                "vsi_nn_kernel_header",
                cumsum_map[i].source_name );
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
                cumsum_map[i].source_name );
        status = VSI_SUCCESS;
    }
    return status;
} /* _query_kernel() */


static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t              * graph,
    vsi_nn_tensor_t            ** inputs,
    size_t                        input_num,
    vsi_nn_tensor_t            ** outputs,
    size_t                        output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t             * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t tmp_params[_CUMSUM_PARAM_NUM] = { NULL };
    vsi_nn_kernel_node_t node = NULL;
    vsi_size_t shapes[1][VSI_NN_MAX_DIM_NUM] = {{0}};
    vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
    int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
    int32_t exclusive = vsi_nn_kernel_param_get_int32( params, "exclusive" );
    int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
    int32_t axis_new = 0;
    int32_t is_2d = 0;
    uint32_t rs_dim = 2;
    int32_t i = 0;

    vsi_nn_kernel_optimize_softmax_shape(
            inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
            shapes[0], &rs_dim, &axis_new);
    if (exclusive || reverse || rs_dim > 3)
    {
        return NULL;
    }

    if (rs_dim == 2)
    {
        is_2d = 1;
    }

    reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
            inputs[0], shapes[0], (vsi_size_t)rs_dim );
    reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
            outputs[0], shapes[0], (vsi_size_t)rs_dim );

    status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d);
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            uint32_t index = 2;

            /* Pass parameters to node. */
            vsi_nn_kernel_node_pack_io( tmp_params, _CUMSUM_PARAM_NUM,
                    reshape_tensors, 1, &reshape_tensors[1], 1 );
            tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_new );
            tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &exclusive );
            tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
            status = vsi_nn_kernel_node_pass_param( node, tmp_params, _CUMSUM_PARAM_NUM );
            vsi_nn_kernel_scalar_release( &tmp_params[2] );
            vsi_nn_kernel_scalar_release( &tmp_params[3] );
            vsi_nn_kernel_scalar_release( &tmp_params[4] );
        }
    }

    for (i = 0; i < 2; i++)
    {
        vsi_safe_release_tensor(reshape_tensors[i]);
    }

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_EVIS( cumsum, _setup )

@ -53,6 +53,9 @@ typedef enum
    UNARY_HGELU,
    UNARY_SELU,
    UNARY_CELU,
    UNARY_RCP,
    UNARY_SIGN,
    UNARY_SOFTSIGN,
} unary_type_e;

/*

@ -94,6 +97,34 @@ typedef enum
#define HGELU_OPERATION    hard_gelu
#define SELU_OPERATION     selu
#define CELU_OPERATION     celu
#define RCP_OPERATION      rcp
#define SIGN_OPERATION     sign
#define SOFTSIGN_OPERATION softsign

#define ADD_UNARY_SH_KERNELS(name, source) \
    TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, BF16, BF16, source##_3D) \
    TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, BF16, BF16, source##_2D) \
    TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16,  F16,  source##_3D) \
    TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16,  F16,  source##_2D) \
    TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16,  I16,  source##_3D) \
    TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16,  I16,  source##_2D) \
    TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16,  U8,   source##_3D) \
    TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16,  U8,   source##_2D) \
    TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16,  I8,   source##_3D) \
    TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16,  I8,   source##_2D) \
    TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I16,  I16,  source##_3D) \
    TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I16,  I16,  source##_2D) \
    TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I16,  F16,  source##_3D) \
    TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I16,  F16,  source##_2D) \
    TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I8,   I8,   source##_3D) \
    TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I8,   I8,   source##_2D) \
    TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I8,   F16,  source##_3D) \
    TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I8,   F16,  source##_2D) \
    TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, U8,   U8,   source##_3D) \
    TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, U8,   U8,   source##_2D) \
    TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, U8,   F16,  source##_3D) \
    TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, U8,   F16,  source##_2D) \
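
For reference, a sketch of what one invocation of the helper macro above contributes to the table that follows (two of its twenty-two entries shown):

/* ADD_UNARY_SH_KERNELS(SIN, KERNEL_SOURCE1) expands (via token pasting) to
 * entries such as:
 *   TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, BF16, BF16, KERNEL_SOURCE1_3D)
 *   TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, BF16, BF16, KERNEL_SOURCE1_2D)
 * i.e. one 3D and one 2D map entry per supported dtype pair. */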

static const struct {
    uint32_t key;

@ -101,269 +132,22 @@ static const struct {
    const char* source_name;
} _eltwise_unary_evis_kernel_map[] =
{
    TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, I16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, U8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, I8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I16, I16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I16, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, U8, U8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, U8, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I8, I8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I8, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, BF16, BF16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, I16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, U8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, I8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I16, I16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I16, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, U8, U8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, U8, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I8, I8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I8, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, BF16, BF16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, I16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, U8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, I8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I16, I16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I16, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, U8, U8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, U8, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I8, I8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I8, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, BF16, BF16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, I16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, U8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, I8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I16, I16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I16, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, U8, U8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, U8, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I8, I8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I8, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, BF16, BF16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, I16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, U8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, I8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I16, I16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I16, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, U8, U8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, U8, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I8, I8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I8, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, BF16, BF16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, I16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, U8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, I8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I16, I16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I16, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, U8, U8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, U8, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I8, I8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I8, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, BF16, BF16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, I16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, U8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, I8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I16, I16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I16, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, U8, U8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, U8, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I8, I8, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I8, F16, KERNEL_SOURCE1_3D)
    TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, BF16, BF16, KERNEL_SOURCE1_3D)
    ADD_UNARY_SH_KERNELS(SIN, KERNEL_SOURCE1)
    ADD_UNARY_SH_KERNELS(COS, KERNEL_SOURCE1)
    ADD_UNARY_SH_KERNELS(EXP, KERNEL_SOURCE1)
    ADD_UNARY_SH_KERNELS(LOG, KERNEL_SOURCE1)
    ADD_UNARY_SH_KERNELS(SELU, KERNEL_SOURCE1)
    ADD_UNARY_SH_KERNELS(CELU, KERNEL_SOURCE1)
    ADD_UNARY_SH_KERNELS(NEG, KERNEL_SOURCE1)
    ADD_UNARY_SH_KERNELS(RCP, KERNEL_SOURCE1)
    ADD_UNARY_SH_KERNELS(SIGN, KERNEL_SOURCE1)
    ADD_UNARY_SH_KERNELS(SOFTSIGN, KERNEL_SOURCE1)

    TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, I16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, U8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, I8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I16, I16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I16, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, I8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, BF16, BF16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, U8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, I16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, I8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, BF16, BF16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, U8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I16, I16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I16, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I8, I8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I8, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, BF16, BF16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, I16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, U8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, I8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I16, I16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I16, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I8, I8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I8, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, BF16, BF16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, I16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, U8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, I8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I16, I16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I16, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, U8, U8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, U8, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I8, I8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I8, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, BF16, BF16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, I16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, U8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, I8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I16, I16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I16, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I8, I8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I8, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, BF16, BF16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, I16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, U8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, I8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I16, I16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I16, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, U8, U8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, U8, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I8, I8, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I8, F16, KERNEL_SOURCE1_2D)
    TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, BF16, BF16, KERNEL_SOURCE1_2D)

    TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, U8, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I8, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, I16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, F16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, F16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, I8, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, F16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, BF16, BF16, KERNEL_SOURCE0_3D)

    TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, U8, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I8, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, I16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, F16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, F16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, I8, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, F16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, BF16, BF16, KERNEL_SOURCE0_2D)

    TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, F16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, I16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, U8, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, I8, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I16, I16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I16, F16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, U8, U8, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, U8, F16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I8, I8, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I8, F16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, BF16, BF16, KERNEL_SOURCE0_3D)

    TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, F16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, I16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, U8, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, I8, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I16, I16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I16, F16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, F16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, I8, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, F16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, BF16, BF16, KERNEL_SOURCE0_2D)

    TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, F16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, I16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, U8, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, I8, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I16, I16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I16, F16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, U8, U8, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, U8, F16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I8, I8, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I8, F16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16, KERNEL_SOURCE0_3D)

    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, U8, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I8, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, I16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, F16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, F16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, I8, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, F16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16, KERNEL_SOURCE0_2D)

    TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, F16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, I16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, U8, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, I8, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I16, I16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I16, F16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, U8, U8, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, U8, F16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I8, I8, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I8, F16, KERNEL_SOURCE0_3D)
    TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, BF16, BF16, KERNEL_SOURCE0_3D)

    TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, F16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, U8, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I8, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, I16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, F16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, F16, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, I8, KERNEL_SOURCE0_2D)
    TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, F16, KERNEL_SOURCE0_2D)
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, BF16, BF16, KERNEL_SOURCE0_2D)
|
||||
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, F16, KERNEL_SOURCE0_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I16, KERNEL_SOURCE0_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, U8, KERNEL_SOURCE0_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I8, KERNEL_SOURCE0_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, I16, KERNEL_SOURCE0_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, F16, KERNEL_SOURCE0_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8, KERNEL_SOURCE0_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, F16, KERNEL_SOURCE0_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, I8, KERNEL_SOURCE0_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, F16, KERNEL_SOURCE0_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, BF16, BF16, KERNEL_SOURCE0_2D)
|
||||
ADD_UNARY_SH_KERNELS(HSIGMOID, KERNEL_SOURCE0)
|
||||
ADD_UNARY_SH_KERNELS(MISH, KERNEL_SOURCE0)
|
||||
ADD_UNARY_SH_KERNELS(ROUND, KERNEL_SOURCE0)
|
||||
ADD_UNARY_SH_KERNELS(GELU, KERNEL_SOURCE0)
|
||||
ADD_UNARY_SH_KERNELS(HGELU, KERNEL_SOURCE0)
|
||||
};
|
||||
|
||||
#undef SIN_OPERATION
|
||||
|
|
@ -378,6 +162,9 @@ static const struct {
|
|||
#undef GELU_OPERATION
|
||||
#undef HGELU_OPERATION
|
||||
#undef CELU_OPERATION
|
||||
#undef RCP_OPERATION
|
||||
#undef SIGN_OPERATION
|
||||
#undef SOFTSIGN_OPERATION
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
|
|
@ -509,6 +296,9 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
|
|||
case _PACK_SELECT_KEY( UNARY_GELU, BF16, BF16 ):
|
||||
case _PACK_SELECT_KEY( UNARY_HGELU, BF16, BF16 ):
|
||||
case _PACK_SELECT_KEY( UNARY_CELU, BF16, BF16 ):
|
||||
case _PACK_SELECT_KEY( UNARY_RCP, BF16, BF16 ):
|
||||
case _PACK_SELECT_KEY( UNARY_SIGN, BF16, BF16 ):
|
||||
case _PACK_SELECT_KEY( UNARY_SOFTSIGN, BF16, BF16 ):
|
||||
{
|
||||
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
|
||||
0x11111111, // TCfg
|
||||
|
|
@ -815,5 +605,8 @@ REGISTER_ELTWISE_UNARY_BACKEND_EVIS( gelu, UNARY_GELU )
|
|||
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_gelu, UNARY_HGELU )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( selu, UNARY_SELU )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( celu, UNARY_CELU )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( rcp, UNARY_RCP )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( sign, UNARY_SIGN )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( softsign, UNARY_SOFTSIGN )
|
||||
|
||||
__END_DECLS
|
||||
|
|
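The _PACK_SELECT_KEY cases in the initializer above dispatch a single switch on the (operation, input dtype, output dtype) triple. A minimal sketch of that pattern, with an assumed bit layout since the macro's actual definition sits outside this hunk:

/* Sketch only; assumes operation and dtype enums each fit in 8 bits. */
#define _PACK_SELECT_KEY(OP, IN_DTYPE, OUT_DTYPE) \
    (((OP) << 16) | ((IN_DTYPE) << 8) | (OUT_DTYPE))

switch (_PACK_SELECT_KEY(op, in_dtype, out_dtype))
{
    case _PACK_SELECT_KEY(UNARY_GELU, BF16, BF16):
        /* BF16 in/out: pick the BF16<->F32 conversion dp units */
        break;
    default:
        break;
}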

@@ -222,7 +222,7 @@ static vsi_status get_gather_tensor_reshape_size
uint32_t i = 0;
vsi_size_t elementCnt = 1;
vsi_size_t outerCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH

for(i = 0; i < dims_num - batch_dims; ++i)
{

@@ -751,7 +751,7 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_t * kernel
)
{
#define VSI_NN_MAX_BLOCK_SIZE (65536)
#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_GATHER_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;

@@ -795,12 +795,6 @@ static vsi_nn_kernel_node_t _setup
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[2], rs_dim );

if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}

status = _query_kernel( inputs, outputs, kernel, params, axis0_flg, is_array, is_batch);
if ( VSI_SUCCESS == status)
{

@@ -136,7 +136,7 @@ static vsi_status get_gather_nd_tensor_reshape_size
vsi_size_t *input_size = inputs[0]->attr.size;
uint32_t i = 0;
vsi_size_t elementCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH

newDim[0] = 0;
for(i = 0; i < dims_num; ++i)

File diff suppressed because it is too large

@@ -44,7 +44,7 @@ __BEGIN_DECLS
typedef enum _grucell_nn_activation_type_e
{
SIGMOID = VSI_NN_ACT_SIGMOID,
HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID,
HSIGMOID = VSI_NN_ACT_HARD_SIGMOID,
}grucell_nn_activation_type_e;

#define _GRUCELL_ACTIVATION_Z_H_KERNEL_SOURCE "grucell_activation_z_h"

@@ -72,6 +72,10 @@ static const _kernel_map_type _grucell_activation_z_h_kernel_map[] =
PACK_KERNEL_MAP( I8, F16, I8, SIGMOID ),
PACK_KERNEL_MAP( I16, F16, I16, SIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( U8, F16, U8, HSIGMOID ),
PACK_KERNEL_MAP( I8, F16, I8, HSIGMOID ),
PACK_KERNEL_MAP( I16, F16, I16, HSIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ),
};

/*

@@ -22,7 +22,6 @@
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>

@@ -45,7 +44,7 @@ __BEGIN_DECLS
typedef enum _grucell_nn_activation_type_e
{
SIGMOID = VSI_NN_ACT_SIGMOID,
HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID,
HSIGMOID = VSI_NN_ACT_HARD_SIGMOID,
}grucell_nn_activation_type_e;

#define _GRUCELL_H_TIMES_ACTIVATION_R_KERNEL_SOURCE "grucell_h_times_activation_r"

@@ -72,9 +71,12 @@ static const _kernel_map_type _grucell_h_times_activation_r_kernel_map[] =
PACK_KERNEL_MAP( I8, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( I16, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( U8, F16, F16, HSIGMOID ),
PACK_KERNEL_MAP( I8, F16, F16, HSIGMOID ),
PACK_KERNEL_MAP( I16, F16, F16, HSIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ),
};


/*
 * Kernel params
 */

@@ -256,8 +258,6 @@ final:
return status;
} /* _grucell_h_times_activation_r_initializer() */



/*
 * Query kernel
 */

@@ -313,7 +313,6 @@ static vsi_status _query_kernel
return status;
} /* _query_kernel() */


static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,

File diff suppressed because it is too large

@@ -38,16 +38,24 @@

__BEGIN_DECLS


#define HASH_L2NORMALIZESCALE_HASH_KEY(AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, _image_2d) \
((AXIS << 28) | (IN1_DTYPE << 20) | (IN0_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))

#define HASH_L2NORMALIZESCALE_KERNEL_SOURCE_NAME(AXIS) \
"l2normalizescale_axis"#AXIS
#define KERNEL_SOURCE_1 "l2normalizescale_axis0"
#define KERNEL_SOURCE_2 "l2normalizescale_axis0_2d"
#define KERNEL_SOURCE_3 "l2normalizescale_axis1"

#define HASH_L2NORMALIZESCALE_KERNELS_2D( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \

#define HASH_L2NORMALIZESCALE_KERNELS_2D( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_L2NORMALIZESCALE_HASH_KEY(AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1), \
CVIVANTE_NAMESPACE("evis.l2normalizescale_axis"#AXIS"_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_2D"), \
HASH_L2NORMALIZESCALE_KERNEL_SOURCE_NAME(AXIS) },
SOURCE },

#define HASH_L2NORMALIZESCALE_KERNELS( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_L2NORMALIZESCALE_HASH_KEY(AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0), \
CVIVANTE_NAMESPACE("evis.l2normalizescale_axis"#AXIS"_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE), \
SOURCE },

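/* Illustration of the key packing above (assumes each dtype enum value fits
 * in 8 bits): HASH_L2NORMALIZESCALE_HASH_KEY(1, I8, F16, F16, 0) yields
 * (1 << 28) | (F16 << 20) | (I8 << 12) | (F16 << 4) | 0, i.e. the axis in
 * bits 28..31, the scale-input dtype in bits 20..27, the data-input dtype in
 * bits 12..19, the output dtype in bits 4..11, and the 2D flag in bit 0. */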
typedef struct
{

@@ -58,20 +66,27 @@ typedef struct

static const _kernel_map_type _l2normalizescale_kernel_map[] =
{
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, F16, F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, I8 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, U8 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, I16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, F16, F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, I8 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, U8 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, I16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, F16, F16, F16, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, I8, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, F16, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, U8, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, F16, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, I16, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, F16, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, F16, F16, F16, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, I8, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, F16, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, U8, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, F16, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, I16, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, F16, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS( 0, F16, F16, F16, KERNEL_SOURCE_1 )
HASH_L2NORMALIZESCALE_KERNELS( 0, I8 , F16, I8, KERNEL_SOURCE_1 )
HASH_L2NORMALIZESCALE_KERNELS( 0, I8 , F16, F16, KERNEL_SOURCE_1 )
HASH_L2NORMALIZESCALE_KERNELS( 0, U8 , F16, U8, KERNEL_SOURCE_1 )
HASH_L2NORMALIZESCALE_KERNELS( 0, U8 , F16, F16, KERNEL_SOURCE_1 )
HASH_L2NORMALIZESCALE_KERNELS( 0, I16, F16, I16, KERNEL_SOURCE_1 )
HASH_L2NORMALIZESCALE_KERNELS( 0, I16, F16, F16, KERNEL_SOURCE_1 )
};

/*

@@ -119,6 +134,10 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
int32_t outputZP = 0;
float outputScale = 1.0f;
float r_inputScale = 1.0f;
float e2InScale = 1.0f;
float inOutScale = 1.0f;
int32_t axis2Dflg = 0;
int32_t inputWidth = 0;

input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );

@@ -168,7 +187,10 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
outputScale = 1.0f / output_attr->asymm.scale;
}

e2InScale = inputScale * inputScale;
r_inputScale = 1.0f / inputScale;
inOutScale = inputScale * outputScale;
inputWidth = (int32_t)(output_shape->data[0]);

if (1 == axis)
{

@@ -190,6 +212,13 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
gpu_param.local_size[1] = 1;
gpu_param.global_size[0] = 16;
gpu_param.global_size[1] = output_shape->data[1];

if (output_shape->data[0] < GPU_TENSOR_MAX_WIDTH
&& output_shape->data[1] < GPU_TENSOR_MAX_WIDTH
&& (output_shape->size == 2 || (output_shape->size == 3 && output_shape->data[2] == 1)))
{
axis2Dflg = 1;
}
}
else
{

@@ -257,8 +286,105 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{
0x55555555, // TCfg
0x00000000, // ASelt
0x76543210, 0x76543210, // ABin
0x5555aaaa, // BSelt
0x00000000, 0x76543210, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertSecFp16Fp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00050004, 0x00070006, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{
0x55555555, // TCfg
0x00000000, // ASelt
0x76543210, 0x76543210, // ABin
0x5555aaaa, // BSelt
0x00000000, 0x76543210, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

if (1 == axis)
if (axis2Dflg)
{
float zP2x = 2 * (float)inputZP;
float zpSqr8x = 8 * (float)inputZP * (float)inputZP;
float output_ZP = (float)outputZP;
status = vsi_nn_kernel_gpu_add_param( node, "inputWidth", &inputWidth);
status |= vsi_nn_kernel_gpu_add_param( node, "zP2x", &zP2x);
status |= vsi_nn_kernel_gpu_add_param( node, "zpSqr8x", &zpSqr8x);
status |= vsi_nn_kernel_gpu_add_param( node, "e2InScale", &e2InScale);
status |= vsi_nn_kernel_gpu_add_param( node, "inOutScale", &inOutScale);
status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &output_ZP);
status |= vsi_nn_kernel_gpu_add_param( node, "inputZP", &inputZP);
status |= vsi_nn_kernel_gpu_add_param( node, "uniInt16SumSqr_dp8x2", &uniInt16SumSqr_dp8x2);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", &uniConvertSecFp16Fp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4",
&uniConvert1stUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4",
&uniConvert2ndUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", &uniConvertHalfToFp16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (1 == axis)
{
int32_t L2NorS_depth = (int32_t)(output_shape->data[1]);
status = vsi_nn_kernel_gpu_add_param( node, "L2NorS_depth", &L2NorS_depth);

@@ -277,8 +403,7 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
}
else if (0 == axis)
{
int32_t inputWidth, inputWidthCount, inputWidthRemain256;
inputWidth = (int32_t)(output_shape->data[0]);
int32_t inputWidthCount, inputWidthRemain256;
inputWidthRemain256 = (int32_t)(output_shape->data[0] % 256);
inputWidthCount = (int32_t)(output_shape->data[0] / 256);
vsi_nn_kernel_gpu_add_param( node, "inputWidth", &inputWidth);

@@ -298,7 +423,8 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
}
}

{
if (axis2Dflg == 0)
{
float IntergerScale = inputScale;
float output_ZP = (float)outputZP;
gpu_dp_inst_t uniExtact8Bin_2x8 = {{

@@ -473,7 +599,8 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}

image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1) &&
(inputs[0]->attr.size[0] < GPU_TENSOR_MAX_WIDTH && inputs[0]->attr.size[1] < GPU_TENSOR_MAX_WIDTH);
status = _query_kernel( kernel, inputs, outputs, axis, image_2d );
if ( VSI_SUCCESS == status)
{
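The reworked image_2d test above now also requires both leading dimensions to fit a GPU image row. A hypothetical helper (name and factoring are mine, not the driver's) that mirrors the new predicate:

/* Hypothetical helper mirroring the new 2D-eligibility test above. */
static vsi_bool _can_use_2d_kernel( const vsi_nn_tensor_t * t )
{
    return (t->attr.dim_num == 2 || t->attr.size[2] == 1) &&
           (t->attr.size[0] < GPU_TENSOR_MAX_WIDTH) &&
           (t->attr.size[1] < GPU_TENSOR_MAX_WIDTH);
}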

File diff suppressed because it is too large

@@ -910,6 +910,7 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
status |= vsi_nn_kernel_gpu_add_param( node,
"uniGemmU8U8MulZptoFp32_8x4", &uniGemmU8U8MulZptoFp32_8x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "input01Scale", &inScaleMul );
status |= vsi_nn_kernel_gpu_add_param( node, "mulKIn0In1Zp", &mulKIn0In1Zp );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;

@@ -202,7 +202,7 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)

if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = (uint8_t)attr[2]->dfp.fl;
int32_t fl = attr[2]->dfp.fl;
if (fl > 0)
{
output_scale = (float) ((int64_t)1 << fl);

@@ -202,7 +202,7 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)

if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = (uint8_t)attr[2]->dfp.fl;
int32_t fl = attr[2]->dfp.fl;
if (fl > 0)
{
output_scale = (float) ((int64_t)1 << fl);
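The cast removal in both initializers is a real fix, not a style change: dfp.fl may be negative, and routing it through uint8_t destroys the sign, so a negative fractional length wrongly took the fl > 0 branch with a huge shift. A standalone demonstration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int8_t fl = -3;                 /* e.g. a negative fractional length */
    int32_t wrong = (uint8_t)fl;    /* 253: sign lost, takes the fl > 0 branch */
    int32_t right = fl;             /* -3: correctly falls through to the else branch */
    printf("wrong=%d right=%d\n", wrong, right);
    return 0;
}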

@@ -0,0 +1,444 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"

__BEGIN_DECLS

#define MOD_HASH_KEY(_input0_type, _input1_type, _output_type, _image_2d) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d))

#define MOD_KERNEL_SOURCE_NAME "mod"

#define MOD_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
{ MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \
CVIVANTE_NAMESPACE("evis.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE), \
MOD_KERNEL_SOURCE_NAME },

#define MOD_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
{ MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \
CVIVANTE_NAMESPACE("evis.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE"_2D"), \
MOD_KERNEL_SOURCE_NAME },

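/* Illustration (assumes each vsi_nn_kernel_dtype_e value fits in 8 bits):
 * MOD_KERNELS_2D( U8, U8, U8 ) registers its kernel under the key
 * (U8 << 24) | (U8 << 16) | (U8 << 8) | 1, which is exactly the key that
 * _query_kernel below rebuilds from the runtime tensor dtypes before its
 * linear scan of _mod_kernel_map. */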
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _mod_kernel_map[] =
{
// Register kernel here
MOD_KERNELS( F16, F16, F16 )
MOD_KERNELS( F16, F16, I16 )
MOD_KERNELS( F16, F16, I8 )
MOD_KERNELS( F16, F16, U8 )
MOD_KERNELS( I16, I16, I16 )
MOD_KERNELS( I8, I8, I8 )
MOD_KERNELS( U8, U8, U8 )
MOD_KERNELS( I16, I16, F16 )
MOD_KERNELS( I8, I8, F16 )
MOD_KERNELS( U8, U8, F16 )
MOD_KERNELS( BF16, BF16, BF16 )

MOD_KERNELS_2D( F16, F16, F16 )
MOD_KERNELS_2D( F16, F16, I16 )
MOD_KERNELS_2D( F16, F16, I8 )
MOD_KERNELS_2D( F16, F16, U8 )
MOD_KERNELS_2D( I16, I16, I16 )
MOD_KERNELS_2D( I8, I8, I8 )
MOD_KERNELS_2D( U8, U8, U8 )
MOD_KERNELS_2D( I16, I16, F16 )
MOD_KERNELS_2D( I8, I8, F16 )
MOD_KERNELS_2D( U8, U8, F16 )
MOD_KERNELS_2D( BF16, BF16, BF16 )
};


/*
 * Kernel params
 */
static vx_param_description_t _mod_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _MOD_PARAM_NUM _cnt_of_array( _mod_kernel_param_def )

/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_mod_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vx_status status = VX_FAILURE;
vx_tensor input0 = (vx_tensor)param[0];
vx_tensor input1 = (vx_tensor)param[1];
vx_tensor output = (vx_tensor)param[2];
vsi_nn_kernel_tensor_attr_t *input0_attr = NULL;
vsi_nn_kernel_tensor_attr_t *input1_attr = NULL;
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
vsi_nn_kernel_dtype_e input0_dtype = F16;
int32_t input0_fl = 0;
int32_t input1_fl = 0;
int32_t output_fl = 0;
float inScale0 = 1.0f;
float inScale1 = 1.0f;
float outScale = 1.0f;
float in0Tail = 0;
float in1Tail = 0;
float outZp = 0;

input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0 );
CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );

input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input1 );
CHECK_PTR_FAIL_GOTO( input1_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );

output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );

output_shape = output_attr->shape;
input0_dtype = input0_attr->dtype;

gpu_param.dim = output_shape->size < 3 ? 2 : 3;
gpu_param.global_offset[0] = 0;
gpu_param.global_offset[1] = 0;
gpu_param.global_offset[2] = 0;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;

gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1];
gpu_param.global_size[2] = output_shape->size > 2 ?
(output_shape->data[2] + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2] : 1;

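/* Worked example of the sizing above (assuming gpu_align_p2(v, a) rounds v
 * up to a multiple of the power-of-two a): for an output width of 100 with
 * global_scale[0] == 8, (100 + 8 - 1) / 8 = 13 work-items, aligned up to a
 * multiple of 4, so global_size[0] becomes 16. */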
if (input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
input0_fl = input0_attr->dfp.fl;
if (input0_fl > 0)
{
inScale0 = 1.0f / (float) ((int64_t)1 << input0_fl);
}
else
{
inScale0 = (float)((int64_t)1 << -input0_fl);
}
}
else if (input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
inScale0 = input0_attr->asymm.scale;
in0Tail = -inScale0 * ((float)input0_attr->asymm.zero_point);
}

if (input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
input1_fl = input1_attr->dfp.fl;
if (input1_fl > 0)
{
inScale1 = 1.0f / (float) ((int64_t)1 << input1_fl);
}
else
{
inScale1 = (float)((int64_t)1 << -input1_fl);
}
}
else if (input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
inScale1 = input1_attr->asymm.scale;
in1Tail = -inScale1 * ((float)input1_attr->asymm.zero_point);
}

if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outScale = (float) ((int64_t)1 << output_fl);
}
else
{
outScale = 1.0f / (float)((int64_t)1 << -output_fl);
}
}
else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outScale = 1.0f / output_attr->asymm.scale;
outZp = (float)(output_attr->asymm.zero_point);
}

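/* Numeric check of the DFP branches above: fl = 7 gives an input scale of
 * 1 / (1 << 7) = 0.0078125, while fl = -2 gives (1 << 2) = 4.0f; the output
 * uses the reciprocal convention, so output fl = 7 maps to outScale = 128.0f. */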
if (BF16 == input0_dtype)
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};

gpu_dp_inst_t uniConvertFstToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16};

gpu_dp_inst_t uniConvertSecToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16};

status = vsi_nn_kernel_gpu_add_param( node,
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertFstToFp32_4x4", &uniConvertFstToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertSecToFp32_4x4", &uniConvertSecToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "in_scale0", &inScale0 );
status |= vsi_nn_kernel_gpu_add_param( node, "in0Tail", &in0Tail );
status |= vsi_nn_kernel_gpu_add_param( node, "in_scale1", &inScale1 );
status |= vsi_nn_kernel_gpu_add_param( node, "in1Tail", &in1Tail );
status |= vsi_nn_kernel_gpu_add_param( node, "out_scale", &outScale );
status |= vsi_nn_kernel_gpu_add_param( node, "out_zp", &outZp );
CHECK_STATUS_FAIL_GOTO(status, final );
}

status = vsi_nn_kernel_gpu_config( node, &gpu_param );

final:
if (input0_attr)
{
vsi_nn_kernel_tensor_attr_release(&input0_attr);
}
if (input1_attr)
{
vsi_nn_kernel_tensor_attr_release(&input1_attr);
}
if (output_attr)
{
vsi_nn_kernel_tensor_attr_release(&output_attr);
}
return status;
} /* _mod_initializer() */



/*
 * Query kernel
 */
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool image_2d
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in0_dtype;
vsi_nn_kernel_dtype_e in1_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _mod_kernel_map;
size_t kernel_map_size = _cnt_of_array( _mod_kernel_map );
vx_param_description_t * param_def = _mod_kernel_param_def;
size_t param_def_size = _cnt_of_array( _mod_kernel_param_def );
vx_kernel_initialize_f initializer = _mod_initializer;

uint32_t key = 0;
uint32_t i = 0;

in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

key = MOD_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d);

for (i = 0; i < kernel_map_size; i ++)
{
if (kernel_map[i].key == key)
{
break;
}
}

if (i < kernel_map_size)
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}

return status;
} /* _query_kernel() */


static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_MOD_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_bool image_2d = FALSE;
int32_t isfmod = vsi_nn_kernel_param_get_int32(params, "isfmod");

if (!vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ))
{
return NULL;
}

image_2d = (outputs[0]->attr.dim_num == 2 || outputs[0]->attr.size[2] == 1);
if (vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type) == F16 ||
vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type) == F16 ||
vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type) == BF16 ||
vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type) == BF16)
{
isfmod = 1;
}
status = _query_kernel( kernel, inputs, outputs, image_2d);
if (VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if (node)
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _MOD_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &isfmod );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _MOD_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[3] );
}
}

return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_EVIS( mod, _setup )

@@ -38,69 +38,20 @@

__BEGIN_DECLS

#define VX_KERNEL_NAME_POW_F16F16TOF16 CVIVANTE_NAMESPACE("evis.pow_F16F16toF16")
#define VX_KERNEL_NAME_POW_F16F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toF16_2D")
#define VX_KERNEL_NAME_POW_F16F16TOU8 CVIVANTE_NAMESPACE("evis.pow_F16F16toU8")
#define VX_KERNEL_NAME_POW_F16F16TOU8_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toU8_2D")
#define VX_KERNEL_NAME_POW_F16F16TOI8 CVIVANTE_NAMESPACE("evis.pow_F16F16toI8")
#define VX_KERNEL_NAME_POW_F16F16TOI8_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toI8_2D")
#define VX_KERNEL_NAME_POW_F16F16TOI16 CVIVANTE_NAMESPACE("evis.pow_F16F16toI16")
#define VX_KERNEL_NAME_POW_F16F16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toI16_2D")
#define VX_KERNEL_NAME_POW_F16U8TOF16 CVIVANTE_NAMESPACE("evis.pow_F16U8toF16")
#define VX_KERNEL_NAME_POW_F16U8TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16U8toF16_2D")
#define VX_KERNEL_NAME_POW_F16I8TOF16 CVIVANTE_NAMESPACE("evis.pow_F16I8toF16")
#define VX_KERNEL_NAME_POW_F16I8TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16I8toF16_2D")
#define VX_KERNEL_NAME_POW_F16I16TOF16 CVIVANTE_NAMESPACE("evis.pow_F16I16toF16")
#define VX_KERNEL_NAME_POW_F16I16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16I16toF16_2D")
#define VX_KERNEL_NAME_POW_F16U8TOU8 CVIVANTE_NAMESPACE("evis.pow_F16U8toU8")
#define VX_KERNEL_NAME_POW_F16U8TOU8_2D CVIVANTE_NAMESPACE("evis.pow_F16U8toU8_2D")
#define VX_KERNEL_NAME_POW_F16I8TOI8 CVIVANTE_NAMESPACE("evis.pow_F16I8toI8")
#define VX_KERNEL_NAME_POW_F16I8TOI8_2D CVIVANTE_NAMESPACE("evis.pow_F16I8toI8_2D")
#define VX_KERNEL_NAME_POW_F16I16TOI16 CVIVANTE_NAMESPACE("evis.pow_F16I16toI16")
#define VX_KERNEL_NAME_POW_F16I16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_F16I16toI16_2D")
#define VX_KERNEL_NAME_POW_U8F16TOF16 CVIVANTE_NAMESPACE("evis.pow_U8F16toF16")
#define VX_KERNEL_NAME_POW_U8F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_U8F16toF16_2D")
#define VX_KERNEL_NAME_POW_I8F16TOF16 CVIVANTE_NAMESPACE("evis.pow_I8F16toF16")
#define VX_KERNEL_NAME_POW_I8F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_I8F16toF16_2D")
#define VX_KERNEL_NAME_POW_I16F16TOF16 CVIVANTE_NAMESPACE("evis.pow_I16F16toF16")
#define VX_KERNEL_NAME_POW_I16F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_I16F16toF16_2D")
#define VX_KERNEL_NAME_POW_U8F16TOU8 CVIVANTE_NAMESPACE("evis.pow_U8F16toU8")
#define VX_KERNEL_NAME_POW_U8F16TOU8_2D CVIVANTE_NAMESPACE("evis.pow_U8F16toU8_2D")
#define VX_KERNEL_NAME_POW_I8F16TOI8 CVIVANTE_NAMESPACE("evis.pow_I8F16toI8")
#define VX_KERNEL_NAME_POW_I8F16TOI8_2D CVIVANTE_NAMESPACE("evis.pow_I8F16toI8_2D")
#define VX_KERNEL_NAME_POW_I16F16TOI16 CVIVANTE_NAMESPACE("evis.pow_I16F16toI16")
#define VX_KERNEL_NAME_POW_I16F16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_I16F16toI16_2D")
#define VX_KERNEL_NAME_POW_U8U8TOU8 CVIVANTE_NAMESPACE("evis.pow_U8U8toU8")
#define VX_KERNEL_NAME_POW_U8U8TOU8_2D CVIVANTE_NAMESPACE("evis.pow_U8U8toU8_2D")
#define VX_KERNEL_NAME_POW_I8I8TOI8 CVIVANTE_NAMESPACE("evis.pow_I8I8toI8")
#define VX_KERNEL_NAME_POW_I8I8TOI8_2D CVIVANTE_NAMESPACE("evis.pow_I8I8toI8_2D")
#define VX_KERNEL_NAME_POW_I16I16TOI16 CVIVANTE_NAMESPACE("evis.pow_I16I16toI16")
#define VX_KERNEL_NAME_POW_I16I16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_I16I16toI16_2D")
#define VX_KERNEL_NAME_POW_BF16BF16TOBF16 CVIVANTE_NAMESPACE("evis.pow_BF16BF16toBF16")
#define VX_KERNEL_NAME_POW_BF16BF16TOBF16_2D CVIVANTE_NAMESPACE("evis.pow_BF16BF16toBF16_2D")
#define VX_KERNEL_NAME_POW_U8U8TOF16 CVIVANTE_NAMESPACE("evis.pow_U8U8toF16")
#define VX_KERNEL_NAME_POW_U8U8TOF16_2D CVIVANTE_NAMESPACE("evis.pow_U8U8toF16_2D")

#define KERNEL_SOURCE_1 "pow_fp16",
#define KERNEL_SOURCE_2 "pow_fp16_i8",
#define KERNEL_SOURCE_3 "pow_fp16_i16",
#define KERNEL_SOURCE_4 "pow_u8",
#define KERNEL_SOURCE_5 "pow_i8",
#define KERNEL_SOURCE_6 "pow_i16"

#define KERNEL_SOURCE "pow",

#define HASH_POW_KEY(_input0_type, _input1_type, _output_type, _image_2d) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d))

#define TENSOR_POW_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
#define TENSOR_POW_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
{ HASH_POW_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \
VX_KERNEL_NAME_POW_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE, \
SOURCE },
CVIVANTE_NAMESPACE("evis.pow_"#IN0_TYPE"_"#IN1_TYPE"to"#OUT_TYPE), \
KERNEL_SOURCE },

#define TENSOR_POW_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
#define TENSOR_POW_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
{ HASH_POW_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \
VX_KERNEL_NAME_POW_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE##_2D, \
SOURCE },
CVIVANTE_NAMESPACE("evis.pow_"#IN0_TYPE"_"#IN1_TYPE"to"#OUT_TYPE"_2D"), \
KERNEL_SOURCE },

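/* Illustration: with the per-dtype sources collapsed into the single "pow"
 * source above, TENSOR_POW_KERNELS(U8, U8, U8) now stringizes its arguments
 * into CVIVANTE_NAMESPACE("evis.pow_U8_U8toU8") instead of token-pasting one
 * of the removed VX_KERNEL_NAME_POW_* defines. */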
||||
static const struct {
|
||||
uint32_t key;
|
||||
|
|
@ -108,59 +59,59 @@ static const struct {
|
|||
const char* source_name;
|
||||
} pow_map[] =
|
||||
{
|
||||
TENSOR_POW_KERNELS(F16, F16, F16, KERNEL_SOURCE_1)
|
||||
TENSOR_POW_KERNELS(F16, F16, U8, KERNEL_SOURCE_1)
|
||||
TENSOR_POW_KERNELS(F16, U8, F16, KERNEL_SOURCE_1)
|
||||
TENSOR_POW_KERNELS(F16, U8, U8, KERNEL_SOURCE_1)
|
||||
TENSOR_POW_KERNELS(F16, F16, F16)
|
||||
TENSOR_POW_KERNELS(F16, F16, U8)
|
||||
TENSOR_POW_KERNELS(F16, U8, F16)
|
||||
TENSOR_POW_KERNELS(F16, U8, U8)
|
||||
|
||||
TENSOR_POW_KERNELS(F16, F16, I8, KERNEL_SOURCE_2)
|
||||
TENSOR_POW_KERNELS(F16, I8, F16, KERNEL_SOURCE_2)
|
||||
TENSOR_POW_KERNELS(F16, I8, I8, KERNEL_SOURCE_2)
|
||||
TENSOR_POW_KERNELS(F16, F16, I8)
|
||||
TENSOR_POW_KERNELS(F16, I8, F16)
|
||||
TENSOR_POW_KERNELS(F16, I8, I8)
|
||||
|
||||
TENSOR_POW_KERNELS(F16, F16, I16, KERNEL_SOURCE_3)
|
||||
TENSOR_POW_KERNELS(F16, I16, F16, KERNEL_SOURCE_3)
|
||||
TENSOR_POW_KERNELS(F16, I16, I16, KERNEL_SOURCE_3)
|
||||
TENSOR_POW_KERNELS(F16, F16, I16)
|
||||
TENSOR_POW_KERNELS(F16, I16, F16)
|
||||
TENSOR_POW_KERNELS(F16, I16, I16)
|
||||
|
||||
TENSOR_POW_KERNELS(U8, F16, F16, KERNEL_SOURCE_4)
|
||||
TENSOR_POW_KERNELS(U8, F16, U8, KERNEL_SOURCE_4)
|
||||
TENSOR_POW_KERNELS(U8, U8, U8, KERNEL_SOURCE_4)
|
||||
TENSOR_POW_KERNELS(U8, U8, F16, KERNEL_SOURCE_4)
|
||||
TENSOR_POW_KERNELS(U8, F16, F16)
|
||||
TENSOR_POW_KERNELS(U8, F16, U8)
|
||||
TENSOR_POW_KERNELS(U8, U8, U8)
|
||||
TENSOR_POW_KERNELS(U8, U8, F16)
|
||||
|
||||
TENSOR_POW_KERNELS(I8, F16, F16, KERNEL_SOURCE_5)
|
||||
TENSOR_POW_KERNELS(I8, F16, I8, KERNEL_SOURCE_5)
|
||||
TENSOR_POW_KERNELS(I8, I8, I8, KERNEL_SOURCE_5)
|
||||
TENSOR_POW_KERNELS(I8, F16, F16)
|
||||
TENSOR_POW_KERNELS(I8, F16, I8)
|
||||
TENSOR_POW_KERNELS(I8, I8, I8)
|
||||
|
||||
TENSOR_POW_KERNELS(I16, F16, F16, KERNEL_SOURCE_6)
|
||||
TENSOR_POW_KERNELS(I16, F16, I16, KERNEL_SOURCE_6)
|
||||
TENSOR_POW_KERNELS(I16, I16, I16, KERNEL_SOURCE_6)
|
||||
TENSOR_POW_KERNELS(BF16, BF16, BF16, KERNEL_SOURCE_3)
|
||||
TENSOR_POW_KERNELS(I16, F16, F16)
|
||||
TENSOR_POW_KERNELS(I16, F16, I16)
|
||||
TENSOR_POW_KERNELS(I16, I16, I16)
|
||||
TENSOR_POW_KERNELS(BF16, BF16, BF16)
|
||||
|
||||
TENSOR_POW_KERNELS_2D(F16, F16, F16, KERNEL_SOURCE_1)
|
||||
TENSOR_POW_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_1)
|
||||
TENSOR_POW_KERNELS_2D(F16, U8, F16, KERNEL_SOURCE_1)
|
||||
TENSOR_POW_KERNELS_2D(F16, U8, U8, KERNEL_SOURCE_1)
|
||||
TENSOR_POW_KERNELS_2D(F16, F16, F16)
|
||||
TENSOR_POW_KERNELS_2D(F16, F16, U8)
|
||||
TENSOR_POW_KERNELS_2D(F16, U8, F16)
|
||||
TENSOR_POW_KERNELS_2D(F16, U8, U8)
|
||||
|
||||
TENSOR_POW_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_2)
|
||||
TENSOR_POW_KERNELS_2D(F16, I8, F16, KERNEL_SOURCE_2)
|
||||
TENSOR_POW_KERNELS_2D(F16, I8, I8, KERNEL_SOURCE_2)
|
||||
TENSOR_POW_KERNELS_2D(F16, F16, I8)
|
||||
TENSOR_POW_KERNELS_2D(F16, I8, F16)
|
||||
TENSOR_POW_KERNELS_2D(F16, I8, I8)
|
||||
|
||||
TENSOR_POW_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_3)
|
||||
TENSOR_POW_KERNELS_2D(F16, I16, F16, KERNEL_SOURCE_3)
|
||||
TENSOR_POW_KERNELS_2D(F16, I16, I16, KERNEL_SOURCE_3)
|
||||
TENSOR_POW_KERNELS_2D(F16, F16, I16)
|
||||
TENSOR_POW_KERNELS_2D(F16, I16, F16)
|
||||
TENSOR_POW_KERNELS_2D(F16, I16, I16)
|
||||
|
||||
TENSOR_POW_KERNELS_2D(U8, F16, F16, KERNEL_SOURCE_4)
|
||||
TENSOR_POW_KERNELS_2D(U8, F16, U8, KERNEL_SOURCE_4)
|
||||
TENSOR_POW_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_4)
|
||||
TENSOR_POW_KERNELS_2D(U8, U8, F16, KERNEL_SOURCE_4)
|
||||
TENSOR_POW_KERNELS_2D(U8, F16, F16)
|
||||
TENSOR_POW_KERNELS_2D(U8, F16, U8)
|
||||
TENSOR_POW_KERNELS_2D(U8, U8, U8)
|
||||
TENSOR_POW_KERNELS_2D(U8, U8, F16)
|
||||
|
||||
TENSOR_POW_KERNELS_2D(I8, F16, F16, KERNEL_SOURCE_5)
|
||||
TENSOR_POW_KERNELS_2D(I8, F16, I8, KERNEL_SOURCE_5)
|
||||
TENSOR_POW_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_5)
|
||||
TENSOR_POW_KERNELS_2D(I8, F16, F16)
|
||||
TENSOR_POW_KERNELS_2D(I8, F16, I8)
|
||||
TENSOR_POW_KERNELS_2D(I8, I8, I8)
|
||||
|
||||
TENSOR_POW_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_6)
|
||||
TENSOR_POW_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_6)
|
||||
TENSOR_POW_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_6)
|
||||
TENSOR_POW_KERNELS_2D(BF16, BF16, BF16, KERNEL_SOURCE_3)
|
||||
TENSOR_POW_KERNELS_2D(I16, F16, F16)
|
||||
TENSOR_POW_KERNELS_2D(I16, F16, I16)
|
||||
TENSOR_POW_KERNELS_2D(I16, I16, I16)
|
||||
TENSOR_POW_KERNELS_2D(BF16, BF16, BF16)
|
||||
};
|
||||
|
||||
static vx_param_description_t vxPowKernel_param_def[] =
|
||||
|
|
@ -186,24 +137,13 @@ DEF_KERNEL_INITIALIZER(_pow_initializer)
|
|||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
int8_t in0_fl = 0;
|
||||
int32_t src0ZP = 0;
|
||||
float src0Scale = 1.0f;
|
||||
int8_t in1_fl = 0;
|
||||
int32_t src1ZP = 0;
|
||||
float src1Scale = 1.0f;
|
||||
int8_t out_fl = 0;
|
||||
float dstZP = 0;
|
||||
float dstScale = 1.0f;
|
||||
float input0_scale = 1.0f;
|
||||
float input1_scale = 1.0f;
|
||||
float input0_tail = 0;
|
||||
float input1_tail = 0;
|
||||
float output_scale = 1.0f;
|
||||
float output_zp = 0;
|
||||
|
||||
int32_t postshift0 = 0;
|
||||
int32_t postshift1 = 0;
|
||||
float outScale_fl = 1;
|
||||
|
||||
uint16_t M0 = 0;
|
||||
uint16_t M1 = 0;
|
||||
|
||||
vsi_size_t zAx = 1;
|
||||
uint32_t pack_key = 0;
|
||||
// dim number ???
|
||||
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
|
||||
|
|
@ -220,58 +160,59 @@ DEF_KERNEL_INITIALIZER(_pow_initializer)
|
|||
|
||||
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
|
||||
{
|
||||
in0_fl = (int8_t)attr[0]->dfp.fl;
|
||||
postshift0 = in0_fl - 0;
|
||||
int32_t fl = attr[0]->dfp.fl;
|
||||
if (fl > 0)
|
||||
{
|
||||
input0_scale = 1.0f / (float) ((int64_t)1 << fl);
|
||||
}
|
||||
else
|
||||
{
|
||||
input0_scale = (float)((int64_t)1 << -fl);
|
||||
}
|
||||
}
|
||||
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|
||||
|| attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM )
|
||||
{
|
||||
src0ZP = attr[0]->asymm.zero_point;
|
||||
src0Scale = attr[0]->asymm.scale;
|
||||
|
||||
gpu_quantize_multiplier_16bit(src0Scale / 1.0f, &M0, &postshift0);
|
||||
input0_scale = attr[0]->asymm.scale;
|
||||
input0_tail = 0 - (float)attr[0]->asymm.zero_point * input0_scale;
|
||||
}
|
||||
|
||||
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
|
||||
{
|
||||
in1_fl = (int8_t)attr[1]->dfp.fl;
|
||||
postshift1 = in1_fl - 0;
|
||||
int32_t fl = attr[1]->dfp.fl;
|
||||
if (fl > 0)
|
||||
{
|
||||
input1_scale = 1.0f / (float) ((int64_t)1 << fl);
|
||||
}
|
||||
else
|
||||
{
|
||||
input1_scale = (float)((int64_t)1 << -fl);
|
||||
}
|
||||
}
|
||||
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|
||||
|| attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM)
|
||||
{
|
||||
src1ZP = attr[1]->asymm.zero_point;
|
||||
src1Scale = attr[1]->asymm.scale;
|
||||
|
||||
gpu_quantize_multiplier_16bit(src1Scale / 1.0f, &M1, &postshift1);
|
||||
input1_scale = attr[1]->asymm.scale;
|
||||
input1_tail = 0 - (float)attr[1]->asymm.zero_point * input1_scale;
|
||||
}
|
||||
|
||||
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
|
||||
{
|
||||
out_fl = (int8_t)attr[2]->dfp.fl;
|
||||
if (out_fl > 0)
|
||||
int32_t fl = attr[2]->dfp.fl;
|
||||
if (fl > 0)
|
||||
{
|
||||
outScale_fl = (vx_float32)((int64_t)1 << out_fl);
|
||||
output_scale = (float) ((int64_t)1 << fl);
|
||||
}
|
||||
else
|
||||
{
|
||||
outScale_fl = (1.0f / (vx_float32)((int64_t)1 << -out_fl));
|
||||
output_scale = 1.0f / (float)((int64_t)1 << -fl);
|
||||
}
|
||||
}
|
||||
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|
||||
|| attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM )
|
||||
{
|
||||
dstZP = (float)attr[2]->asymm.zero_point;
|
||||
dstScale = 1.0f / attr[2]->asymm.scale;
|
||||
}
|
||||
|
||||
if ( out_shape->size < 3 )
|
||||
{
|
||||
zAx = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
zAx = out_shape->data[2];
|
||||
output_zp = (float)attr[2]->asymm.zero_point;
|
||||
output_scale = 1.0f / attr[2]->asymm.scale;
|
||||
}
|
||||
|
||||
#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \
|
||||
|
|
@ -287,269 +228,122 @@ DEF_KERNEL_INITIALIZER(_pow_initializer)
|
|||
/ shaderParam.global_scale[0], 4);
|
||||
shaderParam.global_size[1] = gpu_align_p2((out_shape->data[1] + shaderParam.global_scale[1] - 1)
|
||||
/ shaderParam.global_scale[1], 2);
|
||||
shaderParam.global_size[2] = gpu_align_p2((zAx + shaderParam.global_scale[2] - 1)
|
||||
/ shaderParam.global_scale[2], 1);
|
||||
shaderParam.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError);
|
||||
|
||||
switch( pack_key )
{
gpu_dp_inst_t uniConvertFstDataToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertSecDataToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertFstDataToFp32_4x4_2 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertSecDataToFp32_4x4_2 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

gpu_dp_inst_t uniConvertUint8SubZpToFp32_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertSecUint8SubZpToFp32_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00050004, 0x00070006, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

gpu_dp_inst_t uniConvertUint8SubZpToFp32_4x4_2 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

gpu_dp_inst_t uniConvertSecUint8SubZpToFp32_4x4_2 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00050004, 0x00070006, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };

gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
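Each of the uniform tables above packs one EVIS dot-product instruction. Judging from the brace initializers and the .data[...] accesses that follow, the structure is essentially a 16-word payload plus a type tag; a minimal compatible declaration looks like this (an assumption for illustration, not the driver's actual header):

#include <stdint.h>

/* Compatible with the {{ ...16 words... }, GPU_DP_TYPE_16 } initializers:
 * words 0..6 hold TCfg/ASelt/ABin/BSelt/BBin, word 7 holds AccumType,
 * ConstantType and PostShift, and words 8..15 hold per-lane constants. */
typedef struct
{
    uint32_t data[16];
    uint32_t type; /* e.g. GPU_DP_TYPE_16 */
} gpu_dp_inst_sketch_t;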
uint32_t multiplierA = (M0 << 16) | M0;
uint32_t multiplierB = (M1 << 16) | M1;
int32_t i = 8;

uniConvertUint8SubZpToFp32_4x4.data[7] |= (postshift0 & 0x1F);
uniConvertSecUint8SubZpToFp32_4x4.data[7] |= (postshift0 & 0x1F);
uniConvertUint8SubZpToFp32_4x4_2.data[7] |= (postshift1 & 0x1F);
uniConvertSecUint8SubZpToFp32_4x4_2.data[7] |= (postshift1 & 0x1F);
for ( i = 8; i < 16; i += 2 )
case _PACK_SELECT_KEY( BF16, BF16, BF16 ):
{
uniConvertUint8SubZpToFp32_4x4.data[i] = multiplierA;
uniConvertSecUint8SubZpToFp32_4x4.data[i] = multiplierA;
uniConvertUint8SubZpToFp32_4x4_2.data[i] = multiplierB;
uniConvertSecUint8SubZpToFp32_4x4_2.data[i] = multiplierB;
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8",
&uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8",
&uniConvBF16toF32_Part1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8",
&uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
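The multiplier/post-shift patching above follows the usual fixed-point requantization scheme: the low five bits of word 7 carry the post shift, and words 8..15 carry the 16-bit multiplier duplicated into both halves, so each lane effectively computes (x * M) >> postshift. A scalar model of the arithmetic being wired up (illustration only):

#include <stdint.h>

/* Scalar model of the per-lane requantize realized by M0/postshift0:
 * widen, multiply by the fixed-point multiplier, then arithmetic
 * right-shift back down to the target range. */
static int32_t requantize(int32_t x, uint16_t m, uint32_t postshift)
{
    return (int32_t)(((int64_t)x * m) >> postshift);
}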
if ( attr[0]->dtype == I8 || attr[0]->dtype == I16 )
break;
default:
{
gpu_dp_inst_update_postshfit( &uniConvertFstDataToFp32_4x4, postshift0 );
gpu_dp_inst_update_postshfit( &uniConvertSecDataToFp32_4x4, postshift0 );
}
gpu_dp_inst_t uniConvertFstDataToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertSecDataToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtact8Bit_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtactHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16};

if ( attr[1]->dtype == I8 || attr[1]->dtype == I16 )
{
gpu_dp_inst_update_postshfit( &uniConvertFstDataToFp32_4x4_2, postshift1 );
gpu_dp_inst_update_postshfit( &uniConvertSecDataToFp32_4x4_2, postshift1 );
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4",
&uniConvertFstDataToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4",
&uniConvertSecDataToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "input0_scale", &input0_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "input1_scale", &input1_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "input0_tail", &input0_tail);
status |= vsi_nn_kernel_gpu_add_param( node, "input1_tail", &input1_tail);
status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp);
if (attr[2]->dtype == F16)
{
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtact8Bit_2x8",
&uniExtactHalf8_2x8);
}
else
{
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtact8Bit_2x8",
&uniExtact8Bit_2x8);
}
CHECK_STATUS_FAIL_GOTO(status, OnError );
}

switch( pack_key )
{
case _PACK_SELECT_KEY( F16, F16, I8 ):
case _PACK_SELECT_KEY( F16, I8, F16 ):
case _PACK_SELECT_KEY( F16, I8, I8 ):
case _PACK_SELECT_KEY( F16, F16, I16 ):
case _PACK_SELECT_KEY( F16, I16, F16 ):
case _PACK_SELECT_KEY( F16, I16, I16 ):
case _PACK_SELECT_KEY( I8, F16, F16 ):
case _PACK_SELECT_KEY( I8, F16, I8 ):
case _PACK_SELECT_KEY( I8, I8, I8 ):
case _PACK_SELECT_KEY( I16, F16, F16 ):
case _PACK_SELECT_KEY( I16, F16, I16 ):
case _PACK_SELECT_KEY( I16, I16, I16 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4",
&uniConvertFstDataToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4",
&uniConvertSecDataToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4_2",
&uniConvertFstDataToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4_2",
&uniConvertSecDataToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "outScale_fl", &outScale_fl);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U8, F16, F16 ):
case _PACK_SELECT_KEY( U8, F16, U8 ):
case _PACK_SELECT_KEY( U8, U8, U8 ):
case _PACK_SELECT_KEY( U8, U8, F16 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertUint8SubZpToFp32_4x4",
&uniConvertUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecUint8SubZpToFp32_4x4",
&uniConvertSecUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4_2",
&uniConvertFstDataToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4_2",
&uniConvertSecDataToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUint8SubZpToFp32_4x4_2",
&uniConvertUint8SubZpToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecUint8SubZpToFp32_4x4_2",
&uniConvertSecUint8SubZpToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
&uniConvertHalftoFp16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP0", &src0ZP);
status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP1", &src1ZP);
status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &dstZP);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( F16, F16, F16 ):
case _PACK_SELECT_KEY( F16, F16, U8 ):
case _PACK_SELECT_KEY( F16, U8, F16 ):
case _PACK_SELECT_KEY( F16, U8, U8 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4",
&uniConvertFstDataToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4",
&uniConvertSecDataToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUint8SubZpToFp32_4x4_2",
&uniConvertUint8SubZpToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecUint8SubZpToFp32_4x4_2",
&uniConvertSecUint8SubZpToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8",
&uniConvertHalfToFp16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP1", &src1ZP);
status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &dstZP);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, BF16 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8",
&uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8",
&uniConvBF16toF32_Part1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8",
&uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
default:
break;
}
#undef _PACK_SELECT_KEY
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
break;
}
#undef _PACK_SELECT_KEY
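_PACK_SELECT_KEY is #undef'd above after use; a typical definition for a three-way dtype dispatch like these cases shifts the three enum values into disjoint byte fields (an assumed shape for illustration only; the actual macro is defined earlier in this file):

#define _PACK_SELECT_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
    (((IN0_DTYPE) << 16) | ((IN1_DTYPE) << 8) | (OUT_DTYPE))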
OnError:
if ( attr[0] )
@ -646,7 +440,6 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_POW_PARAM_NUM,
inputs, 2, outputs, 1 );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_POW_PARAM_NUM );

}
}
return node;

@ -655,4 +448,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_EVIS( pow, _setup )

@ -126,8 +126,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );

out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
outputScale = attr[0]->asymm.scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);

@ -152,7 +150,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f/outputScale;
outputScale = 1.0f / attr[0]->asymm.scale;
dstZP = attr[0]->asymm.zero_point;
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{

@ -128,8 +128,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer)
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );

out_shape = attr[0]->shape;
dstZP = (float)attr[0]->asymm.zero_point;
outputScale = attr[0]->asymm.scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);

@ -147,7 +145,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer)
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f/outputScale;
outputScale = 1.0f / attr[0]->asymm.scale;
dstZP = (float)attr[0]->asymm.zero_point;
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{

@ -148,8 +148,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );

out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
dstScale = attr[0]->asymm.scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);

@ -161,7 +159,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)

if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f / dstScale;
dstScale = 1.0f / attr[0]->asymm.scale;
dstZP = attr[0]->asymm.zero_point;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{

@ -35,13 +35,15 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

#define KERNEL_SOURCE_0 "pre_process_rgb888_planar_0",
#define KERNEL_SOURCE_1 "pre_process_rgb888_planar_1",
#define KERNEL_SOURCE_2 "pre_process_rgb888_planar_2",
#define RGB888_SEP_SOURCE_0 "pre_process_rgb888_planar_sep_0",
#define RGB888_SEP_SOURCE_1 "pre_process_rgb888_planar_sep_1",
#define RGB888_SEP_SOURCE_2 "pre_process_rgb888_planar_sep_2",
#define RGB888_SOURCE_0 "pre_process_rgb888_planar_0",
#define RGB888_SOURCE_1 "pre_process_rgb888_planar_1",
#define RGB888_SOURCE_2 "pre_process_rgb888_planar_2",

#define STR(a) #a
@ -53,28 +55,48 @@ typedef enum
HALF
} _internal_scale_e;
// Add kernel hashtable here
#define PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, SCALE_FLAG ) \
(( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8 ) | (SCALE_FLAG))
#define PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, SEP, SCALE_FLAG ) \
(( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8 ) | ( SEP << 4 ) | (SCALE_FLAG))

#define PACK_KERNEL_SCALE_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, SCALE ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
KERNEL_SOURCE_0 }
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, SCALE ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SOURCE_0 }

#define PACK_KERNEL_SEP_SCALE_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, SCALE ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SEP_SOURCE_0 }

#define PACK_KERNEL_COPY_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, COPY ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
KERNEL_SOURCE_1 }
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, COPY ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SOURCE_1 }

#define PACK_KERNEL_SEP_COPY_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, COPY ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SEP_SOURCE_1 }

#define PACK_KERNEL_4_OVER_3_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, FOUR_OVER_THREE ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_4over3_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
KERNEL_SOURCE_2 }
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, FOUR_OVER_THREE ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_4over3_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SOURCE_2 }

#define PACK_KERNEL_SEP_4_OVER_3_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, FOUR_OVER_THREE ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_4over3_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SEP_SOURCE_2 }

#define PACK_KERNEL_HALF_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, HALF ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
KERNEL_SOURCE_2 }
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, HALF ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SOURCE_2 }

#define PACK_KERNEL_SEP_HALF_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, HALF ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SEP_SOURCE_2 }
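With the extra SEP field, separated-plane and packed-plane kernels can no longer collide in the hashtable. Under the new 4-argument key, two otherwise identical entries differ only in bit 4 (a usage illustration; U8 and COPY stand for the enum values used above):

uint32_t key_packed = PRE_PROCESS_RGB888_PLANAR_HASH_KEY( U8, U8, 0, COPY );
uint32_t key_sep    = PRE_PROCESS_RGB888_PLANAR_HASH_KEY( U8, U8, 1, COPY );
/* key_sep == key_packed + (1 << 4) */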
typedef struct
{
@ -98,6 +120,19 @@ static const _kernel_map_type pre_process_rgb888_planar_kernel_map[] =

PACK_KERNEL_4_OVER_3_MAP( U8, U8 ),
PACK_KERNEL_HALF_MAP( U8, U8 ),

PACK_KERNEL_SEP_SCALE_MAP( U8, F16 ),
PACK_KERNEL_SEP_SCALE_MAP( U8, I16 ),
PACK_KERNEL_SEP_SCALE_MAP( U8, I8 ),
PACK_KERNEL_SEP_SCALE_MAP( U8, U8 ),

PACK_KERNEL_SEP_COPY_MAP( U8, F16 ),
PACK_KERNEL_SEP_COPY_MAP( U8, I16 ),
PACK_KERNEL_SEP_COPY_MAP( U8, I8 ),
PACK_KERNEL_SEP_COPY_MAP( U8, U8 ),

PACK_KERNEL_SEP_4_OVER_3_MAP( U8, U8 ),
PACK_KERNEL_SEP_HALF_MAP( U8, U8 ),
};

@ -105,6 +140,23 @@ static const _kernel_map_type pre_process_rgb888_planar_kernel_map[] =
* Kernel params
*/
static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def )

static vx_param_description_t _pre_process_rgb888_planar_sep_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},

@ -121,7 +173,7 @@ static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def )
#define _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def )

/*
* Kernel initializer
@ -149,9 +201,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;

attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ))
{
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
}
else
{
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
}
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &output_scale);
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[param_size - 1], &output_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );

out_shape = attr[0]->shape;
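The branch works because the two parameter tables place the first output tensor at different indices: the separated variant takes three input planes, so its first output sits at param[3], while the packed variant takes a single input tensor, so its first output sits at param[1]; in both layouts the output-scale scalar is the last entry, hence param[param_size - 1]. An illustration of the assumed index maps (inferred from the tables and the _setup code in this diff, not quoted from a header):

/* packed:    param[0]    = input RGB tensor
 *            param[1..3] = output planes
 *            param[4..11] = scalars, ending with the output scale
 * separated: param[0..2] = input planes
 *            param[3..5] = output planes
 *            param[6..13] = the same scalars                        */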
@ -310,9 +369,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;

attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ))
{
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
}
else
{
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
}
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &output_scale);
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[param_size - 1], &output_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );

out_shape = attr[0]->shape;
@ -406,7 +472,14 @@ DEF_KERNEL_INITIALIZER(_resize_rgb888_planar_initializer)

attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ))
{
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
}
else
{
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
}
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );

out_shape = attr[1]->shape;
@ -540,6 +613,7 @@ static vsi_status _query_kernel
vsi_bool is_4_over_3 = FALSE;
vsi_bool is_half_scale = FALSE;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
vsi_bool is_rgb888_sep = (vsi_bool)(inputs[1] != NULL);

is_4_over_3 = (width * 3 == (int32_t)outputs[0]->attr.size[0] * 4) &&
(height * 3 == (int32_t)outputs[0]->attr.size[1] * 4);

@ -568,7 +642,7 @@ static vsi_status _query_kernel
}
}

key = PRE_PROCESS_RGB888_PLANAR_HASH_KEY( input0_dtype, output_dtype, scale_type);
key = PRE_PROCESS_RGB888_PLANAR_HASH_KEY( input0_dtype, output_dtype, is_rgb888_sep, scale_type);

for ( i = 0; i < _cnt_of_array(pre_process_rgb888_planar_kernel_map); i ++ )
{

@ -581,8 +655,17 @@ static vsi_status _query_kernel
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s",
pre_process_rgb888_planar_kernel_map[i].function_name );
kernel->info.parameters = _pre_process_rgb888_planar_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def );

if (is_rgb888_sep)
{
kernel->info.parameters = _pre_process_rgb888_planar_sep_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def );
}
else
{
kernel->info.parameters = _pre_process_rgb888_planar_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def );
}

if (enable_copy)
{
@ -620,8 +703,9 @@ static vsi_nn_kernel_node_t _setup
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_PRE_PROCESS_RGB888_PLANAR_PARAM_NUM];
vsi_nn_kernel_node_param_t* node_params = NULL;
vsi_nn_kernel_node_t node = NULL;
int32_t param_count = _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM;
int32_t width = vsi_nn_kernel_param_get_int32( params, "width" );
int32_t height = vsi_nn_kernel_param_get_int32( params, "height" );
float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" );

@ -630,7 +714,10 @@ static vsi_nn_kernel_node_t _setup
float scale = vsi_nn_kernel_param_get_float32( params, "scale" );
vsi_bool is_no_range_change = FALSE;

if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
input_num = inputs[1] == NULL ? 1 : input_num;
param_count = inputs[1] == NULL ? _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM : param_count;

if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;

@ -648,17 +735,19 @@ static vsi_nn_kernel_node_t _setup
status = _query_kernel( inputs, outputs, kernel, params, is_no_range_change, width, height );
if ( VSI_SUCCESS == status)
{
node_params = (vsi_nn_kernel_node_param_t *)malloc(sizeof(vsi_nn_kernel_node_param_t) * param_count);
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 6;
uint32_t index = inputs[1] == NULL ? 4 : 6;
uint32_t scalar_index = index;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" );
int32_t left = vsi_nn_kernel_param_get_int32( params, "left" );
int32_t top = vsi_nn_kernel_param_get_int32( params, "top" );

/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM,
vsi_nn_kernel_node_pack_io( node_params, param_count,
inputs, input_num, outputs, output_num );

node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
@ -670,17 +759,21 @@ static vsi_nn_kernel_node_t _setup
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
status = vsi_nn_kernel_node_pass_param( node, node_params, param_count );
index = scalar_index;
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
}
}

vsi_nn_safe_free(node_params);

return node;
} /* _setup() */

@ -150,8 +150,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );

out_shape = attr[0]->shape;
outputZP = (float)attr[0]->asymm.zero_point;
outputScale = attr[0]->asymm.scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);

@ -176,7 +174,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f / outputScale;
outputScale = 1.0f / attr[0]->asymm.scale;
outputZP = (float)attr[0]->asymm.zero_point;
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{

@ -135,8 +135,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );

out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
dstScale = attr[0]->asymm.scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);

@ -151,9 +149,22 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
width = width / 3;
}

if (attr[0]->dtype == U8)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f / dstScale;
dstScale = 1.0f / attr[0]->asymm.scale;
dstZP = attr[0]->asymm.zero_point;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
dstZP = 0;
}

shaderParam.global_scale[0] = 16;
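The new DFP branch mirrors the ASYMM one: under dynamic fixed point a real value r maps to the integer q = r * 2^fl, so the quantize scale applied on the output side is 2^fl for a positive fractional length and 1 / 2^|fl| otherwise. A self-contained sketch of that mapping, matching the dstScale computation above (illustration only):

#include <stdint.h>

/* Output-side quantize scale for a dynamic-fixed-point tensor with
 * fractional length fl. */
static float dfp_quant_scale(int32_t fl)
{
    return (fl > 0) ? (float)((int64_t)1 << fl)
                    : 1.0f / (float)((int64_t)1 << -fl);
}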
@ -130,8 +130,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );

out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
dstScale = attr[0]->asymm.scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);

@ -141,9 +139,22 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
order1 = 0;
}

if (attr[0]->dtype == U8)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f / dstScale;
dstScale = 1.0f / attr[0]->asymm.scale;
dstZP = attr[0]->asymm.zero_point;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
dstZP = 0;
}

shaderParam.global_scale[0] = 16;

@ -51,6 +51,7 @@ typedef enum
UP_3X_HALF,
UP_4X_HALF,
UP_8X_HALF,
UP_8X_ALIGN,
} _internal_scale_e;

#define _RESIZE_BILINEAR_KERNEL_SOURCE(_input_type) "resize_bilinear_"#_input_type

@ -102,6 +103,12 @@ typedef enum
"_SAME_3x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }

#define PACK_KERNEL_MAP_UP_8X_ALIGN( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_ALIGN ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_8x_upsample_align_corners"), \
"resize_bilinear_align_corners" }

typedef struct
{
uint32_t key;
@ -128,6 +135,7 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] =
PACK_KERNEL_MAP_UP_3X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_4X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_8X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_8X_ALIGN(U8, U8),
};

@ -228,11 +236,6 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
uint32_t out_height;
float half_pixel_value = 0.0f;
vsi_bool is_use_scale_kernel = (vsi_bool)(_RESIZE_BILINEAR_PARAM_NUM == param_size);
vsi_bool is_half_pixel_centers = FALSE;
vsi_bool is_2x_up_kernel = FALSE;
vsi_bool is_3x_up_kernel = FALSE;
vsi_bool is_4x_up_kernel = FALSE;
vsi_bool is_8x_up_kernel = FALSE;

input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
@ -257,20 +260,20 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)

if (align_corners && out_width > 1)
{
scale_factor[0] = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1);
scale_factor[0] = ((float)(in_width - 1) * 1.0f) / (float)(out_width - 1);
}
else
{
scale_factor[0] = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width;
scale_factor[0] = ((float)in_width * 1.0f) / (float)out_width;
}

if (align_corners && out_height > 1)
{
scale_factor[1] = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1);
scale_factor[1] = ((float)(in_height - 1) * 1.0f) / (float)(out_height - 1);
}
else
{
scale_factor[1] = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height;
scale_factor[1] = ((float)in_height * 1.0f) / (float)out_height;
}

if (half_pixel_centers)
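These scale factors feed the usual bilinear source-coordinate mapping: with align_corners the endpoints of the two grids coincide, otherwise the grids are scaled edge-to-edge and half_pixel_value (set just below) shifts sampling to pixel centers. A scalar sketch of the mapping commonly used with these parameters (an illustration, not the shader code itself):

/* Source x for output column x_out; half_pixel is 0.5f when
 * half_pixel_centers is enabled and 0.0f otherwise. */
static float src_coord(int x_out, float scale_factor, float half_pixel)
{
    return ((float)x_out + half_pixel) * scale_factor - half_pixel;
}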
@ -282,16 +285,6 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
half_pixel_value = 0.0f;
}

is_half_pixel_centers = (!align_corners) && (half_pixel_centers);

if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr)) && is_half_pixel_centers)
{
is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height);
is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height);
is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height);
is_8x_up_kernel = (8 * in_width == out_width) && (8 * in_height == out_height);
}

if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input_scale = input_attr->asymm.scale;
@ -302,11 +295,11 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
srcFixPointPos = input_attr->dfp.fl;
if (srcFixPointPos >= 0)
{
input_scale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos);
}
else if (srcFixPointPos < 0)
{
input_scale = (vx_float32)((int64_t)1 << -srcFixPointPos);
input_scale = (float)((int64_t)1 << -srcFixPointPos);
}
inputZP = 0;
}

@ -326,11 +319,11 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos >= 0)
{
output_scale = (vx_float32) ((int64_t)1 << dstFixPointPos);
output_scale = (float) ((int64_t)1 << dstFixPointPos);
}
else if (dstFixPointPos < 0)
{
output_scale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos);
output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos);
}
outputZP = 0;
}
@ -340,226 +333,11 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
outputZP = 0;
}

if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel)
{
gpu_param.global_scale[0] = 16;
gpu_param.global_scale[1] = 1;
}
else if (is_3x_up_kernel)
{
gpu_param.global_scale[0] = 15;
gpu_param.global_scale[1] = 6;
gpu_param.global_scale[2] = 1;
}
else
{
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;

if (is_2x_up_kernel)
{
gpu_dp_inst_t uniResize2xUp_0_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize2xUp_1_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16};

status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_3x_up_kernel)
{
gpu_dp_inst_t uniResize3xUp_l00_2x8 = {{
0x15515515, // TCfg
0x00000000, // ASelt
0x21210110, 0x03323202, // ABin
0x2aa2aa2a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555,
0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l01_2x8 = {{
0x05155155, // TCfg
0x00000000, // ASelt
0x54044343, 0x00650554, // ABin
0x0a2aa2aa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa,
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l10_4x4 = {{
0x55551155, // TCfg
0x50501050, // ASelt
0x01011010, 0x21212121, // ABin
0xaaaa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l11_4x4 = {{
0x11555511, // TCfg
0x10505010, // ASelt
0x32320202, 0x03033232, // ABin
0x22aaaa22, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72,
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l12_4x4 = {{
0x55115555, // TCfg
0x50105050, // ASelt
0x43434343, 0x54540404, // ABin
0xaa22aaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39,
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l13_4x4 = {{
0x00551155, // TCfg
0x00501050, // ASelt
0x05055454, 0x00006565, // ABin
0x00aa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};

status = vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_4x_up_kernel)
{
gpu_dp_inst_t uniResize4xUp_l00_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l01_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l10_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l11_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16};

status = vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_8x_up_kernel)
{
gpu_dp_inst_t uniResize8xUp_l00_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l01_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l10_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l11_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l20_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l21_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l30_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l31_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16};

status = vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
float dfpScale = input_scale * output_scale;
gpu_dp_inst_t uniConvertDFP2FP32_4x4 = {{
@ -840,7 +618,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
else if (F16 == input_dtype && (U8 == output_dtype || F16 == output_dtype))
{
float uint8Scale = 1.0f / output_scale;
float uint8ZP_out = (vx_float32)outputZP;
float uint8ZP_out = (float)outputZP;
gpu_dp_inst_t uniExtact8Bit_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
@ -1045,11 +823,299 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
goto final;
}

if (!is_2x_up_kernel && !is_3x_up_kernel && !is_4x_up_kernel&& !is_8x_up_kernel)
status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value);
CHECK_STATUS_FAIL_GOTO(status, final );

gpu_param.global_size[0] = gpu_align_p2((out_width + \
gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1];
gpu_param.global_size[2] = depth / gpu_param.global_scale[2];

status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr );
if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr );
return status;
} /* _resize_bilinear_initializer() */

DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * out_shape = NULL;
vsi_size_array_t * in_shape = NULL;
vsi_nn_kernel_dtype_e input_dtype = F16;
uint32_t depth = 0;
uint32_t in_width = 0;
uint32_t in_height = 0;
uint32_t out_width = 0;
uint32_t out_height = 0;
vsi_bool is_2x_up_kernel = FALSE;
vsi_bool is_3x_up_kernel = FALSE;
vsi_bool is_4x_up_kernel = FALSE;
vsi_bool is_8x_up_kernel = FALSE;

input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );

out_shape = output_attr->shape;
in_shape = input_attr->shape;
input_dtype = input_attr->dtype;

in_width = (uint32_t)(in_shape->data[0]);
in_height = (uint32_t)(in_shape->data[1]);
depth = (uint32_t)(in_shape->data[2]);
out_width = (uint32_t)(out_shape->data[0]);
out_height = (uint32_t)(out_shape->data[1]);

if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr)))
{
status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value);
is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height);
is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height);
is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height);
is_8x_up_kernel = (8 * in_width == out_width) && (8 * in_height == out_height);
}

if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel)
{
gpu_param.global_scale[0] = 16;
gpu_param.global_scale[1] = 1;
}
else if (is_3x_up_kernel)
{
gpu_param.global_scale[0] = 15;
gpu_param.global_scale[1] = 6;
gpu_param.global_scale[2] = 1;
}
else
{
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
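The tile sizes follow from the upsample factors: with global_scale[0] = 16 the 2x/4x/8x kernels emit 16 output pixels per work-item along x (a whole 8, 4, or 2 input pixels respectively), while the 3x kernel uses a 15x6 output tile so that each work-item still covers an integral 5x2 block of input pixels.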
if (is_2x_up_kernel)
{
gpu_dp_inst_t uniResize2xUp_0_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize2xUp_1_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16};

status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_3x_up_kernel)
{
gpu_dp_inst_t uniResize3xUp_l00_2x8 = {{
0x15515515, // TCfg
0x00000000, // ASelt
0x21210110, 0x03323202, // ABin
0x2aa2aa2a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555,
0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l01_2x8 = {{
0x05155155, // TCfg
0x00000000, // ASelt
0x54044343, 0x00650554, // ABin
0x0a2aa2aa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa,
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l10_4x4 = {{
0x55551155, // TCfg
0x50501050, // ASelt
0x01011010, 0x21212121, // ABin
0xaaaa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l11_4x4 = {{
0x11555511, // TCfg
0x10505010, // ASelt
0x32320202, 0x03033232, // ABin
0x22aaaa22, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72,
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l12_4x4 = {{
0x55115555, // TCfg
0x50105050, // ASelt
0x43434343, 0x54540404, // ABin
0xaa22aaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39,
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l13_4x4 = {{
0x00551155, // TCfg
0x00501050, // ASelt
0x05055454, 0x00006565, // ABin
0x00aa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};

status = vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_4x_up_kernel)
{
gpu_dp_inst_t uniResize4xUp_l00_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l01_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l10_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l11_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16};

status = vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_8x_up_kernel)
{
gpu_dp_inst_t uniResize8xUp_l00_4x8 = {{
0x55555555, 0x55555555, // TCfg
|
||||
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
|
||||
0x00000708, // AccumType, ConstantType, and PostShift
|
||||
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
|
||||
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
gpu_dp_inst_t uniResize8xUp_l01_4x8 = {{
|
||||
0x55555555, 0x55555555, // TCfg
|
||||
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
|
||||
0x00000708, // AccumType, ConstantType, and PostShift
|
||||
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
|
||||
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
gpu_dp_inst_t uniResize8xUp_l10_4x8 = {{
|
||||
0x55555555, 0x55555555, // TCfg
|
||||
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
|
||||
0x00000708, // AccumType, ConstantType, and PostShift
|
||||
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
|
||||
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
gpu_dp_inst_t uniResize8xUp_l11_4x8 = {{
|
||||
0x55555555, 0x55555555, // TCfg
|
||||
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
|
||||
0x00000708, // AccumType, ConstantType, and PostShift
|
||||
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
|
||||
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
gpu_dp_inst_t uniResize8xUp_l20_4x8 = {{
|
||||
0x55555555, 0x55555555, // TCfg
|
||||
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
|
||||
0x00000708, // AccumType, ConstantType, and PostShift
|
||||
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
|
||||
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
gpu_dp_inst_t uniResize8xUp_l21_4x8 = {{
|
||||
0x55555555, 0x55555555, // TCfg
|
||||
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
|
||||
0x00000708, // AccumType, ConstantType, and PostShift
|
||||
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
|
||||
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
gpu_dp_inst_t uniResize8xUp_l30_4x8 = {{
|
||||
0x55555555, 0x55555555, // TCfg
|
||||
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
|
||||
0x00000708, // AccumType, ConstantType, and PostShift
|
||||
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
|
||||
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
gpu_dp_inst_t uniResize8xUp_l31_4x8 = {{
|
||||
0x55555555, 0x55555555, // TCfg
|
||||
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
|
||||
0x00000708, // AccumType, ConstantType, and PostShift
|
||||
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
|
||||
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
|
||||
status = vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
}
|
||||
else
|
||||
{
|
||||
VSILOGE("input or output's format is not support");
|
||||
status = VSI_FAILURE;
|
||||
goto final;
|
||||
}
|
||||
|
||||
if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel)
|
||||
{
|
||||
|
|
@ -1071,7 +1137,168 @@ final:
|
|||
if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr );
|
||||
if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr );
|
||||
return status;
|
||||
} /* _resize_bilinear_initializer() */
|
||||
} /* _bilinear_half_pixel_centers_opt_initializer() */

DEF_KERNEL_INITIALIZER(_bilinear_align_corners_opt_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };
    vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
    vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
    vsi_size_array_t * out_shape = NULL;
    vsi_size_array_t * in_shape = NULL;
    vsi_nn_kernel_dtype_e input_dtype = F16;
    uint32_t depth = 0;
    float scale_factor[2] = {0};
    uint32_t in_width = 0;
    uint32_t in_height = 0;
    uint32_t out_width = 0;
    uint32_t out_height = 0;
    vsi_bool is_8x_align_corners = FALSE;

    input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
    output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
    CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );

    out_shape = output_attr->shape;
    in_shape = input_attr->shape;
    input_dtype = input_attr->dtype;

    in_width = (uint32_t)(in_shape->data[0]);
    in_height = (uint32_t)(in_shape->data[1]);
    depth = (uint32_t)(in_shape->data[2]);
    out_width = (uint32_t)(out_shape->data[0]);
    out_height = (uint32_t)(out_shape->data[1]);

    if (out_width > 1)
    {
        scale_factor[0] = ((float)(in_width - 1) * 1.0f) / (float)(out_width - 1);
    }
    else
    {
        scale_factor[0] = ((float)in_width * 1.0f) / (float)out_width;
    }

    if (out_height > 1)
    {
        scale_factor[1] = ((float)(in_height - 1) * 1.0f) / (float)(out_height - 1);
    }
    else
    {
        scale_factor[1] = ((float)in_height * 1.0f) / (float)out_height;
    }
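
A quick arithmetic check of the align-corners scale computed above (illustrative sizes, not taken from the patch): with align corners the per-axis scale is (in - 1) / (out - 1), so it equals exactly 0.125f only when out == 8 * in - 7.

/* hypothetical sizes chosen so the 8x align-corners path is taken */
uint32_t in_w  = 9;
uint32_t out_w = 65;                                     /* 8 * 9 - 7 */
float scale = (float)(in_w - 1) / (float)(out_w - 1);    /* 8 / 64 == 0.125f */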

    if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr)))
    {
        is_8x_align_corners = (scale_factor[0] == scale_factor[1]) && (scale_factor[0] == 0.125f);
    }

    if (is_8x_align_corners)
    {
        gpu_param.global_scale[0] = 2;
        gpu_param.global_scale[1] = 1;
        gpu_param.global_scale[2] = 1;
    }

    if (is_8x_align_corners)
    {
        gpu_dp_inst_t uniBilinear_8x_l10_4x8 = {{
            0x55555505, 0x55555555, // TCfg
            0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect
            0x00000406, // AccumType, ConstantType, and PostShift
            0x00000838, 0x01070731, 0x02060e2a, 0x03051523,
            0x04041c1c, 0x05032315, 0x06022a0e, 0x07013107 // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniBilinear_8x_l11_4x8 = {{
            0x55555505, 0x55555555, // TCfg
            0x44100221, 0x19444194, 0x44419444, 0x41944419, 0x94441944, // BinSelect
            0x00000406, // AccumType, ConstantType, and PostShift
            0x00000838, 0x01070731, 0x02060e2a, 0x03051523,
            0x04041c1c, 0x05032315, 0x06022a0e, 0x07013107 // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniBilinear_8x_l20_4x8 = {{
            0x55555505, 0x55555555, // TCfg
            0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect
            0x00000406, // AccumType, ConstantType, and PostShift
            0x00001030, 0x020e062a, 0x040c0c24, 0x060a121e,
            0x08081818, 0x0a061e12, 0x0c04240c, 0x0e022a06 // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniBilinear_8x_l21_4x8 = {{
            0x55555505, 0x55555555, // TCfg
            0x44100221, 0x19444194, 0x44419444, 0x41944419, 0x94441944, // BinSelect
            0x00000406, // AccumType, ConstantType, and PostShift
            0x00001030, 0x020e062a, 0x040c0c24, 0x060a121e,
            0x08081818, 0x0a061e12, 0x0c04240c, 0x0e022a06 // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniBilinear_8x_l30_4x8 = {{
            0x55555505, 0x55555555, // TCfg
            0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect
            0x00000406, // AccumType, ConstantType, and PostShift
            0x00001828, 0x03150523, 0x06120a1e, 0x090f0f19,
            0x0c0c1414, 0x0f09190f, 0x12061e0a, 0x15032305 // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniBilinear_8x_l31_4x8 = {{
            0x55555505, 0x55555555, // TCfg
            0x44100221, 0x19444194, 0x44419444, 0x41944419, 0x94441944, // BinSelect
            0x00000406, // AccumType, ConstantType, and PostShift
            0x00001828, 0x03150523, 0x06120a1e, 0x090f0f19,
            0x0c0c1414, 0x0f09190f, 0x12061e0a, 0x15032305 // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniBilinear_8x_l40_4x8 = {{
            0x55555505, 0x55555555, // TCfg
            0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect
            0x00000406, // AccumType, ConstantType, and PostShift
            0x00002020, 0x041c041c, 0x08180818, 0x0c140c14,
            0x10101010, 0x140c140c, 0x18081808, 0x1c041c04 // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniBilinear_8x_l41_4x8 = {{
            0x55555505, 0x55555555, // TCfg
            0x44100221, 0x19444194, 0x44419444, 0x41944419, 0x94441944, // BinSelect
            0x00000406, // AccumType, ConstantType, and PostShift
            0x00002020, 0x041c041c, 0x08180818, 0x0c140c14,
            0x10101010, 0x140c140c, 0x18081808, 0x1c041c04 // Constant
        }, GPU_DP_TYPE_16};

        status = vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l10_4x8", &uniBilinear_8x_l10_4x8);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l11_4x8", &uniBilinear_8x_l11_4x8);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l20_4x8", &uniBilinear_8x_l20_4x8);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l21_4x8", &uniBilinear_8x_l21_4x8);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l30_4x8", &uniBilinear_8x_l30_4x8);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l31_4x8", &uniBilinear_8x_l31_4x8);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l40_4x8", &uniBilinear_8x_l40_4x8);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l41_4x8", &uniBilinear_8x_l41_4x8);
        CHECK_STATUS_FAIL_GOTO(status, final );
    }
    else
    {
        VSILOGE("input or output's format is not supported");
        status = VSI_FAILURE;
        goto final;
    }

    gpu_param.global_size[0] = gpu_align_p2((in_width + \
        gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
    gpu_param.global_size[1] = (in_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1];
    gpu_param.global_size[2] = depth / gpu_param.global_scale[2];

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
    if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr );
    if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr );
    return status;
} /* _bilinear_align_corners_opt_initializer() */
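
The work-group sizing above divides each dimension by its global_scale and rounds the x dimension up to a multiple of 4; a minimal sketch of that rounding, assuming gpu_align_p2(n, a) aligns n up to the power-of-two a:

/* assumption: mirrors gpu_align_p2(n, 4) as used in the initializer above */
static inline uint32_t align_up_p2(uint32_t n, uint32_t a)
{
    return (n + a - 1) & ~(a - 1); /* a must be a power of two */
}
/* e.g. in_width 30 with global_scale[0] == 2 -> 15 work items -> padded to 16 */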

/*
 * Query kernel

@@ -1098,19 +1325,46 @@ static vsi_status _query_kernel
    vx_kernel_initialize_f initializer = _resize_bilinear_initializer;
    uint32_t key;
    uint32_t i;
    vsi_bool is_2x_upsample = (2 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
        && (2 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
    vsi_bool is_3x_upsample = (3 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
        && (3 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
    vsi_bool is_4x_upsample = (4 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
        && (4 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
    vsi_bool is_8x_upsample = (8 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
        && (8 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
    float width_scale = 0;
    float height_scale = 0;
    vsi_size_t input_width = inputs[0]->attr.size[0];
    vsi_size_t input_height = inputs[0]->attr.size[1];
    vsi_size_t output_width = outputs[0]->attr.size[0];
    vsi_size_t output_height = outputs[0]->attr.size[1];
    vsi_bool is_2x_upsample = (2 * input_width == output_width) \
        && (2 * input_height == output_height);
    vsi_bool is_3x_upsample = (3 * input_width == output_width) \
        && (3 * input_height == output_height);
    vsi_bool is_4x_upsample = (4 * input_width == output_width) \
        && (4 * input_height == output_height);
    vsi_bool is_8x_upsample = (8 * input_width == output_width) \
        && (8 * input_height == output_height);
    vsi_bool is_8x_align_corners = FALSE;
    _internal_scale_e scale_flag = UP;

    if (align_corners && outputs[0]->attr.size[0] > 1)
    {
        width_scale = ((float)(input_width - 1) * 1.0f) / (float)(output_width - 1);
    }
    else
    {
        width_scale = ((float)input_width * 1.0f) / (float)output_width;
    }

    if (align_corners && output_height > 1)
    {
        height_scale = ((float)(input_height - 1) * 1.0f) / (float)(output_height - 1);
    }
    else
    {
        height_scale = ((float)input_height * 1.0f) / (float)output_height;
    }

    in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    is_8x_align_corners = (vsi_bool)( width_scale == 0.125f && height_scale == 0.125f && in_dtype == U8 );

    is_2x_upsample &= (in_dtype == U8);
    is_3x_upsample &= (in_dtype == U8);
    is_4x_upsample &= (in_dtype == U8);

@@ -1121,18 +1375,27 @@ static vsi_status _query_kernel
    if (is_same_type && (!align_corners) && (half_pixel_centers) && is_2x_upsample)
    {
        scale_flag = UP_2X_HALF;
        initializer = _bilinear_half_pixel_centers_opt_initializer;
    }
    else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_3x_upsample)
    {
        scale_flag = UP_3X_HALF;
        initializer = _bilinear_half_pixel_centers_opt_initializer;
    }
    else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_4x_upsample)
    {
        scale_flag = UP_4X_HALF;
        initializer = _bilinear_half_pixel_centers_opt_initializer;
    }
    else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_8x_upsample)
    {
        scale_flag = UP_8X_HALF;
        initializer = _bilinear_half_pixel_centers_opt_initializer;
    }
    else if (is_same_type && (align_corners) && (!half_pixel_centers) && is_8x_align_corners)
    {
        scale_flag = UP_8X_ALIGN;
        initializer = _bilinear_align_corners_opt_initializer;
    }
    else if (is_same_type && is_evis2)
    {
@@ -1240,20 +1503,20 @@ static vsi_nn_tensor_t* _create_scale_tensor

    if (align_corners && width > 1)
    {
        width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(width - 1);
        width_scale = ((float)(input_width - 1) * 1.0f) / (float)(width - 1);
    }
    else
    {
        width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)width;
        width_scale = ((float)input_width * 1.0f) / (float)width;
    }

    if (align_corners && height > 1)
    {
        height_scale = ((vx_float32)(input_height - 1) * 1.0f) / (vx_float32)(height - 1);
        height_scale = ((float)(input_height - 1) * 1.0f) / (float)(height - 1);
    }
    else
    {
        height_scale = ((vx_float32)input_height * 1.0f) / (vx_float32)height;
        height_scale = ((float)input_height * 1.0f) / (float)height;
    }


@@ -1273,7 +1536,7 @@ static vsi_nn_tensor_t* _create_scale_tensor
    int32_t h0 = 0;
    if (half_pixel_centers)
    {
        input_h = ((vx_float32)y + 0.5f) * height_scale - 0.5f;
        input_h = ((float)y + 0.5f) * height_scale - 0.5f;
    }
    else
    {

@@ -1291,7 +1554,7 @@ static vsi_nn_tensor_t* _create_scale_tensor
    float br = 0.0f;
    if (half_pixel_centers)
    {
        input_w = ((vx_float32)x + 0.5f) * width_scale - 0.5f;
        input_w = ((float)x + 0.5f) * width_scale - 0.5f;
    }
    else
    {


@@ -51,6 +51,15 @@ __BEGIN_DECLS
        "_"STR(UP_SCALE)"x_upsample_half_pixel_centers"), \
        "resize_bilinear_nhwc" }

#define BILINEAR_NHWC_BOUND_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_SCALE ) \
    (( IN_DTYPE ) | ( OUT_DTYPE << 8) | (UP_SCALE << 16))

#define BILINEAR_NHWC_BOUND_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, UP_SCALE ) \
    { BILINEAR_NHWC_BOUND_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_SCALE ), \
      CVIVANTE_NAMESPACE("evis.resize_bilinear_nhwc_bound_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
        "_"STR(UP_SCALE)"x"), \
      "resize_bilinear_nhwc_bound" }

typedef struct
{
    uint32_t key;

@@ -65,6 +74,12 @@ static const _kernel_map_type _resize_bilinear_nhwc_kernel_map[] =
    BILINEAR_NHWC_PACK_KERNEL_MAP_UP_SCALE(U8, U8, 1, 0, 4),
};

static const _kernel_map_type _bilinear_nhwc_bound_kernel_map[] =
{
    BILINEAR_NHWC_BOUND_KERNEL_MAP(U8, U8, 2),
    BILINEAR_NHWC_BOUND_KERNEL_MAP(U8, U8, 3),
    BILINEAR_NHWC_BOUND_KERNEL_MAP(U8, U8, 4),
};

/*
 * Kernel params

@@ -81,6 +96,14 @@ static vx_param_description_t _resize_bilinear_nhwc_kernel_param_def[] =
#define SCALAR_ALIGN_CORNERS (2)
#define SCALAR_HALF_PIXEL (3)

static vx_param_description_t _bilinear_nhwc_bound_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
};
#define _BILINEAR_NHWC_BOUND_PARAM_NUM _cnt_of_array( _bilinear_nhwc_bound_kernel_param_def )

/*
 * Kernel initializer
 */

@@ -382,50 +405,193 @@ final:
    return status;
} /* _resize_bilinear_initializer() */

DEF_KERNEL_INITIALIZER(_bilinear_nhwc_bound_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };
    vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
    vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
    vsi_size_array_t * in_shape = NULL;
    vsi_size_array_t * out_shape = NULL;
    uint32_t x_coord[2] = {0};
    uint32_t in_width;
    uint32_t in_height;
    uint32_t out_width;
    uint32_t out_height;
    vsi_bool is_2x_up_kernel = FALSE;
    vsi_bool is_3x_up_kernel = FALSE;
    vsi_bool is_4x_up_kernel = FALSE;

    input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
    output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
    CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );

    in_shape = input_attr->shape;
    out_shape = output_attr->shape;

    in_width = (uint32_t)(in_shape->data[0]);
    in_height = (uint32_t)(in_shape->data[1]);
    out_width = (uint32_t)(out_shape->data[0]);
    out_height = (uint32_t)(out_shape->data[1]);

    is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height);
    is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height);
    is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height);

    if (is_2x_up_kernel)
    {
        gpu_dp_inst_t uniResize_x2_nhwc2_0_4x8 = {{
            0x55555511, 0x55555555, // TCfg
            0x46104000, 0x3a48829c, 0x4882acca, 0xc4acca3a, 0xbd4e5b50, // BinSelect
            0x00000704, // AccumType, ConstantType, and PostShift
            0x000c0004, 0x09030301, 0x03090103, 0x03090103,
            0x09030301, 0x09030301, 0x03090103, 0x03090103 // Constant
        }, GPU_DP_TYPE_16};

        gpu_param.global_scale[0] = 2;
        gpu_param.global_scale[1] = 1;
        x_coord[1] = (uint32_t)(out_shape->data[0]) - 2;
        x_coord[0] = (x_coord[1] * 2 - 1) >> 2;

        status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x2_nhwc2_0_4x8", &uniResize_x2_nhwc2_0_4x8);
        CHECK_STATUS_FAIL_GOTO(status, final );
    }
    else if (is_3x_up_kernel)
    {
        gpu_dp_inst_t uniResize_x3_nhwc2_l10_4x4 = {{
            0x05055511, // TCfg
            0x04045010, // ASelt
            0x31310000, 0x00330022, // ABin
            0x0a0aaa22, // BSelt
            0x00000000, 0x00000000, // BBin
            0x0000060f, // AccumType, ConstantType, and PostShift
            0x00005556, 0x00002aab, 0x38e41c72, 0x1c720e39,
            0x2aab5556, 0x00000000, 0x2aab5556, 0x00000000 // Constant
        }, GPU_DP_TYPE_16};

        gpu_param.global_scale[0] = 3;
        gpu_param.global_scale[1] = 1;
        x_coord[1] = (uint32_t)(out_shape->data[0]) - 2;
        x_coord[0] = (x_coord[1] - 1) / 6 * 2;

        status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l10_4x4", &uniResize_x3_nhwc2_l10_4x4);
        CHECK_STATUS_FAIL_GOTO(status, final );
    }
    else if (is_4x_up_kernel)
    {
        gpu_dp_inst_t uniResize_x4_nhwc2_l00_4x8 = {{
            0x55555511, 0x55555555, // TCfg
            0x46104000, 0x1940409c, 0x48829c46, 0x82acca3a, 0xacca3a48, // BinSelect
            0x00000706, // AccumType, ConstantType, and PostShift
            0x00280018, 0x190f0f09, 0x23051503, 0x23051503,
            0x05230315, 0x05230315, 0x0f19090f, 0x0f19090f // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniResize_x4_nhwc2_l10_4x8 = {{
            0x55555511, 0x55555555, // TCfg
            0x46104000, 0x1940409c, 0x48829c46, 0x82acca3a, 0xacca3a48, // BinSelect
            0x00000706, // AccumType, ConstantType, and PostShift
            0x00380008, 0x23150503, 0x31070701, 0x31070701,
            0x07310107, 0x07310107, 0x15230305, 0x15230305 // Constant
        }, GPU_DP_TYPE_16};

        gpu_param.global_scale[0] = 4;
        gpu_param.global_scale[1] = 1;
        x_coord[1] = (uint32_t)(out_shape->data[0]) - 2;
        x_coord[0] = ((x_coord[1] - 3) >> 3) * 2;

        status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x4_nhwc2_l00_4x8", &uniResize_x4_nhwc2_l00_4x8);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x4_nhwc2_l10_4x8", &uniResize_x4_nhwc2_l10_4x8);
        CHECK_STATUS_FAIL_GOTO(status, final );
    }
    else
    {
        VSILOGE("input or output's format is not supported");
        status = VSI_FAILURE;
        goto final;
    }

    gpu_param.global_size[0] = gpu_align_p2((out_height + \
        gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
    gpu_param.global_size[1] = 1;
    gpu_param.dim = 2;

    status |= vsi_nn_kernel_gpu_add_param( node, "x_coord", &x_coord);
    CHECK_STATUS_FAIL_GOTO(status, final );

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
    if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr );
    if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr );

    return status;
} /* _bilinear_nhwc_bound_initializer() */
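
For orientation, the x_coord pair set in each branch above marks where the right-edge handling starts in the output row; evaluating the 2x branch with an illustrative width (example values, not from the patch):

/* hypothetical out_width of 16 in the 2x branch */
uint32_t x1 = 16 - 2;            /* x_coord[1] == 14 */
uint32_t x0 = (x1 * 2 - 1) >> 2; /* x_coord[0] == 27 >> 2 == 6 */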

/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs,
    int32_t align_corners,
    int32_t half_pixel_centers,
    uint32_t up_scale
    const uint32_t hashkey,
    uint32_t kernel_id
    )
{
    vx_kernel_initialize_f initializer = NULL;
    vx_param_description_t * param_def;
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e in_dtype;
    vsi_nn_kernel_dtype_e out_dtype;
    const _kernel_map_type * kernel_map = _resize_bilinear_nhwc_kernel_map;
    size_t kernel_map_size = _cnt_of_array( _resize_bilinear_nhwc_kernel_map );
    vx_param_description_t * param_def = _resize_bilinear_nhwc_kernel_param_def;
    size_t param_def_size = _cnt_of_array( _resize_bilinear_nhwc_kernel_param_def );
    vx_kernel_initialize_f initializer = _resize_bilinear_nhwc_initializer;
    uint32_t key;
    uint32_t i;
    const _kernel_map_type* kernel_map;
    size_t kernel_map_size;
    size_t param_size;
    uint32_t i = 0;

    in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    in_dtype = in_dtype == I8 ? U8 : in_dtype;
    out_dtype = out_dtype == I8 ? U8 : out_dtype;

    key = RESIZE_BILINEAR_NHWC_HASH_KEY( in_dtype, out_dtype, half_pixel_centers, align_corners, up_scale );
    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
    switch( kernel_id )
    {
    if ( kernel_map[i].key == key )
    case 0:
        initializer = _resize_bilinear_nhwc_initializer;
        kernel_map = _resize_bilinear_nhwc_kernel_map;
        kernel_map_size = _cnt_of_array( _resize_bilinear_nhwc_kernel_map );
        param_def = _resize_bilinear_nhwc_kernel_param_def;
        param_size = _RESIZE_BILINEAR_NHWC_PARAM_NUM;
        break;
    case 1:
        initializer = _bilinear_nhwc_bound_initializer;
        kernel_map = _bilinear_nhwc_bound_kernel_map;
        kernel_map_size = _cnt_of_array( _bilinear_nhwc_bound_kernel_map );
        param_def = _bilinear_nhwc_bound_kernel_param_def;
        param_size = _BILINEAR_NHWC_BOUND_PARAM_NUM;
        break;
    default:
        VSI_ASSERT( FALSE );
        return VSI_FAILURE;
    }

    for( i = 0; i < kernel_map_size; i ++ )
    {
        if( kernel_map[i].key == hashkey )
        {
            break;
        }
    }

    if ( i < kernel_map_size )
    if( i < kernel_map_size )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = param_def;
        kernel->info.numParams = (uint32_t)param_def_size;
        kernel->info.numParams = (uint32_t)param_size;
        kernel->info.initialize = initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
@@ -453,7 +619,8 @@ static vsi_nn_kernel_node_t _setup
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_RESIZE_BILINEAR_NHWC_PARAM_NUM] = {NULL};
    vsi_nn_kernel_node_param_t node0_params[_RESIZE_BILINEAR_NHWC_PARAM_NUM] = {NULL};
    vsi_nn_kernel_node_param_t node1_params[_BILINEAR_NHWC_BOUND_PARAM_NUM] = {NULL};
    vsi_nn_kernel_node_t node = NULL;
    int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
    int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );

@@ -463,8 +630,14 @@ static vsi_nn_kernel_node_t _setup
    float scale_y = (float)outputs[0]->attr.size[2] / (float)inputs[0]->attr.size[2];
    float up_scale = scale_x == scale_y ? scale_x : 0;
    uint32_t rank = inputs[0]->attr.dim_num;
    vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
    vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
    vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{ 1 }};
    vsi_nn_kernel_t * ikernels[2] = { NULL };
    uint32_t hashkeys[2] = {0};
    uint32_t i = 0;
    vsi_nn_tensor_attr_t attr;
    vsi_nn_kernel_dtype_e in_dtype;
    vsi_nn_kernel_dtype_e out_dtype;

    if (!is_same_type || depth != 2 || rank < 3 ||
        (up_scale != 2.0f && up_scale != 3.0f && up_scale != 4.0f))

@@ -472,8 +645,24 @@ static vsi_nn_kernel_node_t _setup
        return NULL;
    }

    status = _query_kernel( kernel, inputs, outputs,
        align_corners, half_pixel_centers, (uint32_t)up_scale);
    ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
    // Assign unique_id
    ikernels[0]->unique_id = kernel->unique_id;
    ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
    // Assign unique_id
    ikernels[1]->unique_id = kernel->unique_id;

    in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    hashkeys[0] = RESIZE_BILINEAR_NHWC_HASH_KEY( in_dtype, out_dtype, half_pixel_centers,
        align_corners, (vsi_size_t)up_scale );
    hashkeys[1] = BILINEAR_NHWC_BOUND_HASH_KEY( in_dtype, out_dtype, (vsi_size_t)up_scale );

    status = _query_kernel( ikernels[0], hashkeys[0], 0);
    CHECK_STATUS_FAIL_GOTO(status, final );
    status = _query_kernel( kernel, hashkeys[1], 1);
    CHECK_STATUS_FAIL_GOTO(status, final );

    shapes[0][0] = depth * inputs[0]->attr.size[1];
    shapes[0][1] = inputs[0]->attr.size[2];

@@ -491,26 +680,41 @@ static vsi_nn_kernel_node_t _setup
    reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
        outputs[0], shapes[1], rank );

    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM,
                reshape_tensors, input_num, &reshape_tensors[1], output_num );
            node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners );
            node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers );
    // resize bilinear
    node = vsi_nn_kernel_create_node( graph, ikernels[0] );
    VSI_ASSERT( node != NULL );
    vsi_nn_kernel_node_pack_io( node0_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM,
        reshape_tensors, input_num, &reshape_tensors[1], output_num );
    node0_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners );
    node0_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers );
    status = vsi_nn_kernel_node_pass_param( node, node0_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM );
    vsi_nn_kernel_scalar_release( &node0_params[SCALAR_ALIGN_CORNERS] );
    vsi_nn_kernel_scalar_release( &node0_params[SCALAR_HALF_PIXEL] );
    vsi_nn_kernel_node_release( &node );

            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] );
    // update bound for output tensor
    memcpy( &attr, &(reshape_tensors[1]->attr), sizeof(vsi_nn_tensor_attr_t) );
    attr.size[0] = 1;
    attr.size[1] = 1;
    attr.dim_num = 2;
    reshape_tensors[2] = vsi_nn_CreateTensor( graph, &attr );
    node = vsi_nn_kernel_create_node( graph, kernel );
    VSI_ASSERT( node != NULL );
    vsi_nn_kernel_node_pack_io( node1_params, _BILINEAR_NHWC_BOUND_PARAM_NUM,
        reshape_tensors, 2, &reshape_tensors[2], 1 );
    status = vsi_nn_kernel_node_pass_param( node, node1_params, _BILINEAR_NHWC_BOUND_PARAM_NUM );

final:
    for( i = 0; i < 2; i ++ )
    {
        if( ikernels[i] )
        {
            vsi_nn_kernel_release( &ikernels[i] );
        }
    }

    vsi_safe_release_tensor(reshape_tensors[0]);
    vsi_safe_release_tensor(reshape_tensors[1]);
    vsi_safe_release_tensor(reshape_tensors[2]);

    return node;
} /* _setup() */
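
My reading of the rewritten _setup, offered as a summary rather than patch text: it now chains two EVIS nodes, the packed NHWC resize kernel followed by the small bound kernel, which consumes both reshaped tensors and writes a 1x1 placeholder tensor so the boundary pass stays ordered after the main pass in the graph.

/* sketch of the wiring, names as in the patch:
 * node0 (resize_bilinear_nhwc):       reshape_tensors[0] -> reshape_tensors[1]
 * node1 (resize_bilinear_nhwc_bound): reshape_tensors[0..1] -> reshape_tensors[2] (1x1)
 */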


@@ -118,7 +118,7 @@ static vsi_status get_scatter_nd_tensor_reshape_size
    return status;
}

#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH

    newDim[0] = 0;
    for(i = 0; i < dims_num; ++i)

@@ -207,7 +207,7 @@ static vsi_status get_scatter_nd_update_tensor_reshape_size
    return status;
}

#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH

    newDim[0] = 0;
    for(i = 0; i < dims_num; ++i)


@@ -75,10 +75,24 @@ static const _kernel_map_type _select_kernel_map[] =
    PACK_KERNEL_MAP(I8, U8, U8, U8),
    PACK_KERNEL_MAP(I8, I16, I16, I16),
    PACK_KERNEL_MAP(I8, F16, F16, F16),
    PACK_KERNEL_MAP(I8, F16, U8, F16),
    PACK_KERNEL_MAP(I8, U8, F16, F16),
    PACK_KERNEL_MAP(I8, F16, I8, F16),
    PACK_KERNEL_MAP(I8, I8, F16, F16),
    PACK_KERNEL_MAP(I8, F16, I16, F16),
    PACK_KERNEL_MAP(I8, I16, F16, F16),
    PACK_KERNEL_MAP(I8, F16, F16, U8),
    PACK_KERNEL_MAP_2D(I8, I8, I8, I8),
    PACK_KERNEL_MAP_2D(I8, U8, U8, U8),
    PACK_KERNEL_MAP_2D(I8, I16, I16, I16),
    PACK_KERNEL_MAP_2D(I8, F16, F16, F16),
    PACK_KERNEL_MAP_2D(I8, U8, F16, F16),
    PACK_KERNEL_MAP_2D(I8, F16, U8, F16),
    PACK_KERNEL_MAP_2D(I8, F16, I8, F16),
    PACK_KERNEL_MAP_2D(I8, I8, F16, F16),
    PACK_KERNEL_MAP_2D(I8, F16, I16, F16),
    PACK_KERNEL_MAP_2D(I8, I16, F16, F16),
    PACK_KERNEL_MAP_2D(I8, F16, F16, U8),
};

/*

@@ -142,7 +156,7 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
    output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output);
    CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );

    if( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
    if ( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        input0_fl = input0_attr->dfp.fl;
        if (input0_fl > 0)

@@ -154,13 +168,13 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
            input0Scale = (float)((int64_t)1 << -input0_fl);
        }
    }
    else if( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    else if ( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        input0Scale = input0_attr->asymm.scale;
        input0Zp = input0_attr->asymm.zero_point;
    }

    if( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
    if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        input1_fl = input1_attr->dfp.fl;
        if (input1_fl > 0)

@@ -172,13 +186,13 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
            input1Scale = (float)((int64_t)1 << -input1_fl);
        }
    }
    else if( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    else if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        input1Scale = input1_attr->asymm.scale;
        input1Zp = input1_attr->asymm.zero_point;
    }

    if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
    if ( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        output_fl = output_attr->dfp.fl;
        if (output_fl > 0)

@@ -190,7 +204,7 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
            outputScale = (float)((int64_t)1 << -output_fl);
        }
    }
    else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    else if ( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        outputScale = output_attr->asymm.scale;
        outputZP = output_attr->asymm.zero_point;

@@ -203,13 +217,10 @@ DEF_KERNEL_INITIALIZER(_select_initializer)

    output_shape = output_attr->shape;
    gpu_param.dim = output_shape->size < 3 ? 2 : 3;
    gpu_param.global_offset[0] = 0;
    gpu_param.global_offset[1] = 0;
    gpu_param.global_offset[2] = 0;

    gpu_param.global_scale[0] = 8;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;

    gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1)
        / gpu_param.global_scale[0], 4);
    gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1)

@@ -218,83 +229,8 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
        (output_shape->data[2] + gpu_param.global_scale[2] - 1)
        / gpu_param.global_scale[2] : 1;

    switch( pack_key )
    {
    case _PACK_SELECT_KEY( I8, I8, I8 ):
    case _PACK_SELECT_KEY( I16, I16, I16 ):
        {
            gpu_dp_inst_t uniConvConditiontoDst_2x8 = {{
                0x11111111, // TCfg
                0x00000000, // ASelt
                0x03020100, 0x07060504, // ABin
                0x22222222, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000600, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000001, 0x00000001, 0x00000001,
                0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniConvIntIn0toDst_2x8 = {{
                0x11111111, // TCfg
                0x00000000, // ASelt
                0x03020100, 0x07060504, // ABin
                0x22222222, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000600, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000001, 0x00000001, 0x00000001,
                0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniConvIntIn1toDst_2x8 = {{
                0x11111111, // TCfg
                0x00000000, // ASelt
                0x03020100, 0x07060504, // ABin
                0x22222222, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000600, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000001, 0x00000001, 0x00000001,
                0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
            }, GPU_DP_TYPE_16 };

            if (input0_fl >= output_fl)
            {
                uint8_t postshift = (uint8_t)gpu_min(input0_fl - output_fl, MAX_POST_SHIFT_BITS);
                uniConvIntIn0toDst_2x8.data[7] = uniConvIntIn0toDst_2x8.data[7] | (postshift & 0x1F);
            }
            else
            {
                uint32_t idx = 0;
                uint32_t multiplier = gpu_min((int64_t)1 << (output_fl - input0_fl), MAX_MULTIPLIER_NUM);
                for (idx = 8; idx < 16; idx ++)
                {
                    uniConvIntIn0toDst_2x8.data[idx] = (uint32_t)(multiplier << 16) | (multiplier & 0xffff);
                }
            }

            if (input1_fl >= output_fl)
            {
                uint8_t postshift = (uint8_t)gpu_min(input1_fl - output_fl, MAX_POST_SHIFT_BITS);
                uniConvIntIn1toDst_2x8.data[7] = uniConvIntIn1toDst_2x8.data[7] | (postshift & 0x1F);
            }
            else
            {
                uint32_t idx = 0;
                uint32_t multiplier = gpu_min((int64_t)1 << (output_fl - input1_fl), MAX_MULTIPLIER_NUM);
                for (idx = 8; idx < 16; idx ++)
                {
                    uniConvIntIn1toDst_2x8.data[idx] = (uint32_t)(multiplier << 16) | (multiplier & 0xffff);
                }
            }

            status = vsi_nn_kernel_gpu_add_param( node,
                "uniConvIntIn0toDst_2x8", &uniConvIntIn0toDst_2x8 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniConvIntIn1toDst_2x8", &uniConvIntIn1toDst_2x8 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniConvConditiontoDst_2x8", &uniConvConditiontoDst_2x8 );
            CHECK_STATUS_FAIL_GOTO(status, final );
        }
        break;
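
The DFP branch above rescales by shifting or widening; instantiating it with illustrative fractional lengths (example values, not from the patch):

/* hypothetical: input0_fl = 8, output_fl = 4 */
uint8_t  postshift  = 8 - 4;         /* result >>= 4, i.e. divide by 16 */
/* hypothetical: input0_fl = 4, output_fl = 8 */
uint32_t multiplier = 1u << (8 - 4); /* result *= 16 via the 16-bit constants */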
    case _PACK_SELECT_KEY( F16, F16, F16 ):
        {
            gpu_dp_inst_t uniConvConditiontoDst_2x8 = {{

@@ -312,61 +248,66 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
            CHECK_STATUS_FAIL_GOTO(status, final );
        }
        break;
    case _PACK_SELECT_KEY( I8, I8, I8 ):
    case _PACK_SELECT_KEY( I16, I16, I16 ):
    case _PACK_SELECT_KEY( U8, U8, U8 ):
    case _PACK_SELECT_KEY( I8, F16, F16 ):
    case _PACK_SELECT_KEY( U8, F16, F16 ):
    case _PACK_SELECT_KEY( I16, F16, F16 ):
    case _PACK_SELECT_KEY( F16, U8, F16 ):
    case _PACK_SELECT_KEY( F16, I8, F16 ):
    case _PACK_SELECT_KEY( F16, I16, F16 ):
    case _PACK_SELECT_KEY( F16, F16, U8 ):
        {
            uint32_t idx = 0;
            gpu_dp_inst_t uniU8SubZP_MulM_PStoF16In0_2x8 = {{
                0x99999999, // TCfg
                0x44444444, // ASelt
            uint32_t multAndoutZP0[2] = {0};
            uint32_t multAndoutZP1[2] = {0};
            gpu_dp_inst_t uniConvConditiontoDst_2x8 = {{
                0x11111111, // TCfg
                0x00000000, // ASelt
                0x03020100, 0x07060504, // ABin
                0xaaaaaaaa, // BSelt
                0x22222222, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000600, // AccumType, ConstantType, and PostShift
                0x00010001, 0x00010001, 0x00010001, 0x00010001,
                0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
                0x00000001, 0x00000001, 0x00000001, 0x00000001,
                0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniU8SubZP_MulM_PStoF16In1_2x8 = {{
                0x99999999, // TCfg
            gpu_dp_inst_t uniU8MulAndPostShift0_Lo_2x8 = {{
                0xdddddddd, // TCfg
                0x44444444, // ASelt
                0x03020100, 0x07060504, // ABin
                0xaaaaaaaa, // BSelt
                0x13121110, 0x17161514, // ABin
                0x11111111, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000600, // AccumType, ConstantType, and PostShift
                0x00010001, 0x00010001, 0x00010001, 0x00010001,
                0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
                0x00002600, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniU8AddZP_2x8 = {{
                0x55555555, // TCfg
            gpu_dp_inst_t uniU8MulAndPostShift1_Lo_2x8 = {{
                0xdddddddd, // TCfg
                0x44444444, // ASelt
                0x03020100, 0x07060504, // ABin
                0xaaaaaaaa, // BSelt
                0x13121110, 0x17161514, // ABin
                0x11111111, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00010001, 0x00010001, 0x00010001, 0x00010001,
                0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
                0x00002600, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };

            uniU8SubZP_MulM_PStoF16In0_2x8.data[7] |= (in0_postShift & 0x1F);
            uniU8SubZP_MulM_PStoF16In1_2x8.data[7] |= (in1_postShift & 0x1F);
            multAndoutZP0[0] = (uint32_t)(in0_M0);
            multAndoutZP0[1] = (uint32_t)((outputZP << in0_postShift) - input0Zp * in0_M0);
            multAndoutZP1[0] = (uint32_t)(in1_M0);
            multAndoutZP1[1] = (uint32_t)((outputZP << in1_postShift) - input1Zp * in1_M0);

            for (idx = 8; idx < 16; idx ++)
            {
                uniU8SubZP_MulM_PStoF16In0_2x8.data[idx] = (vx_uint32)(in0_M0 << 16) | in0_M0;
                uniU8SubZP_MulM_PStoF16In1_2x8.data[idx] = (vx_uint32)(in1_M0 << 16) | in1_M0;
            }
            gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift0_Lo_2x8, in0_postShift );
            gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift1_Lo_2x8, in1_postShift );

            status = vsi_nn_kernel_gpu_add_param( node,
                "uniU8SubZP_MulM_PStoF16In0_2x8", &uniU8SubZP_MulM_PStoF16In0_2x8 );
            status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 );
            status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniU8SubZP_MulM_PStoF16In1_2x8", &uniU8SubZP_MulM_PStoF16In1_2x8 );
                "uniConvConditiontoDst_2x8", &uniConvConditiontoDst_2x8 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniU8AddZP_2x8", &uniU8AddZP_2x8 );
                "uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift0_Lo_2x8 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "input0Zp", &input0Zp );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "input1Zp", &input1Zp );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "outputZP", &outputZP );
                "uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift1_Lo_2x8 );
            CHECK_STATUS_FAIL_GOTO(status, final );
        }
        break;
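
The multAndoutZP pair above folds the asymmetric requantization into one fused multiply-add; the algebra behind the two entries (a derivation from the assignments above, not patch text):

/* out = (((in - inZp) * M0) >> ps) + outZp
 *     = ((in * M0) + ((outZp << ps) - inZp * M0)) >> ps
 * hence multAndoutZP[0] = M0 and multAndoutZP[1] = (outZp << ps) - inZp * M0 */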

@@ -501,4 +442,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_EVIS( select, _setup )


@@ -39,7 +39,6 @@

__BEGIN_DECLS

#define _SLICE_KERNEL_SOURCE "slice"
#define _SLICE_KERNEL_NAME CVIVANTE_NAMESPACE("evis.slice")

// Add kernel hashtable here

@@ -50,30 +49,30 @@ __BEGIN_DECLS
#define SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE , _IMAGE_2D, _SAMEFL) \
    (( IN1_DTYPE << 18 ) | ( IN0_DTYPE << 10 ) | ( OUT_DTYPE << 2 ) | (_IMAGE_2D << 1) | (_SAMEFL))

#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \
#define PACK_KERNEL_MAP_3D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
    { SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 0 ), \
      SLICE_SH_KERNEL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE }
      SLICE_SH_KERNEL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" }

#define SLICE_SH_KERNEL_2D_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
    CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_2D")

#define PACK_KERNEL_MAP_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \
#define PACK_KERNEL_MAP_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
    { SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 0 ), \
      SLICE_SH_KERNEL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE }
      SLICE_SH_KERNEL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" }

#define SLICE_SH_KERNEL_SAMEFL_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
    CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_SAMEFL")

#define PACK_KERNEL_MAP_SAMEFL( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \
#define PACK_KERNEL_MAP_SAMEFL( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
    { SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 1 ), \
      SLICE_SH_KERNEL_SAMEFL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE }
      SLICE_SH_KERNEL_SAMEFL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" }

#define SLICE_SH_KERNEL_SAMEFL_2D_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
    CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_SAMEFL_2D")

#define PACK_KERNEL_MAP_SAMEFL_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \
#define PACK_KERNEL_MAP_SAMEFL_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
    { SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 1 ), \
      SLICE_SH_KERNEL_SAMEFL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE }
      SLICE_SH_KERNEL_SAMEFL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" }

typedef struct
{

@@ -85,21 +84,33 @@ __BEGIN_DECLS
static const _kernel_map_type _slice_kernel_map[] =
{
    // Register kernel here
    PACK_KERNEL_MAP( F16, I32, F16, _SLICE_KERNEL_SOURCE ),
    PACK_KERNEL_MAP( I16, I32, I16, _SLICE_KERNEL_SOURCE ),
    PACK_KERNEL_MAP( U8, I32, U8, _SLICE_KERNEL_SOURCE ),
    PACK_KERNEL_MAP( I8, I32, I8, _SLICE_KERNEL_SOURCE ),
    PACK_KERNEL_MAP_3D( F16, I32, F16 ),
    PACK_KERNEL_MAP_3D( F16, I32, I8 ),
    PACK_KERNEL_MAP_3D( F16, I32, U8 ),
    PACK_KERNEL_MAP_3D( F16, I32, I16 ),
    PACK_KERNEL_MAP_3D( I8, I32, F16 ),
    PACK_KERNEL_MAP_3D( U8, I32, F16 ),
    PACK_KERNEL_MAP_3D( I16, I32, F16 ),
    PACK_KERNEL_MAP_3D( I16, I32, I16 ),
    PACK_KERNEL_MAP_3D( U8, I32, U8 ),
    PACK_KERNEL_MAP_3D( I8, I32, I8 ),

    PACK_KERNEL_MAP_2D( F16, I32, F16, _SLICE_KERNEL_SOURCE ),
    PACK_KERNEL_MAP_2D( I16, I32, I16, _SLICE_KERNEL_SOURCE ),
    PACK_KERNEL_MAP_2D( U8, I32, U8, _SLICE_KERNEL_SOURCE ),
    PACK_KERNEL_MAP_2D( I8, I32, I8, _SLICE_KERNEL_SOURCE ),
    PACK_KERNEL_MAP_2D( F16, I32, F16 ),
    PACK_KERNEL_MAP_2D( I16, I32, I16 ),
    PACK_KERNEL_MAP_2D( F16, I32, I8 ),
    PACK_KERNEL_MAP_2D( F16, I32, U8 ),
    PACK_KERNEL_MAP_2D( F16, I32, I16 ),
    PACK_KERNEL_MAP_2D( I8, I32, F16 ),
    PACK_KERNEL_MAP_2D( U8, I32, F16 ),
    PACK_KERNEL_MAP_2D( I16, I32, F16 ),
    PACK_KERNEL_MAP_2D( U8, I32, U8 ),
    PACK_KERNEL_MAP_2D( I8, I32, I8 ),

    PACK_KERNEL_MAP_SAMEFL( I16, I32, I16, _SLICE_KERNEL_SOURCE ),
    PACK_KERNEL_MAP_SAMEFL( U8, I32, U8, _SLICE_KERNEL_SOURCE ),
    PACK_KERNEL_MAP_SAMEFL( I16, I32, I16 ),
    PACK_KERNEL_MAP_SAMEFL( U8, I32, U8 ),

    PACK_KERNEL_MAP_SAMEFL_2D( I16, I32, I16, _SLICE_KERNEL_SOURCE ),
    PACK_KERNEL_MAP_SAMEFL_2D( U8, I32, U8, _SLICE_KERNEL_SOURCE ),
    PACK_KERNEL_MAP_SAMEFL_2D( I16, I32, I16 ),
    PACK_KERNEL_MAP_SAMEFL_2D( U8, I32, U8 ),
};

#define _INPUT_NUM (2)

@@ -201,18 +212,16 @@ DEF_KERNEL_INITIALIZER(_slice_initializer)
        scaleOut = output_attr->asymm.scale;
    }

    if ((F16 == input_dtype)
        || (I16 == input_dtype)
        || (BF16 == input_dtype)
        )
    if ((I8 == input_dtype && input_dtype == output_dtype ) ||
        (U8 == input_dtype && input_dtype == output_dtype ) )
    {
        gpu_param.global_scale[0] = 8;
        gpu_param.global_scale[0] = 16;
        gpu_param.global_scale[1] = 1;
        gpu_param.global_scale[2] = 1;
    }
    else
    {
        gpu_param.global_scale[0] = 16;
        gpu_param.global_scale[0] = 8;
        gpu_param.global_scale[1] = 1;
        gpu_param.global_scale[2] = 1;
    }


@@ -1416,31 +1416,42 @@ vsi_nn_kernel_tensor_attr_t * vsi_nn_kernel_tensor_attr_create
    switch( attr->quant )
    {
    case VSI_NN_KERNEL_QUANT_DFP:
    {
        {
            int8_t fl = 0;
            status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_FIXED_POINT_POS,
                &fl, sizeof(int8_t));
            CHECK_STATUS( status );
            attr->dfp.fl = (int32_t)fl;
            if (fl >= 0) {
                attr->scale = 1.0f / ((float)((int64_t)1 << fl));
            } else {
                attr->scale = (float)((int64_t)1 << -fl);
            }
            break;
        } break;
    case VSI_NN_KERNEL_QUANT_ASYMM:
    {
        status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_ZERO_POINT,
            &(attr->asymm.zero_point), sizeof(int32_t));
        CHECK_STATUS( status );
        status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_SCALE,
            &(attr->asymm.scale), sizeof(float));
        CHECK_STATUS( status );
        {
            status = vxQueryTensor((vx_tensor)tensor,
                VX_TENSOR_ZERO_POINT,
                &(attr->asymm.zero_point),
                sizeof(int32_t));
            CHECK_STATUS(status);
            status = vxQueryTensor((vx_tensor)tensor,
                VX_TENSOR_SCALE,
                &(attr->asymm.scale),
                sizeof(float));
            CHECK_STATUS(status);
            // Reset scale to 1e-8
            if( (attr->asymm.scale - 0.f) < 1e-8 )
            {
            if ((attr->asymm.scale - 0.f) < 1e-8)
            {
                attr->asymm.scale = (float)1e-8;
                attr->asymm.zero_point = 0;
            }
            }
            break;
            attr->scale = attr->asymm.scale;
            attr->zero_point = attr->asymm.zero_point;
        }
        break;
    default:
        attr->scale = 1.0f;
        break;
    }
    return attr;
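
A numeric check of the DFP case above: the fixed-point position fl maps to scale = 2^-fl, with a negative fl widening instead (values below are hand-computed illustrations):

/* fl ==  7 -> attr->scale == 1.0f / 128 == 0.0078125f */
/* fl == -2 -> attr->scale == (float)(1 << 2) == 4.0f  */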

@@ -189,6 +189,16 @@ static float celu_eval(float x, vsi_nn_kernel_lut_params *lut_param)
    return positive + negative;
}

static float rcp_eval(float x)
{
    return 1.0f / x;
}

static float softsign_eval(float x)
{
    return x / (1 + vsi_abs(x));
}
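
The new softsign curve is bounded in (-1, 1); two hand-computed sample points for reference:

/* softsign(1.0f)  ==  1.0f / (1 + 1.0f) ==  0.5f  */
/* softsign(-3.0f) == -3.0f / (1 + 3.0f) == -0.75f */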
|
||||
|
||||
static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *lut_param)
|
||||
{
|
||||
float result = 0;
|
||||
|
|
@ -245,6 +255,12 @@ static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *
|
|||
case VSI_NN_KERNEL_LUT_CELU:
|
||||
result = celu_eval(data, lut_param);
|
||||
break;
|
||||
case VSI_NN_KERNEL_LUT_RCP:
|
||||
result = rcp_eval(data);
|
||||
break;
|
||||
case VSI_NN_KERNEL_LUT_SOFTSIGN:
|
||||
result = softsign_eval(data);
|
||||
break;
|
||||
default:
|
||||
VSILOGE( "unsupported activation function:%d", lut_param->act_type );
|
||||
break;
|
||||
|
|
|
|||
@@ -133,5 +133,9 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(gelu)
REGISTER_VX_FIRST_KERNEL_SELECTOR(hard_gelu)
REGISTER_VX_FIRST_KERNEL_SELECTOR(matrixmul)
REGISTER_VX_FIRST_KERNEL_SELECTOR(celu)
REGISTER_VX_FIRST_KERNEL_SELECTOR(rcp)
REGISTER_VX_FIRST_KERNEL_SELECTOR(softsign)
REGISTER_VX_FIRST_KERNEL_SELECTOR(resize_bilinear)
REGISTER_VX_FIRST_KERNEL_SELECTOR(resize_nearest)

__END_DECLS
@@ -146,6 +146,8 @@ REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( erf, VSI_NN_KERNEL_LUT_ERF )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( relu_keras, VSI_NN_KERNEL_LUT_RELU_KERAS )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( clip, VSI_NN_KERNEL_LUT_CLIP )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( celu, VSI_NN_KERNEL_LUT_CELU )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( rcp, VSI_NN_KERNEL_LUT_RCP )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( softsign, VSI_NN_KERNEL_LUT_SOFTSIGN )

#undef REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL
@@ -0,0 +1,152 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"

#define REGISTER_SOFTMAX_OPENVX_KERNEL( kernel_name ) \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t              * graph, \
        vsi_nn_tensor_t            ** inputs, \
        size_t                        input_num, \
        vsi_nn_tensor_t            ** outputs, \
        size_t                        output_num, \
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t             * kernel \
        ); \
    REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
    static vsi_nn_kernel_node_t _##kernel_name##setup \
        ( \
        vsi_nn_graph_t              * graph, \
        vsi_nn_tensor_t            ** inputs, \
        size_t                        input_num, \
        vsi_nn_tensor_t            ** outputs, \
        size_t                        output_num, \
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t             * kernel \
        )

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t              * graph,
    vsi_nn_tensor_t            ** inputs,
    size_t                        input_num,
    vsi_nn_tensor_t            ** outputs,
    size_t                        output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t             * kernel
    )
{
    vx_node node = NULL;
    int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
    int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
    int32_t type = vsi_nn_kernel_param_get_int32( params, "type" );

#ifdef VX_SCALE_EXTRA_PARAMETER_SUPPORT
    vx_nn_scale_params_ext_t param;
    param.align_corners = align_corners;
    param.half_pixel_centers = half_pixel_centers;
    switch (type)
    {
        case VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR:
            param.base.type = VX_INTERPOLATION_NEAREST_NEIGHBOR;
            break;
        case VSI_NN_INTERPOLATION_BILINEAR:
            param.base.type = VX_INTERPOLATION_BILINEAR;
            break;
        case VSI_NN_INTERPOLATION_AREA:
            param.base.type = VX_INTERPOLATION_AREA;
            break;
        default:
            param.base.type = VX_INTERPOLATION_NEAREST_NEIGHBOR;
    }
    node = vxTensorScaleNode( graph->g,
                              inputs[0]->t,
                              (vx_nn_scale_params)(&param),
                              sizeof(vx_nn_scale_params_ext_t),
                              outputs[0]->t );
#else
    vx_nn_scale_params_t param;
    if (align_corners || half_pixel_centers)
    {
        return NULL;
    }
    switch (type)
    {
        case VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR:
            param.type = VX_INTERPOLATION_NEAREST_NEIGHBOR;
            break;
        case VSI_NN_INTERPOLATION_BILINEAR:
            param.type = VX_INTERPOLATION_BILINEAR;
            break;
        case VSI_NN_INTERPOLATION_AREA:
            param.type = VX_INTERPOLATION_AREA;
            break;
        default:
            param.type = VX_INTERPOLATION_NEAREST_NEIGHBOR;
            break;
    }

    node = vxTensorScaleNode( graph->g,
                              inputs[0]->t,
                              &param,
                              sizeof(param),
                              outputs[0]->t );
#endif
    if ( NULL == node )
    {
        VSILOGI("Call vxTensorScaleNode fail.(resize)");
    }

    return (vsi_nn_kernel_node_t)node;
} /* _setup() */
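For reference, align_corners and half_pixel_centers select among the standard resize coordinate mappings. A minimal sketch of the usual TF/ONNX conventions, not code from this file:

/* Map an output index to a fractional input coordinate for a 1-D resize. */
float map_coord(int out_idx, int in_size, int out_size,
                int align_corners, int half_pixel_centers)
{
    if (align_corners && out_size > 1) {
        /* endpoints of the input and output grids coincide */
        return (float)out_idx * (float)(in_size - 1) / (float)(out_size - 1);
    }
    float scale = (float)in_size / (float)out_size;
    if (half_pixel_centers) {
        /* sample at pixel centers: x_in = (x_out + 0.5) * scale - 0.5 */
        return ((float)out_idx + 0.5f) * scale - 0.5f;
    }
    return (float)out_idx * scale;
}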

#define REGISTER_RESIZE_OPENVX_KERNEL(KERNEL_NAME) \
    static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \
        ( \
        vsi_nn_graph_t              * graph, \
        vsi_nn_tensor_t            ** inputs, \
        size_t                        input_num, \
        vsi_nn_tensor_t            ** outputs, \
        size_t                        output_num, \
        const vsi_nn_kernel_param_t * params, \
        vsi_nn_kernel_t             * kernel \
        ) \
    { \
        return _setup(graph, inputs, input_num, outputs, output_num, \
                params, kernel); \
    } \
    REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup )

REGISTER_RESIZE_OPENVX_KERNEL( resize_nearest )
REGISTER_RESIZE_OPENVX_KERNEL( resize_bilinear )

#undef REGISTER_RESIZE_OPENVX_KERNEL
@@ -32,7 +32,6 @@
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_lut.h"

static vsi_nn_kernel_node_t _setup
    (
@@ -46,57 +45,7 @@ static vsi_nn_kernel_node_t _setup
    )
{
    vx_node node = NULL;
-#ifdef VX_USER_LOOKUP_TABLE_SUPPORT
-   vx_lut lut1 = NULL;
-   vx_lut lut2 = NULL;
-   vsi_status status = VSI_FAILURE;
-   vsi_nn_kernel_lut_params lut_param;
-
-   if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
-        outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 )
-   {
-       return NULL;
-   }
-
-   lut_param.act_type = VSI_NN_KERNEL_LUT_SQUARE;
-
-   lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_KERNEL_LUT_MAX_SIZE);
-   lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_KERNEL_LUT_MAX_SIZE);
-   if( NULL == lut1 || NULL == lut2 )
-   {
-       VSILOGE("create lut object fail.");
-       goto final;
-   }
-
-   status = vsi_nn_kernel_lut(lut1, lut2, &lut_param);
-   CHECK_STATUS_FAIL_GOTO(status, final);
-
-   node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t);
-   if ( NULL == node )
-   {
-       node = vxActivationLayer(
-           graph->g,
-           inputs[0]->t,
-           VX_NN_ACTIVATION_SQUARE,
-           0,
-           0,
-           outputs[0]->t
-           );
-   }
-
-final:
-   if (lut1)
-   {
-       vxReleaseLUT(&lut1);
-       lut1 = NULL;
-   }
-   if (lut2)
-   {
-       vxReleaseLUT(&lut2);
-       lut2 = NULL;
-   }
-   return (vsi_nn_kernel_node_t)node;
-#else
    node = vxActivationLayer(
        graph->g,
        inputs[0]->t,

@@ -107,7 +56,6 @@ final:
        );

    return (vsi_nn_kernel_node_t)node;
-#endif
} /* _setup() */

#define REGISTER_SQUARE_OPENVX_KERNEL(KERNEL_NAME) \
@@ -0,0 +1,478 @@
__kernel void cumsum_F32toF32_axis2(
    __read_only  image2d_array_t input,
    __write_only image2d_array_t output,
    int   axis,
    int   exclusive,
    int   rev,
    int   width,
    int   height,
    int   channel,
    int   input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
    int4 coord_out = coord;

    float4 sum = (float4)(0);

    if(exclusive && rev)
    {
        coord_out.z = channel - 1;
        write_imagef(output, coord_out, sum);

        for(coord.z = channel - 1; coord.z > 0; coord.z--)
        {
            float4 data = read_imagef(input, coord);
            coord_out.z--;
            sum += data;

            write_imagef(output, coord_out, sum);
        }
    }
    else if(exclusive)
    {
        coord_out.z = 0;
        write_imagef(output, coord_out, sum);
        for(coord.z = 0; coord.z < channel - 1; coord.z++)
        {
            float4 data = read_imagef(input, coord);
            coord_out.z++;
            sum += data;

            write_imagef(output, coord_out, sum);
        }
    }
    else if(rev)
    {
        for(coord.z = channel - 1; coord.z >= 0; coord.z--)
        {
            float4 data = read_imagef(input, coord);
            sum += data;

            write_imagef(output, coord, sum);
        }
    }
    else
    {
        for(coord.z = 0; coord.z < channel; coord.z++)
        {
            float4 data = read_imagef(input, coord);
            sum += data;

            write_imagef(output, coord, sum);
        }
    }
}

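Before the quantized variants, it helps to pin down what the four exclusive/rev branches compute. For input [1, 2, 3, 4]: inclusive gives [1, 3, 6, 10], exclusive [0, 1, 3, 6], reverse [10, 9, 7, 4], and exclusive+reverse [9, 7, 4, 0]. A plain-C reference of the same logic over one line of n elements:

/* Reference scan matching the kernels' four branches. */
void cumsum_ref(const float *in, float *out, int n, int exclusive, int rev)
{
    int step = rev ? -1 : 1;
    int i = rev ? n - 1 : 0;
    float sum = 0.0f;
    for (int k = 0; k < n; k++, i += step) {
        if (exclusive) {
            out[i] = sum;   /* write the running sum before adding in[i] */
            sum += in[i];
        } else {
            sum += in[i];
            out[i] = sum;
        }
    }
}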
__kernel void cumsum_U8toU8_axis2(
    __read_only  image2d_array_t input,
    __write_only image2d_array_t output,
    int   axis,
    int   exclusive,
    int   rev,
    int   width,
    int   height,
    int   channel,
    int   input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
    int4 coord_out = coord;

    uint4 sum = (uint4)(0);
    uint4 dst = (uint4)(0);

    float cnt = 0.0f;

    if(exclusive && rev)
    {
        coord_out.z = channel - 1;
        write_imageui(output, coord_out, dst);
        for(coord.z = channel - 1; coord.z > 0; coord.z--)
        {
            uint4 data = read_imageui(input, coord);
            coord_out.z--;
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord_out, dst);
        }
    }
    else if(exclusive)
    {
        coord_out.z = 0;
        write_imageui(output, coord_out, dst);
        for(coord.z = 0; coord.z < channel - 1; coord.z++)
        {
            uint4 data = read_imageui(input, coord);
            coord_out.z++;
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord_out, dst);
        }
    }
    else if(rev)
    {
        for(coord.z = channel - 1; coord.z >= 0; coord.z--)
        {
            uint4 data = read_imageui(input, coord);
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord, dst);
        }
    }
    else
    {
        for(coord.z = 0; coord.z < channel; coord.z++)
        {
            uint4 data = read_imageui(input, coord);
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord, dst);
        }
    }
}

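The U8 variants carry the zero-point correction in cnt: with input scale/zero-point (s_in, zp_in) and output (s_out, zp_out), the real running sum over cnt elements is sum_q*s_in - cnt*zp_in*s_in, so the host presumably passes in_out_scale = s_in/s_out and in_out_zp_scale = -zp_in*s_in/s_out, making tmpSum exactly real_sum/s_out + zp_out. A C sketch of that inferred setup (names hypothetical):

#include <math.h>

/* Inferred host-side scaling for the U8 cumsum kernels; saturation to
 * [0, 255] is omitted for brevity. */
unsigned char requant_cumsum(unsigned int sum_q, float cnt,
                             float s_in, int zp_in,
                             float s_out, int zp_out)
{
    float in_out_scale    = s_in / s_out;                 /* kernel arg */
    float in_out_zp_scale = -(float)zp_in * in_out_scale; /* kernel arg */
    float output_zp       = (float)zp_out;                /* kernel arg */
    float tmp = (float)sum_q * in_out_scale + cnt * in_out_zp_scale + output_zp;
    return (unsigned char)lrintf(tmp); /* ~ convert_int_rte rounding */
}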
__kernel void cumsum_F32toF32_axis1(
    __read_only  image2d_array_t input,
    __write_only image2d_array_t output,
    int   axis,
    int   exclusive,
    int   rev,
    int   width,
    int   height,
    int   channel,
    int   input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
    int4 coord_out = coord;

    float4 sum = (float4)(0);

    if(exclusive && rev)
    {
        coord_out.y = height - 1;
        write_imagef(output, coord_out, sum);
        for(coord.y = height - 1; coord.y > 0; coord.y--)
        {
            float4 data = read_imagef(input, coord);
            coord_out.y--;
            sum += data;

            write_imagef(output, coord_out, sum);
        }
    }
    else if(exclusive)
    {
        coord_out.y = 0;
        write_imagef(output, coord_out, sum);
        for(coord.y = 0; coord.y < height - 1; coord.y++)
        {
            float4 data = read_imagef(input, coord);
            coord_out.y++;
            sum += data;

            write_imagef(output, coord_out, sum);
        }
    }
    else if(rev)
    {
        for(coord.y = height - 1; coord.y >= 0; coord.y--)
        {
            float4 data = read_imagef(input, coord);
            sum += data;

            write_imagef(output, coord, sum);
        }
    }
    else
    {
        for(coord.y = 0; coord.y < height; coord.y++)
        {
            float4 data = read_imagef(input, coord);
            sum += data;

            write_imagef(output, coord, sum);
        }
    }
}

__kernel void cumsum_U8toU8_axis1(
    __read_only  image2d_array_t input,
    __write_only image2d_array_t output,
    int   axis,
    int   exclusive,
    int   rev,
    int   width,
    int   height,
    int   channel,
    int   input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
    int4 coord_out = coord;

    uint4 sum = (uint4)(0);
    uint4 dst = (uint4)(0);

    float cnt = 0;

    if(exclusive && rev)
    {
        coord_out.y = height - 1;
        write_imageui(output, coord_out, dst);

        for(coord.y = height - 1; coord.y > 0; coord.y--)
        {
            uint4 data = read_imageui(input, coord);
            cnt += 1.0f;
            coord_out.y--;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord_out, dst);
        }
    }
    else if(exclusive)
    {
        coord_out.y = 0;
        write_imageui(output, coord_out, dst);
        for(coord.y = 0; coord.y < height - 1; coord.y++)
        {
            uint4 data = read_imageui(input, coord);
            cnt += 1.0f;
            coord_out.y++;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord_out, dst);
        }
    }
    else if(rev)
    {
        for(coord.y = height - 1; coord.y >= 0; coord.y--)
        {
            uint4 data = read_imageui(input, coord);
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord, dst);
        }
    }
    else
    {
        for(coord.y = 0; coord.y < height; coord.y++)
        {
            uint4 data = read_imageui(input, coord);
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord, dst);
        }
    }
}

__kernel void cumsum_F32toF32_axis0(
    __read_only  image2d_array_t input,
    __write_only image2d_array_t output,
    int   axis,
    int   exclusive,
    int   rev,
    int   width,
    int   height,
    int   channel,
    int   input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
    int4 coord_out = coord;

    float4 sum = (float4)(0);

    if(exclusive && rev)
    {
        coord_out.x = width - 1;
        write_imagef(output, coord_out, sum);
        for(coord.x = width - 1; coord.x > 0; coord.x--)
        {
            float4 data = read_imagef(input, coord);
            coord_out.x--;
            sum += data;

            write_imagef(output, coord_out, sum);
        }
    }
    else if(exclusive)
    {
        coord_out.x = 0;
        write_imagef(output, coord_out, sum);
        for(coord.x = 0; coord.x < width - 1; coord.x++)
        {
            float4 data = read_imagef(input, coord);
            coord_out.x++;
            sum += data;

            write_imagef(output, coord_out, sum);
        }
    }
    else if(rev)
    {
        for(coord.x = width - 1; coord.x >= 0; coord.x--)
        {
            float4 data = read_imagef(input, coord);
            sum += data;

            write_imagef(output, coord, sum);
        }
    }
    else
    {
        for(coord.x = 0; coord.x < width; coord.x++)
        {
            float4 data = read_imagef(input, coord);
            sum += data;

            write_imagef(output, coord, sum);
        }
    }
}

__kernel void cumsum_U8toU8_axis0(
    __read_only  image2d_array_t input,
    __write_only image2d_array_t output,
    int   axis,
    int   exclusive,
    int   rev,
    int   width,
    int   height,
    int   channel,
    int   input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
    int4 coord_out = coord;

    uint4 sum = (uint4)(0);
    uint4 dst = (uint4)(0);

    float cnt = 0;

    if(exclusive && rev)
    {
        coord_out.x = width - 1;
        write_imageui(output, coord_out, dst);
        for(coord.x = width - 1; coord.x > 0; coord.x--)
        {
            uint4 data = read_imageui(input, coord);
            coord_out.x--;
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord_out, dst);
        }
    }
    else if(exclusive)
    {
        coord_out.x = 0;
        write_imageui(output, coord_out, dst);
        for(coord.x = 0; coord.x < width - 1; coord.x++)
        {
            uint4 data = read_imageui(input, coord);
            coord_out.x++;
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord_out, dst);
        }
    }
    else if(rev)
    {
        for(coord.x = width - 1; coord.x >= 0; coord.x--)
        {
            uint4 data = read_imageui(input, coord);
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord, dst);
        }
    }
    else
    {
        for(coord.x = 0; coord.x < width; coord.x++)
        {
            uint4 data = read_imageui(input, coord);
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord, dst);
        }
    }
}

@@ -0,0 +1,314 @@

__kernel void cumsum_F32toF32_axis1_2D(
    __read_only  image2d_t input,
    __write_only image2d_t output,
    int   axis,
    int   exclusive,
    int   rev,
    int   width,
    int   height,
    int   chn,
    int   input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));

    float4 sum = (float4)(0);

    if(exclusive && rev)
    {
        coord.w = height - 1;
        write_imagef(output, coord.zw, sum);
        for(coord.y = height - 1; coord.y > 0; coord.y--)
        {
            float4 data = read_imagef(input, coord.xy);
            coord.w--;
            sum += data;

            write_imagef(output, coord.zw, sum);
        }
    }
    else if(exclusive)
    {
        write_imagef(output, coord.zw, sum);
        for(coord.y = 0; coord.y < height - 1; coord.y++)
        {
            float4 data = read_imagef(input, coord.xy);
            coord.w++;
            sum += data;

            write_imagef(output, coord.zw, sum);
        }
    }
    else if(rev)
    {
        for(coord.y = height - 1; coord.y >= 0; coord.y--)
        {
            float4 data = read_imagef(input, coord.xy);
            sum += data;

            write_imagef(output, coord.xy, sum);
        }
    }
    else
    {
        for(coord.y = 0; coord.y < height; coord.y++)
        {
            float4 data = read_imagef(input, coord.xy);
            sum += data;

            write_imagef(output, coord.xy, sum);
        }
    }
}

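Note how the 2D variants pack both coordinates into one int4, (x, y, x, y): the .xy swizzle addresses the input position being read while .zw tracks the output position, which is how the exclusive paths write one step behind the read without a second coordinate variable.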
__kernel void cumsum_U8toU8_axis1_2D(
    __read_only  image2d_t input,
    __write_only image2d_t output,
    int   axis,
    int   exclusive,
    int   rev,
    int   width,
    int   height,
    int   chn,
    int   input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));

    uint4 sum = (uint4)(0);
    uint4 dst = (uint4)(0);

    float cnt = 0;

    if(exclusive && rev)
    {
        coord.w = height - 1;
        write_imageui(output, coord.zw, sum);
        for(coord.y = height - 1; coord.y > 0; coord.y--)
        {
            uint4 data = read_imageui(input, coord.xy);
            cnt += 1.0f;
            coord.w--;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.zw, dst);
        }
    }
    else if(exclusive)
    {
        write_imageui(output, coord.zw, sum);
        for(coord.y = 0; coord.y < height - 1; coord.y++)
        {
            uint4 data = read_imageui(input, coord.xy);
            cnt += 1.0f;
            coord.w++;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.zw, dst);
        }
    }
    else if(rev)
    {
        for(coord.y = height - 1; coord.y >= 0; coord.y--)
        {
            uint4 data = read_imageui(input, coord.xy);
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.xy, dst);
        }
    }
    else
    {
        for(coord.y = 0; coord.y < height; coord.y++)
        {
            uint4 data = read_imageui(input, coord.xy);
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.xy, dst);
        }
    }
}

__kernel void cumsum_F32toF32_axis0_2D(
    __read_only  image2d_t input,
    __write_only image2d_t output,
    int   axis,
    int   exclusive,
    int   rev,
    int   width,
    int   height,
    int   chn,
    int   input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));

    float4 sum = (float4)(0);

    if(exclusive && rev)
    {
        coord.x = width - 1;
        coord.z = coord.x;
        write_imagef(output, coord.zw, sum);
        for(; coord.x > 0; coord.x--)
        {
            float4 data = read_imagef(input, coord.xy);
            coord.z--;
            sum += data;

            write_imagef(output, coord.zw, sum);
        }
    }
    else if(exclusive)
    {
        coord.z = 0;
        write_imagef(output, coord.zw, sum);
        for(coord.x = 0; coord.x < width - 1; coord.x++)
        {
            float4 data = read_imagef(input, coord.xy);
            coord.z++;
            sum += data;

            write_imagef(output, coord.zw, sum);
        }
    }
    else if(rev)
    {
        for(coord.x = width - 1; coord.x >= 0; coord.x--)
        {
            float4 data = read_imagef(input, coord.xy);
            sum += data;

            write_imagef(output, coord.xy, sum);
        }
    }
    else
    {
        for(coord.x = 0; coord.x < width; coord.x++)
        {
            float4 data = read_imagef(input, coord.xy);
            sum += data;

            write_imagef(output, coord.xy, sum);
        }
    }
}

__kernel void cumsum_U8toU8_axis0_2D(
    __read_only  image2d_t input,
    __write_only image2d_t output,
    int   axis,
    int   exclusive,
    int   rev,
    int   width,
    int   height,
    int   chn,
    int   input_zp,
    float in_out_scale,
    float in_out_zp_scale,
    float output_zp
    )
{
    int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));

    uint4 sum = (uint4)(0);
    uint4 dst = (uint4)(0);

    float cnt = 0.0f;

    if(exclusive && rev)
    {
        coord.x = width - 1;
        coord.z = coord.x;
        write_imageui(output, coord.zw, sum);
        for(; coord.x > 0; coord.x--)
        {
            uint4 data = read_imageui(input, coord.xy);
            coord.z--;
            cnt += 1.0;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.zw, dst);
        }
    }
    else if(exclusive)
    {
        coord.z = 0;
        write_imageui(output, coord.zw, sum);
        for(coord.x = 0; coord.x < width - 1; coord.x++)
        {
            uint4 data = read_imageui(input, coord.xy);
            cnt += 1.0f;
            coord.z++;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.zw, dst);
        }
    }
    else if(rev)
    {
        for(coord.x = width - 1; coord.x >= 0; coord.x--)
        {
            uint4 data = read_imageui(input, coord.xy);
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.xy, dst);
        }
    }
    else
    {
        for(coord.x = 0; coord.x < width; coord.x++)
        {
            uint4 data = read_imageui(input, coord.xy);
            cnt += 1.0f;
            sum += data;

            float tmpAlpha = cnt * in_out_zp_scale + output_zp;
            float tmpSum = sum.x * in_out_scale + tmpAlpha;

            dst.x = (uint)convert_int_rte(tmpSum);
            write_imageui(output, coord.xy, dst);
        }
    }
}