Update internal for 22Q2 release (#432)

* Update internal for 22Q2 release

update to internal commit-id: e96103281b08404cabb9b65306587627cfa3cb93

Signed-off-by: yuenan.li <yuenan.li@verisilicon.com>

* Update prebuilt for 22Q2 release

Signed-off-by: yuenan.li <yuenan.li@verisilicon.com>

Co-authored-by: yuenan.li <yuenan.li@verisilicon.com>
liyuenan 2022-07-25 09:29:22 +08:00 committed by GitHub
parent 9f331ed5ec
commit 7d88a668e3
256 changed files with 24998 additions and 22686 deletions

.gitignore (vendored)
View File

@ -336,3 +336,4 @@ ASALocalRun/
# IDE
.settings/
build/
*_build/

View File

@ -1 +1 @@
REL/6.4.10.2
6.4.11

View File

@ -499,6 +499,8 @@ enum vx_kernel_e {
VX_KERNEL_NN_DECONV_3D_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x31,
VX_KERNEL_STREAM_PROCESSOR = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x32,
VX_KERNEL_MAX_1_2, /*!< \internal Used for VX1.2 bounds checking in the conformance test. */
};

View File

@ -196,4 +196,45 @@ VX_DECONV_3D_API_SUPPORT is used to declare that vsi openvx driver can support d
#define VX_TENSOR_STRIDE_X_BITS_SUPPORT 1
#endif
/*
VX_REMOVE_RESHAPE_SUPPORT is used to declare whether the graph optimizer supports removing reshape ops; if supported, ovxlib does not need to remove reshape itself.
0: not support
1: support
*/
/*
#ifndef VX_REMOVE_RESHAPE_SUPPORT
#define VX_REMOVE_RESHAPE_SUPPORT 0
#endif
*/
/*
VX_STREAM_PROCESSOR_SUPPORT is used to declare that the VSI OpenVX driver supports the vxStreamProcessorNode API
[value]
0: not support
1: support
*/
#ifndef VX_STREAM_PROCESSOR_SUPPORT
#define VX_STREAM_PROCESSOR_SUPPORT 0
#endif
/*
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL is used to declare that a tensor can be connected to a fixed DMA channel.
[value]
0: not support
1: support
*/
#ifndef VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL
#define VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL 1
#endif
/*
VX_SCALE_EXTRA_PARAMETER_SUPPORT is used to declare that RESIZE supports the align_corners and half_pixel_centers parameters
[value]
0: not support
1: support
*/
#ifndef VX_SCALE_EXTRA_PARAMETER_SUPPORT
#define VX_SCALE_EXTRA_PARAMETER_SUPPORT 1
#endif
#endif /* __VX_KHR_COMPATIBLE_H__ */
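
These feature macros are intended to be tested at compile time by code that includes this header. A minimal, hedged sketch of how a client might guard the new stream-processor path (illustrative only, not part of this change; both helper functions are hypothetical):

#if defined(VX_STREAM_PROCESSOR_SUPPORT) && VX_STREAM_PROCESSOR_SUPPORT
    /* Driver build exposes vxStreamProcessorNode(); take the SP path. */
    node = build_stream_processor_graph(context, graph);  /* hypothetical helper */
#else
    /* Fall back to the regular NN/TP node graph. */
    node = build_reference_graph(context, graph);          /* hypothetical helper */
#endif
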

View File

@ -57,6 +57,12 @@ enum vx_graph_attribute_internal_type_e
VX_GRAPH_AXI_SRAM_PRE_LOAD = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x2,
/*! \brief Queries a graph for its running priority (read-write). Use a <tt>\ref vx_uint32</tt> parameter. */
VX_GRAPH_PRIORITY_VALUE_VIV = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x3,
VX_GRAPH_PSI_EXTRATOR_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x4,
VX_GRAPH_PSI_FILLER_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x5,
VX_GRAPH_DENOISE_POSTPROCESS_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x6,
VX_GRAPH_DATA_COMPRESSION_RATIO = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x7,
VX_GRAPH_ISP_EMULATION_PARAMETER = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x8,
VX_GRAPH_PROCESS_FPS = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_GRAPH) + 0x9,
};
/*! \brief Size Alignment of User Memory
@ -209,7 +215,8 @@ enum vx_nn_activation_function_e
VX_NN_ACTIVATION_LEAKYRELU_MAX_POOLING = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x4,
VX_NN_ACTIVATION_SWISH = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x5,
VX_NN_ACTIVATION_HSWISH = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x6,
VX_NN_ACTIVATION_NONE = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x7,
VX_NN_ACTIVATION_CUSTOM = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x7,
VX_NN_ACTIVATION_NONE = VX_ENUM_BASE(VX_ID_VIVANTE, VX_ENUM_NN_ACTIVATION_FUNCTION_TYPE) + 0x8,
};
/*! \brief The Convolutional network type
@ -285,6 +292,59 @@ enum vx_tensor_rank_type_e
VX_TENSOR_RANK_SN,
};
/*! \brief The priority attribute of a tensor.
* \ingroup group_tensor
* \version 0.4
*/
enum vx_tensor_priority_e
{
/*! \brief no special requirement */
VX_TENSOR_DEFAULT = 0,
/*! \brief 2nd input(reference) */
/*VX_TENSOR_2ND_INPUT_FOR = 1,*/
VX_TENSOR_FOR_GRAPH_REFERENCE = 1,
};
/*! \brief The attribute of tensor memory.
* \ingroup group_tensor
* \version 0.4
*/
enum vx_tensor_memory_attribute_e
{
/*! \brief no special requirement */
VX_TENSOR_MEMORY_DEFAULT = 0,
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_0 = (0x1 << 0),
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_1 = (0x1 << 1),
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_2 = (0x1 << 2),
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_3 = (0x1 << 3),
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_4 = (0x1 << 4),
/*
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_5 = (0x1 << VX_DMA5_IN_ISP_OCM_PSI),
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_6 = (0x1 << VX_DMA6_DDR_DECOMPRESS),
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_7 = (0x1 << VX_DMA7_POSTOUT_OCM_ISP),
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_8 = (0x1 << VX_DMA8_COMPRESS_DDR),
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_9 = (0x1 << VX_DMA9_ISP_PATTERN_GENERATOR),
VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_10 = (0x1 << VX_DMA10_ISP_CHECKSUM_GENERATOR),
*/
/*! \brief DMA transfer data to VIP and enable circular buffer */
#if !VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL
VX_TENSOR_MEMORY_ENABLE_CIRCULAR_BY_DMA = 0xFFFFFFFF,
#endif
};
enum vx_dma_extrator_pad_mode_e
{
/*! \brief no special requirement */
VX_DMA_EXTRATOR_PAD_CONST = 0,
/*! \brief DMA extrator pad with nearest edge */
VX_DMA_EXTRATOR_PAD_WITH_NEAREAST_EDGE = 1,
};
/*! \brief The precision of tensor.
* \ingroup group_tensor
* \version 0.4
@ -601,6 +661,19 @@ VX_API_ENTRY vx_tensor VX_API_CALL vxReshapeTensor(vx_tensor tensor, vx_int32* n
*/
VX_API_ENTRY vx_status VX_API_CALL vxSetTensorAttribute(vx_tensor tensor, vx_enum attribute, const void *ptr, vx_size size);
/*! \brief Creates an opaque reference to a tensor data buffer.
* \details The tensor is a dummy tensor that does not allocate any memory; it cannot be reshaped or viewed.
* Not guaranteed to exist until the <tt>vx_graph</tt> containing it has been verified.
* \param [in] context The reference to the implementation context.
* \param [in] number_of_dims The number of dimensions.
* \param [in] dims Dimensions sizes in elements.
* \param [in] data_format The <tt>\ref vx_type_e</tt> that represents the data format of the tensor data elements.
* \return A tensor data reference or zero when an error is encountered.
* \ingroup group_tensor
* \version 0.3
*/
VX_API_ENTRY vx_tensor VX_API_CALL vxCreateDummyTensor(vx_context context, vx_size number_of_dims, const vx_size *dims, vx_enum data_format);
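
A minimal usage sketch for the new dummy-tensor API (illustrative; error handling trimmed):

vx_size dims[4] = { 224, 224, 3, 1 };
/* No buffer is allocated; the reference only carries shape and data format. */
vx_tensor dummy = vxCreateDummyTensor(context, 4, dims, VX_TYPE_UINT8);
if (vxGetStatus((vx_reference)dummy) != VX_SUCCESS)
{
    /* creation failed */
}
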
/*! \brief The type enumeration lists all NN extension types.
* \ingroup group_cnn
@ -1317,6 +1390,13 @@ typedef struct _vx_nn_scale_params_t
vx_enum type; /*!< \brief The interpolation type, only support VX_INTERPOLATION_BILINEAR. */
} vx_nn_scale_params_t, * vx_nn_scale_params;
typedef struct _vx_nn_scale_params_ext_t
{
vx_nn_scale_params_t base;
vx_bool align_corners;
vx_bool half_pixel_centers;
} vx_nn_scale_params_ext_t, * vx_nn_scale_params_ext;
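
A hedged sketch of filling the extended resize parameters introduced together with VX_SCALE_EXTRA_PARAMETER_SUPPORT (values are illustrative):

vx_nn_scale_params_ext_t scale_params;
scale_params.base.type          = VX_INTERPOLATION_BILINEAR; /* only bilinear is supported */
scale_params.align_corners      = vx_false_e;
scale_params.half_pixel_centers = vx_true_e;
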
/*! \brief [Graph] Creates a scale Layer Node.
* \param [in] graph The reference to the parent graph.
* \param [in] input The input tensor data to scale.
@ -2054,8 +2134,15 @@ typedef struct _vx_hardware_caps_params_ext_t
vx_hardware_caps_params_t base;
vx_uint32 subGroupSize; /*!< \brief shader sub-group size.*/
vx_bool supportVA40; /*!< \brief support 40bit virtual address.*/
vx_uint32 supportStreamProcessor; /*!< \brief support stream processor.*/
} vx_hardware_caps_params_ext_t;
typedef struct _vx_hardware_caps_params_ext2_t
{
vx_hardware_caps_params_ext_t base;
vx_uint32 streamProcessorExecCount; /*!< \brief streamprocess execution count. */
} vx_hardware_caps_params_ext2_t;
/*! \brief Queries hardware caps information.
* \param [in] context The reference to the context.
* \param [in] hardware_caps_params <tt>\ref vx_hardware_caps_params_t </tt>.

View File

@ -219,6 +219,15 @@ typedef struct _vx_nn_convolution_relu_pooling_params_ext4_t
vx_bool enable_nn_tensor_add_relu; /*!< \brief Enable Relu function after tensor add. */
} vx_nn_convolution_relu_pooling_params_ext4_t, * vx_nn_convolution_relu_pooling_params_ext4;
typedef struct _vx_nn_convolution_relu_pooling_params_ext5_t
{
vx_nn_convolution_relu_pooling_params_ext4_t ext4; /*!< \brief convolution relu pooling params <tt>\ref vx_nn_convolution_relu_pooling_params_ext_t</tt> */
vx_object_array inputs_list;
vx_object_array outputs_list;
vx_spinst spinst_obj;
} vx_nn_convolution_relu_pooling_params_ext5_t, * vx_nn_convolution_relu_pooling_params_ext5;
/*! \brief [Graph] Creates a Convolutional Network Convolution and Activation(Relu) and Pooling Layer Node; this function matches the Khronos NN Extension 1.2 version.
* \details This function implements the Convolutional Network Convolution and Activation(Relu) and Pooling layer.
* For fixed-point data types, a fixed-point calculation is performed with round and saturate according to the number of accumulator bits. The number of accumulator bits is implementation defined,

View File

@ -963,6 +963,40 @@ VX_API_ENTRY vx_node VX_API_CALL vxBatchGemmNode(vx_graph graph,
vx_scalar trans_c,
vx_tensor output);
typedef struct _vx_lut_params_s
{
vx_enum lut_function; /*!< \brief Set VX_NN_ACTIVATION_NONE to disable the LUT, VX_NN_ACTIVATION_CUSTOM to supply a custom LUT, or any other activation value to use a fixed LUT table */
vx_float32 float_values[4]; /*!< \brief Float parameters of fixed lut table */
vx_uint32 fvalues_count; /*!< \brief Count of float_values */
vx_int32 int_values[4]; /*!< \brief Int parameters of fixed lut table */
vx_uint32 ivalues_count; /*!< \brief Count of int_values */
vx_lut in_lut; /*!< \brief Only valid when lut_function is VX_NN_ACTIVATION_CUSTOM */
vx_lut out_lut; /*!< \brief Only valid when lut_function is VX_NN_ACTIVATION_CUSTOM */
} vx_lut_params_s, * vx_lut_params;
/*! \brief Create a stream processor node.
* \param [in] graph The reference to the graph.
* \param [in] input_list The input tensor list.
* \param [in] input_count The input tensor count.
* \param [in] output_list The output tensor list.
* \param [in] output_count The output tensor count.
* \param [in] spinst_obj The stream processor instruction object. Use vxCreateSPINST() to create.
* \param [in] lut_params The lut parameters. Refer to vx_lut_params_s.
* \return <tt>\ref vx_node</tt>.
* \retval vx_node A node reference. Any possible errors preventing a successful creation
* should be checked using <tt>\ref vxGetStatus</tt>
* \ingroup group_vision_function_sp
*/
VX_API_ENTRY vx_node VX_API_CALL vxStreamProcessorNode(
vx_graph graph,
vx_tensor* input_list,
vx_uint32 input_count,
vx_tensor* output_list,
vx_uint32 output_count,
vx_spinst spinst_obj,
vx_lut_params lut_params
);
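
A hedged call-sequence sketch for the new API (the instruction program itself is built through the vx_spinst object declared in vx_spinst.h; register choices and tensor counts here are arbitrary, and error handling is omitted):

vx_spinst sp = vxCreateSPINST(context);
/* ... describe the per-element program with vxAddOneInstToSPINST() ... */

vx_lut_params_s lut_params = { 0 };
lut_params.lut_function = VX_NN_ACTIVATION_NONE;   /* no LUT stage */

vx_tensor in_list[1]  = { input_tensor };
vx_tensor out_list[1] = { output_tensor };
vx_node sp_node = vxStreamProcessorNode(graph, in_list, 1, out_list, 1, sp, &lut_params);
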
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,332 @@
/****************************************************************************
*
* Copyright 2017 - 2021 Vivante Corporation, Santa Clara, California.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* 'Software'), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject
* to the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VIVANTE AND/OR ITS SUPPLIERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VX_SPINST_H_
#define _VX_SPINST_H_
#ifdef __cplusplus
extern "C" {
#endif
typedef enum _vx_sp_inst_type_e
{
VX_SP_INST_TYPE_FADD,
VX_SP_INST_TYPE_FMULT,
VX_SP_INST_TYPE_MOVE,
VX_SP_INST_TYPE_PWL,
VX_SP_INST_TYPE_COUNT,
}
vx_sp_inst_type_e;
typedef enum _vx_sp_inst_type_fadd_e
{
VX_SP_INST_TYPE_FADD_IDLE, // FADD-IDLE
VX_SP_INST_TYPE_FADD_ADD, // dst = src0 + src1
VX_SP_INST_TYPE_FADD_SUB, // dst = src0 - src1
VX_SP_INST_TYPE_FADD_COUNT,
}
vx_sp_inst_type_fadd_e;
typedef enum _vx_sp_inst_type_fmult_e
{
VX_SP_INST_TYPE_FMULT_IDLE, /* FMULT-IDLE */
VX_SP_INST_TYPE_FMULT_MUL, /* dst = src0 * src1 */
VX_SP_INST_TYPE_FMULT_MUL_CLAMP, /* dst = clamp (src0, src1, R6, R7) */
VX_SP_INST_TYPE_FMULT_COUNT,
}
vx_sp_inst_type_fmult_e;
typedef enum _vx_sp_inst_type_move_e
{
VX_SP_INST_TYPE_MOVE_IDLE,
VX_SP_INST_TYPE_MOVE_MOVE, // dst = src1
VX_SP_INST_TYPE_MOVE_SEL0, // dst = (src0 > 0) ? src1[0] : src1[1]
VX_SP_INST_TYPE_MOVE_SEL1, // dst = (src0 > 0) ? src1 : FA-src0 // use FA's SRC0
VX_SP_INST_TYPE_MOVE_IMMD, // dst = Constant assign immediate
VX_SP_INST_TYPE_MOVE_ABS, // dst = abs(src1)
VX_SP_INST_TYPE_MOVE_COUNT,
}
vx_sp_inst_type_move_e;
typedef enum _vx_sp_inst_type_pwl_e
{
VX_SP_INST_TYPE_PWL_IDLE,
VX_SP_INST_TYPE_PWL_SETUP_0, /* PWL ID = 0 */
VX_SP_INST_TYPE_PWL_SETUP_1, /* Sigmoid() */
VX_SP_INST_TYPE_PWL_SETUP_2, /* Tanh() */
VX_SP_INST_TYPE_PWL_COUNT,
}
vx_sp_inst_type_pwl_e;
typedef enum _vx_sp_inst_src_dst_e
{
VX_SP_INST_SPINOUT,
VX_SP_INST_SR1,
VX_SP_INST_SR2,
VX_SP_INST_SR3,
VX_SP_INST_SR4,
VX_SP_INST_SR5,
VX_SP_INST_SR6, /* nn_clamp_min */
VX_SP_INST_SR7, /* nn_clamp_max */
VX_SP_INST_SR8,
VX_SP_INST_SR9,
VX_SP_INST_SR10,
VX_SP_INST_VR11,
VX_SP_INST_VR12,
VX_SP_INST_VR13,
VX_SP_INST_VR14,
VX_SP_INST_SETUPOUT, /* Input of PWL Mult and Add: FMInA, FMInB, FAInA, FAInB */
}
vx_sp_inst_src_dst_e;
typedef struct _vx_spinst_unit_param
{
vx_enum op; /* vx_sp_inst_type_e */
struct
{
vx_enum op; /* vx_sp_inst_type_fadd/fmult/move/pwl_e */
struct
{
vx_uint8 src0; /* vx_sp_inst_src_dst_e */
vx_uint8 src1; /* vx_sp_inst_src_dst_e */
vx_uint8 dst; /* vx_sp_inst_src_dst_e */
vx_float32 constant;
} var;
} sub;
}
vx_spinst_unit_param;
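
A hedged sketch of describing a single FADD instruction with this structure and registering it on a spinst object via vxAddOneInstToSPINST() declared below (register choices are arbitrary):

vx_spinst_unit_param unit = { 0 };
unit.op           = VX_SP_INST_TYPE_FADD;
unit.sub.op       = VX_SP_INST_TYPE_FADD_ADD;   /* dst = src0 + src1 */
unit.sub.var.src0 = VX_SP_INST_SR1;
unit.sub.var.src1 = VX_SP_INST_SR2;
unit.sub.var.dst  = VX_SP_INST_SPINOUT;
vxAddOneInstToSPINST(spinst_obj, &unit, 1);
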
/**********************************************************************************************/
typedef enum _vx_sp_attribute_e
{
VX_SP_ATTRIBUTE_NONE,
VX_SP_ATTRIBUTE_INPUT_TILE_MAPPING,
VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_X,
VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_Y,
VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_Z,
VX_SP_ATTRIBUTE_PROG_INIT_INSTR_NUM,
VX_SP_ATTRIBUTE_PROG_LOOP_INSTR_NUM,
VX_SP_ATTRIBUTE_PROG_COMPLETE_INSTR_NUM,
VX_SP_ATTRIBUTE_PROG_ROUNDING_MODE,
VX_SP_ATTRIBUTE_INPUT_SETUP,
VX_SP_ATTRIBUTE_IGNORED_LEADING_OUTPUTS,
VX_SP_ATTRIBUTE_FLUSH_CYCLE_NUM,
VX_SP_ATTRIBUTE_IGNORED_LEADING_V11_WR,
VX_SP_ATTRIBUTE_IGNORED_LEADING_V12_WR,
VX_SP_ATTRIBUTE_IGNORED_LEADING_V11_RD,
VX_SP_ATTRIBUTE_IGNORED_LEADING_V12_RD,
VX_SP_ATTRIBUTE_CH0_POST_REDISTRIBUTE,
VX_SP_ATTRIBUTE_CH1_POST_REDISTRIBUTE,
VX_SP_ATTRIBUTE_V11_RESET_AT_START,
VX_SP_ATTRIBUTE_V12_RESET_AT_START,
VX_SP_ATTRIBUTE_V11_POP_CONFIG,
VX_SP_ATTRIBUTE_V12_POP_CONFIG,
VX_SP_ATTRIBUTE_ACCELERATOR_INPUT_SELECT,
VX_SP_ATTRIBUTE_IGNORED_LEADING_ACC_OUT,
VX_SP_ATTRIBUTE_SUM_ENGINE_RESET,
VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL,
VX_SP_ATTRIBUTE_SUM_ENGINE_NUM_CH_MINUS_ONE,
VX_SP_ATTRIBUTE_SUM_ENGINE_2D_ACCUM_STORAGE,
VX_SP_ATTRIBUTE_GENERAL_COUNT,
VX_SP_ATTRIBUTE_CONST0, /* NN post multiplier */
VX_SP_ATTRIBUTE_CONST1, /* NN neg pos multiplier */
VX_SP_ATTRIBUTE_CONST2, /* NN tensor add const */
VX_SP_ATTRIBUTE_CONST3, /* NN clamp max */
VX_SP_ATTRIBUTE_CONST4, /* NN clamp min */
VX_SP_ATTRIBUTE_TOTAL_COUNT,
}
vx_sp_attribute_e;
typedef enum _vx_sp_attribute_input_tile_mapping_e
{
VX_SP_ATTRIBUTE_INPUT_TILE_MAPPING_XYMERGE,
VX_SP_ATTRIBUTE_INPUT_TILE_MAPPING_YZMERGE,
}
vx_sp_attribute_input_tile_mapping_e;
typedef enum _vx_sp_attribute_output_collapse_e
{
VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_DISABLED,
VX_SP_ATTRIBUTE_OUTPUT_COLLAPSE_ENABLED,
}
vx_sp_attribute_output_collapse_e;
typedef enum _vx_sp_attribute_rounding_mode_e
{
VX_SP_ATTRIBUTE_PROG_ROUNDING_MODE_RTNE,
VX_SP_ATTRIBUTE_PROG_ROUNDING_MODE_STICKY,
}
vx_sp_attribute_rounding_mode_e;
typedef enum _vx_sp_attribute_input_setup_e
{
VX_SP_ATTRIBUTE_INPUT_SETUP_SINGLE_INPUT,
VX_SP_ATTRIBUTE_INPUT_SETUP_INTERLEAVE_TWO_INPUTS,
VX_SP_ATTRIBUTE_INPUT_SETUP_V11,
VX_SP_ATTRIBUTE_INPUT_SETUP_V12,
}
vx_sp_attribute_input_setup_e;
typedef enum _vx_sp_attribute_ch_post_redistribute_e
{
VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_DISABLED,
VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_SCALAR_GATHER,
VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_VECTOR_GATHER,
VX_SP_ATTRIBUTE_CH_POST_REDISTRIBUTE_VECTOR_SCATTER,
}
vx_sp_attribute_ch_post_redistribute_e;
typedef enum _vx_sp_attribute_v_reset_at_start_e
{
VX_SP_ATTRIBUTE_V_RESET_AT_START_NONE,
VX_SP_ATTRIBUTE_V_RESET_AT_START_RESET,
}
vx_sp_attribute_v_reset_at_start_e;
typedef enum _vx_sp_attribute_v_pop_config_e
{
VX_SP_ATTRIBUTE_V_POP_CONFIG_EVERY_READ,
VX_SP_ATTRIBUTE_V_POP_CONFIG_EVERY_ROW,
}
vx_sp_attribute_v_pop_config_e;
typedef enum _vx_sp_attribute_accelerator_input_select_e
{
VX_SP_ATTRIBUTE_ACCELERATOR_INPUT_SELECT_FROM_OUTPUT,
VX_SP_ATTRIBUTE_ACCELERATOR_INPUT_SELECT_FROM_ACCLERATOR,
}
vx_sp_attribute_accelerator_input_select_e;
typedef enum _vx_sp_attribute_sum_engine_reset_e
{
VX_SP_ATTRIBUTE_SUM_ENGINE_RESET_NONE,
VX_SP_ATTRIBUTE_SUM_ENGINE_RESET_RESET,
}
vx_sp_attribute_sum_engine_reset_e;
typedef enum _vx_sp_attribute_sum_engine_control_e
{
VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL_ACCUM_INTERNAL,
VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL_ACCUM_1D,
VX_SP_ATTRIBUTE_SUM_ENGINE_CONTROL_ACCUM_2D,
}
vx_sp_attribute_sum_engine_control_e;
typedef enum _vx_sp_attribute_sum_engine_num_ch_minus_one_e
{
VX_SP_ATTRIBUTE_SUM_ENGINE_NUM_CH_MINUS_ONE_ONE_CH,
VX_SP_ATTRIBUTE_SUM_ENGINE_NUM_CH_MINUS_ONE_TWO_CH,
}
vx_sp_attribute_sum_engine_num_ch_minus_one_e;
typedef enum _vx_sp_attribute_sum_engine_2d_accum_storage_e
{
VX_SP_ATTRIBUTE_SUM_ENGINE_2D_ACCUM_STORAGE_SAME,
VX_SP_ATTRIBUTE_SUM_ENGINE_2D_ACCUM_STORAGE_DIFFERENT,
}
vx_sp_attribute_sum_engine_2d_accum_storage_e;
/**********************************************************************************************/
/*! \brief Creates an opaque reference to a spinst data.
* \param [in] context The reference to the implementation context.
* \return A spinst data reference.
* Any possible errors preventing a successful creation should be checked using <tt>\ref vxGetStatus</tt>.
* \ingroup group_object_spinst
*/
VX_API_ENTRY vx_spinst VX_API_CALL vxCreateSPINST(
vx_context context
);
/*! \brief Releases a reference to a spinst object.
* The object may not be garbage collected until its total reference count is zero.
* \param [in] spinst_obj The pointer to the spinst data to release.
* \post After returning from this function the reference is zeroed.
* \return A <tt>\ref vx_status_e</tt> enumeration.
* \retval VX_SUCCESS No errors; all other values indicate failure
* \retval * An error occurred. See <tt>\ref vx_status_e</tt>.
* \ingroup group_object_spinst
*/
VX_API_ENTRY vx_status VX_API_CALL vxReleaseSPINST(
vx_spinst *spinst_obj
);
/*! \brief Add an instruction to a spinst object.
* \param [in] spinst_obj The reference to the spinst object.
* \param [in] inst_unit_array The units of one instruction. Use a <tt>\ref vx_spinst_unit_param</tt>.
* \param [in] inst_unit_count The count of instruction units.
* \return A <tt>\ref vx_status_e</tt> enumeration.
* \retval VX_SUCCESS No errors.
* \retval VX_ERROR_INVALID_REFERENCE If spinst_obj is not a valid <tt>\ref vx_spinst</tt>.
* \retval VX_ERROR_INVALID_PARAMETERS If any of the parameters are incorrect.
* \retval VX_ERROR_NO_MEMORY If internal instruction memory allocation fails.
* \ingroup group_object_spinst
*/
VX_API_ENTRY vx_status VX_API_CALL vxAddOneInstToSPINST(
vx_spinst spinst_obj,
vx_spinst_unit_param* inst_unit_array,
vx_uint8 inst_unit_count
);
/*! \brief Set various attributes of a spinst data.
* \param [in] spinst_obj The reference to the vx_spinst object to set.
* \param [in] attribute The attribute to set. Use a <tt>\ref vx_sp_attribute_e</tt>.
* \param [in] value The value of attribute.
* \return A <tt>\ref vx_status_e</tt> enumeration.
* \retval VX_SUCCESS No errors.
* \retval VX_ERROR_INVALID_REFERENCE If data is not a <tt>\ref vx_spinst</tt>.
* \retval VX_ERROR_INVALID_PARAMETERS If the attribute is incorrect.
* \ingroup group_object_spinst
*/
VX_API_ENTRY vx_status VX_API_CALL vxSetAttributeToSPINST(
vx_spinst spinst_obj,
vx_enum attribute,
vx_uint32 value
);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -342,6 +342,10 @@ typedef struct _vx_tensorpatch_addressing_t * vx_trensor_addressing;
*/
typedef struct _vx_weights_biases_parameter_s * vx_weights_biases_parameter;
/*! \brief The object for stream processor
* \ingroup group_spinst
*/
typedef struct _vx_spinst_s * vx_spinst;
/*! \brief A Boolean value.
* This allows 0 to be FALSE, as it is in C, and any non-zero to be TRUE.
@ -470,6 +474,7 @@ enum vx_type_e {
/* \todo add new object types here */
VX_TYPE_BFLOAT16 = 0x81A,/*!< \brief A <tt>\ref vx_bfloat16</tt>. */
VX_TYPE_SPINST = 0x81B,/*!< \brief A <tt>\ref vx_spinst</tt>. */
VX_TYPE_INT4 = 0x81C,/*!< \brief A signed 4-bit tensor element type. */
VX_TYPE_UINT4 = 0x81D,/*!< \brief An unsigned 4-bit tensor element type. */
};
@ -1021,6 +1026,8 @@ enum vx_node_attribute_e {
VX_NODE_ATTRIBUTE_CONST_TENSOR_CACHE = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0x9,
VX_NODE_ATTRIBUTE_FOR_HW_QUALITY = VX_ATTRIBUTE_BASE(VX_ID_KHRONOS, VX_TYPE_NODE) + 0xA,
};
/*! \brief The parameter attributes list
@ -1290,6 +1297,9 @@ enum vx_tensor_attribute_e
VX_TENSOR_LIFETIME = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x5,
/*! \brief the value status of tensor. */
VX_TENSOR_VALUE = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x6,
/*XiaoMi project*/
VX_TENSOR_INPUT_FOR_REFERENCE = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x7,
VX_TENSOR_MEMORY_ATTRIBUTE = VX_ATTRIBUTE_BASE(VX_ID_VIVANTE, VX_TYPE_TENSOR) + 0x8,
};
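
A hedged sketch of using the new attribute together with the vx_tensor_memory_attribute_e flags above (the exact value type expected by the driver is an assumption here):

/* Bind this tensor's buffer to DMA channel 2; only meaningful when
 * VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL is enabled in vx_khr_compatible.h. */
vx_enum mem_attr = VX_TENSOR_MEMORY_CONNECT_DMA_CHANNEL_2;
vxSetTensorAttribute(tensor, VX_TENSOR_MEMORY_ATTRIBUTE, &mem_attr, sizeof(mem_attr));
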
/*! \brief The meta valid rectangle attributes.

View File

@ -1 +0,0 @@
libOpenVX.so.1.3.0

Binary file not shown.

View File

@ -1 +0,0 @@
libOpenVX.so.1.3.0

Binary file not shown.

View File

@ -172,3 +172,10 @@ DEF_OP(PRE_PROCESS_RGB888_PLANAR)
DEF_OP(GATHER_ELEMENTS)
DEF_OP(SELU)
DEF_OP(CELU)
DEF_OP(MAX_POOL3D)
DEF_OP(RCP)
DEF_OP(SIGN)
DEF_OP(SOFTSIGN)
DEF_OP(CUMSUM)
DEF_OP(MAXPOOLWITHARGMAX)
DEF_OP(MOD)

View File

@ -25,10 +25,13 @@
#ifndef _VSI_NN_GPU_CONFIG_H
#define _VSI_NN_GPU_CONFIG_H
#define GPU_TENSOR_MAX_WIDTH (65536)
#ifdef VSI_40BIT_VA_SUPPORT
#define GPU_TENSOR_MAX_WIDTH (1 << 30)
#else
#define GPU_TENSOR_MAX_WIDTH (1 << 16)
#endif
#define GPU_MAX_MULTIPLIER_NUM (65535)
#define GPU_MAX_POST_SHIFT_BITS (31)
#define GPU_TENSOR_DIM_2 (2)
#endif

View File

@ -156,6 +156,8 @@ typedef struct
vsi_nn_kernel_quant_asymm_t asymm;
vsi_nn_kernel_quant_asymm_perchannel_t asymm_v;
};
float scale;
int32_t zero_point;
} vsi_nn_kernel_tensor_attr_t;
typedef struct
@ -411,7 +413,7 @@ vsi_status vsi_nn_kernel_node_pass_param
size_t num
);
static inline void vsi_nn_kernel_node_release
static VSI_INLINE_API void vsi_nn_kernel_node_release
(
vsi_nn_kernel_node_t * node
)
@ -422,7 +424,7 @@ static inline void vsi_nn_kernel_node_release
}
}
static inline void vsi_nn_kernel_node_pack_io
static VSI_INLINE_API void vsi_nn_kernel_node_pack_io
(
vsi_nn_kernel_node_param_t * params,
size_t param_num,
@ -476,7 +478,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector
);
/** Map data type to gpu internal dtype. */
static inline vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype
static VSI_INLINE_API vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype
(
vsi_nn_type_e dtype
)
@ -516,7 +518,7 @@ static inline vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype
return I8;
} /* vsi_nn_kernel_map_dtype() */
static inline vsi_nn_type_e vsi_nn_dtype_map_kernel
static VSI_INLINE_API vsi_nn_type_e vsi_nn_dtype_map_kernel
(
vsi_nn_kernel_dtype_e dtype
)
@ -556,7 +558,7 @@ static inline vsi_nn_type_e vsi_nn_dtype_map_kernel
return VSI_NN_TYPE_INT8;
} /* vsi_nn_kernel_map_dtype() */
static inline size_t vsi_nn_kernel_dtype_get_bytes
static VSI_INLINE_API size_t vsi_nn_kernel_dtype_get_bytes
(
vsi_nn_kernel_dtype_e dtype
)
@ -585,7 +587,7 @@ static inline size_t vsi_nn_kernel_dtype_get_bytes
return 0;
} /* vsi_nn_kernel_dtype_get_bytes() */
static inline vsi_size_t vsi_nn_kernel_dtype_get_bits
static VSI_INLINE_API vsi_size_t vsi_nn_kernel_dtype_get_bits
(
vsi_nn_kernel_dtype_e dtype
)
@ -617,7 +619,7 @@ static inline vsi_size_t vsi_nn_kernel_dtype_get_bits
return 0;
} /* vsi_nn_kernel_dtype_get_bits() */
static inline vsi_nn_kernel_quant_type_e vsi_nn_kernel_map_quant_type
static VSI_INLINE_API vsi_nn_kernel_quant_type_e vsi_nn_kernel_map_quant_type
( vsi_nn_qnt_type_e quant_type )
{
switch( quant_type )
@ -658,7 +660,7 @@ vsi_nn_kernel_scalar_t vsi_nn_kernel_scalar_create
const void * data
);
static inline void vsi_nn_kernel_scalar_release
static VSI_INLINE_API void vsi_nn_kernel_scalar_release
( vsi_nn_kernel_scalar_t * scalar )
{
if( scalar && *scalar )
@ -803,7 +805,7 @@ vsi_status vsi_nn_kernel_tensor_write
size_t size
);
static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_size
static VSI_INLINE_API vsi_size_t vsi_nn_kernel_tensor_attr_get_size
( const vsi_nn_kernel_tensor_attr_t * attr )
{
if( !attr )
@ -813,7 +815,7 @@ static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_size
return vsi_nn_shape_get_size( attr->shape->data, (vsi_size_t)attr->shape->size );
} /* vsi_nn_kernel_tensor_attr_get_size() */
static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_bytes
static VSI_INLINE_API vsi_size_t vsi_nn_kernel_tensor_attr_get_bytes
( const vsi_nn_kernel_tensor_attr_t * attr )
{
vsi_size_t i = 0;
@ -851,7 +853,7 @@ static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_bytes
return bytes;
} /* vsi_nn_kernel_tensor_attr_get_bytes() */
static inline void vsi_nn_kernel_tensor_attr_get_stride
static VSI_INLINE_API void vsi_nn_kernel_tensor_attr_get_stride
( const vsi_nn_kernel_tensor_attr_t * attr, vsi_size_t * out_stride)
{
vsi_size_t type_bits;
@ -902,7 +904,7 @@ static inline void vsi_nn_kernel_tensor_attr_get_stride
}
} /* vsi_nn_kernel_tensor_attr_get_size() */
static inline vsi_bool vsi_nn_kernel_tensor_attr_is_quantized
static VSI_INLINE_API vsi_bool vsi_nn_kernel_tensor_attr_is_quantized
( const vsi_nn_kernel_tensor_attr_t * attr )
{
return ( attr && attr->quant > VSI_NN_KERNEL_QUANT_NONE
@ -1072,7 +1074,7 @@ OVXLIB_API vsi_status vsi_nn_KernelGpuConfig
const gpu_param_t * gpu_param
);
static inline const char* vsi_nn_kernel_type_str
static VSI_INLINE_API const char* vsi_nn_kernel_type_str
(
vsi_nn_kernel_type_e type
)
@ -1095,7 +1097,7 @@ static inline const char* vsi_nn_kernel_type_str
return "None";
} /* vsi_nn_kernel_type_str() */
static inline vsi_status vsi_nn_kernel_unpack_4bit_data
static VSI_INLINE_API vsi_status vsi_nn_kernel_unpack_4bit_data
(
const vsi_nn_kernel_tensor_attr_t * attr,
uint8_t * src,
@ -1162,7 +1164,7 @@ static inline vsi_status vsi_nn_kernel_unpack_4bit_data
return status;
}
static inline vsi_status vsi_nn_kernel_pack_4bit_data
static VSI_INLINE_API vsi_status vsi_nn_kernel_pack_4bit_data
(
const vsi_nn_kernel_tensor_attr_t * attr,
uint8_t * src,

View File

@ -46,6 +46,8 @@ typedef int32_t vsi_nn_kernel_lut_act_e; enum
VSI_NN_KERNEL_LUT_CLIP = 12,
VSI_NN_KERNEL_LUT_SQUARE = 13,
VSI_NN_KERNEL_LUT_CELU = 14,
VSI_NN_KERNEL_LUT_RCP = 15,
VSI_NN_KERNEL_LUT_SOFTSIGN = 16,
};
#define VSI_NN_KERNEL_LUT_MAX_SIZE (1024)

View File

@ -30,11 +30,20 @@
extern "C" {
#endif
typedef struct _vsi_nn_crop_lcl_data
{
vx_int32 begin_dims[VSI_NN_MAX_DIM_NUM];
vx_int32 end_dims[VSI_NN_MAX_DIM_NUM];
vx_int32 stride_dims[VSI_NN_MAX_DIM_NUM];
} vsi_nn_crop_lcl_data;
typedef struct _vsi_nn_crop_param
{
int32_t axis;
uint32_t dims;
uint32_t offset[VSI_NN_MAX_DIM_NUM];
vsi_nn_crop_lcl_data *lcl_data;
} vsi_nn_crop_param;
#ifdef __cplusplus

View File

@ -0,0 +1,45 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CUMSUM_H
#define _VSI_NN_OP_CUMSUM_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_cumsum_param
{
int32_t axis;
vsi_bool exclusive;
vsi_bool reverse;
} vsi_nn_cumsum_param;
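
A hedged sketch of configuring the new op through ovxlib's usual node flow (assumes vsi_nn_AddNode() and the VSI_NN_OP_CUMSUM value generated from DEF_OP(CUMSUM); error handling omitted):

vsi_nn_node_t *node = vsi_nn_AddNode(graph, VSI_NN_OP_CUMSUM, 1, 1, NULL);
node->nn_param.cumsum.axis      = 2;      /* accumulate along the third dimension */
node->nn_param.cumsum.exclusive = FALSE;
node->nn_param.cumsum.reverse   = FALSE;
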
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,55 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_MAX_POOL3D_H
#define _VSI_NN_OP_MAX_POOL3D_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_max_pool3d_param
{
struct _max_pool3d_local_data_t* local;
// Add parameters here
/* round_type is used to calculate the output shape */
vsi_nn_round_type_e round_type;
uint32_t ksize[3];
uint32_t stride[3];
/* Pad left, right, top, bottom */
uint32_t pad[6];
/* Pad type default value shall be AUTO */
vsi_nn_pad_e pad_type;
} vsi_nn_max_pool3d_param;
_compiler_assert(offsetof(vsi_nn_max_pool3d_param, local) == 0, \
vsi_nn_max_pool3d_h );
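
As with the cumsum sketch above, a hedged configuration example for the new 3D max-pooling op (assumes vsi_nn_AddNode() and the VSI_NN_OP_MAX_POOL3D value generated from DEF_OP(MAX_POOL3D); includes and error handling omitted):

vsi_nn_node_t *node = vsi_nn_AddNode(graph, VSI_NN_OP_MAX_POOL3D, 1, 1, NULL);
node->nn_param.max_pool3d.round_type = VSI_NN_ROUND_FLOOR;
node->nn_param.max_pool3d.ksize[0]  = 2; node->nn_param.max_pool3d.ksize[1]  = 2; node->nn_param.max_pool3d.ksize[2]  = 2;
node->nn_param.max_pool3d.stride[0] = 2; node->nn_param.max_pool3d.stride[1] = 2; node->nn_param.max_pool3d.stride[2] = 2;
memset(node->nn_param.max_pool3d.pad, 0, sizeof(node->nn_param.max_pool3d.pad));
node->nn_param.max_pool3d.pad_type = VSI_NN_PAD_AUTO;
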
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,44 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_MOD_H
#define _VSI_NN_OP_MOD_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_mod_param
{
int32_t fmod;
} vsi_nn_mod_param;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_RCP_H
#define _VSI_NN_OP_RCP_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_rcp_param
{
struct _rcp_local_data_t* local;
// Add parameters here
} vsi_nn_rcp_param;
_compiler_assert(offsetof(vsi_nn_rcp_param, local) == 0, \
vsi_nn_rcp_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_SIGN_H
#define _VSI_NN_OP_SIGN_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_sign_param
{
struct _sign_local_data_t* local;
// Add parameters here
} vsi_nn_sign_param;
_compiler_assert(offsetof(vsi_nn_sign_param, local) == 0, \
vsi_nn_sign_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_SOFTSIGN_H
#define _VSI_NN_OP_SOFTSIGN_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_softsign_param
{
struct _softsign_local_data_t* local;
// Add parameters here
} vsi_nn_softsign_param;
_compiler_assert(offsetof(vsi_nn_softsign_param, local) == 0, \
vsi_nn_softsign_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@ -32,7 +32,7 @@
extern "C" {
#endif
static inline vsi_bool type_is_integer
static VSI_INLINE_API vsi_bool type_is_integer
(
const vsi_nn_type_e type
)
@ -60,7 +60,7 @@ static inline vsi_bool type_is_integer
return ret;
} /* type_is_integer() */
static inline vsi_bool type_is_signed
static VSI_INLINE_API vsi_bool type_is_signed
(
const vsi_nn_type_e type
)
@ -86,7 +86,7 @@ static inline vsi_bool type_is_signed
return ret;
} /* type_is_signed() */
static inline uint32_t type_get_bytes
static VSI_INLINE_API uint32_t type_get_bytes
(
const vsi_nn_type_e type
)
@ -115,7 +115,7 @@ static inline uint32_t type_get_bytes
}
} /* type_get_bytes() */
static inline uint32_t type_get_bits
static VSI_INLINE_API uint32_t type_get_bits
(
const vsi_nn_type_e type
)
@ -147,7 +147,7 @@ static inline uint32_t type_get_bits
}
} /* type_get_bits() */
static inline void type_get_range
static VSI_INLINE_API void type_get_range
(
vsi_nn_type_e type,
double * max_range,
@ -186,7 +186,24 @@ static inline void type_get_range
}
} /* type_get_range() */
static inline int32_t fp32_to_affine
static VSI_INLINE_API vsi_bool fp32_is_inf
(
float val
)
{
uint32_t u_value = *(uint32_t*)&val;
if ((u_value & (uint32_t)VSI_NN_INT32_MAX) == (uint32_t)VSI_NN_FLOAT32_INF)
{
return TRUE;
}
else
{
return FALSE;
}
}
static VSI_INLINE_API int32_t fp32_to_affine
(
const float in,
const float scale,
@ -200,10 +217,17 @@ static inline int32_t fp32_to_affine
type_get_range( type, &max_range, &min_range );
data = (int32_t)(vsi_rint( in / scale ) + zero_point );
data = vsi_nn_max( (int32_t)min_range, vsi_nn_min( (int32_t)max_range , data ) );
if (fp32_is_inf(in) != 0)
{
uint32_t sign = (*(uint32_t*)&in) >> 31;
data = sign == 1 ? (int32_t)min_range : (int32_t)max_range;
}
return data;
} /* fp32_to_affine() */
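
A hedged numeric check of the new infinity handling, assuming the usual (value, scale, zero_point, type) argument order of this helper and an asymmetric uint8 quantization with scale 0.5 and zero point 10:

#include <math.h>   /* INFINITY */

int32_t q_pos = fp32_to_affine( INFINITY, 0.5f, 10, VSI_NN_TYPE_UINT8);  /* saturates to 255 */
int32_t q_neg = fp32_to_affine(-INFINITY, 0.5f, 10, VSI_NN_TYPE_UINT8);  /* saturates to 0   */
/* Previously, vsi_rint(INFINITY / scale) cast to int32_t gave an undefined result. */
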
static inline float affine_to_fp32
static VSI_INLINE_API float affine_to_fp32
(
const int32_t val,
const float scale,
@ -216,7 +240,7 @@ static inline float affine_to_fp32
return data;
} /* affine_to_fp32() */
static inline int32_t fp32_to_dfp
static VSI_INLINE_API int32_t fp32_to_dfp
(
const float in,
const int8_t fl,
@ -237,10 +261,17 @@ static inline int32_t fp32_to_dfp
}
data = vsi_nn_min( data, (int32_t)max_range );
data = vsi_nn_max( data, (int32_t)min_range );
if (fp32_is_inf(in) != 0)
{
uint32_t sign = (*(uint32_t*)&in) >> 31;
data = sign == 1 ? (int32_t)min_range : (int32_t) max_range;
}
return data;
} /* fp32_to_dfp() */
static inline float dfp_to_fp32
static VSI_INLINE_API float dfp_to_fp32
(
const int32_t val,
const int8_t fl,
@ -259,7 +290,7 @@ static inline float dfp_to_fp32
return result;
} /* dfp_to_fp32() */
static inline vsi_status integer_convert
static VSI_INLINE_API vsi_status integer_convert
(
const void * src,
vsi_nn_type_e src_type,
@ -303,7 +334,7 @@ typedef union
float f;
} _fp32_t;
static inline float fp16_to_fp32
static VSI_INLINE_API float fp16_to_fp32
(
int16_t in
)
@ -323,7 +354,7 @@ static inline float fp16_to_fp32
return o.f;
} /* fp16_to_fp32() */
static inline float bfp16_to_fp32
static VSI_INLINE_API float bfp16_to_fp32
(
int16_t in
)
@ -344,7 +375,7 @@ static inline float bfp16_to_fp32
return t3 == 0 ? 0 : out;
} /* bfp16_to_fp32() */
static inline uint16_t fp32_to_fp16
static VSI_INLINE_API uint16_t fp32_to_fp16
(
float in
)
@ -370,7 +401,7 @@ static inline uint16_t fp32_to_fp16
return (uint16_t) fp16;
} /* fp32_to_fp16() */
static inline uint16_t fp32_to_bfp16
static VSI_INLINE_API uint16_t fp32_to_bfp16
(
float in
)
@ -381,7 +412,7 @@ static inline uint16_t fp32_to_bfp16
return (uint16_t) t1;
} /* fp32_to_bfp16() */
static inline uint16_t fp32_to_bfp16_rtne
static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne
(
float in
)
@ -409,7 +440,7 @@ static inline uint16_t fp32_to_bfp16_rtne
return out;
} /* fp32_to_bfp16_rtne */
static inline vsi_status dtype_to_float32
static VSI_INLINE_API vsi_status dtype_to_float32
(
uint8_t *src,
float *dst,
@ -461,7 +492,7 @@ static inline vsi_status dtype_to_float32
return VSI_SUCCESS;
}
static inline vsi_status float32_to_dtype
static VSI_INLINE_API vsi_status float32_to_dtype
(
float src,
uint8_t *dst,

View File

@ -42,6 +42,8 @@ extern "C" {
#define vsi_clamp(x, min, max) vsi_nn_clamp(x, min, max)
#define vsi_rtne(x) vsi_rint(x)
#define VSI_NN_INT32_MAX (0x7FFFFFFF)
#define VSI_NN_FLOAT32_INF (0x7F800000)
#define VSI_NN_FLOAT32_NAN (0x7FC00000)
#define VSI_NN_FLOAT64_INF (0x7FF0000000000000)
@ -53,14 +55,14 @@ extern "C" {
size_t size; \
TYPE data[0]; \
} vsi_##NAME##_array_t; \
static inline vsi_##NAME##_array_t * vsi_##NAME##_array_create( size_t size ) { \
static VSI_INLINE_API vsi_##NAME##_array_t * vsi_##NAME##_array_create( size_t size ) { \
vsi_##NAME##_array_t * array = (vsi_##NAME##_array_t *)malloc( \
sizeof(vsi_##NAME##_array_t) + sizeof(TYPE) * size ); \
if (array == NULL) return NULL; \
array->size = size; \
return array; \
} \
static inline void vsi_##NAME##_array_release( vsi_##NAME##_array_t ** array ) \
static VSI_INLINE_API void vsi_##NAME##_array_release( vsi_##NAME##_array_t ** array ) \
{ \
if( array && *array ) { \
free( *array ); \
@ -167,7 +169,7 @@ void vsi_nn_random_uniform_transform
uint32_t len
);
static inline double copy_sign
static VSI_INLINE_API double copy_sign
(
double number,
double sign
@ -177,7 +179,7 @@ static inline double copy_sign
return (sign > 0) ? value : (-value);
} /* copy_sign() */
static inline float simple_round
static VSI_INLINE_API float simple_round
(
float x
)
@ -185,7 +187,7 @@ static inline float simple_round
return (float) copy_sign(floorf(fabsf(x) + 0.5f), x);
} /* simple_round() */
static inline double vsi_rint
static VSI_INLINE_API double vsi_rint
(
double x
)

View File

@ -65,7 +65,7 @@ extern "C" {
#define VSI_NN_DO_JOIN(X, Y) VSI_NN_DO_JOIN2(X,Y)
#define VSI_NN_DO_JOIN2(X, Y) X##Y
#if defined(_MSC_VER)
#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
#define VSI_NN_DEPRECATED(symbol, hints) \
__declspec(deprecated(VSI_NN_STRINGIZE(hints))) symbol
@ -381,7 +381,7 @@ int32_t vsi_nn_partition
* @param[in] num Number of tensors.
* @param[out] out_tensors Ordered tensors
* */
static inline void vsi_nn_reorder_tensor
static VSI_INLINE_API void vsi_nn_reorder_tensor
(
vsi_nn_tensor_t** tensors,
const int32_t* order,
@ -417,6 +417,15 @@ vsi_bool vsi_nn_is_broadcast_operaton
vsi_nn_tensor_t * output
);
vsi_bool vsi_nn_is_broadcast_axes_operaton
(
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t * output,
int32_t * axis,
int32_t axis_num
);
float vsi_nn_get_tensor_scale
(
vsi_nn_tensor_t * tensor

View File

@ -66,6 +66,8 @@ typedef struct _vsi_nn_hw_config_t
uint32_t use_40bits_va;
uint32_t support_stream_processor;
uint32_t sp_exec_count;
uint32_t sp_vector_depth;
uint32_t sp_per_core_vector_depth;
} vsi_nn_hw_config_t;
typedef struct _vsi_nn_runtime_option_t

View File

@ -35,7 +35,7 @@
struct f##_t_{ ~f##_t_(void) { f(); }}; static f##_t_ f##_; \
static void f(void)
#elif defined(_MSC_VER)
#elif (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
#pragma section(".CRT$XCU", read)
#define _INITIALIZER2(f, p) \
static void f(void); \

View File

@ -27,7 +27,7 @@
#include "vsi_nn_types.h"
#include "vsi_nn_prv.h"
static inline vsi_bool vsi_nn_feature_conv_max_kernel_size()
static VSI_INLINE_API vsi_bool vsi_nn_feature_conv_max_kernel_size()
{
return 11;
}

View File

@ -31,7 +31,7 @@
extern "C"{
#endif
#ifdef _MSC_VER
#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
#define snprintf(buffer, count, format, ...) \
_snprintf_s(buffer, count, _TRUNCATE, format, ##__VA_ARGS__)
#define vsnprintf(buffer, count, format, args) \

View File

@ -190,6 +190,12 @@
#include "ops/vsi_nn_op_gather_elements.h"
#include "ops/vsi_nn_op_selu.h"
#include "ops/vsi_nn_op_celu.h"
#include "ops/vsi_nn_op_max_pool3d.h"
#include "ops/vsi_nn_op_rcp.h"
#include "ops/vsi_nn_op_sign.h"
#include "ops/vsi_nn_op_softsign.h"
#include "ops/vsi_nn_op_cumsum.h"
#include "ops/vsi_nn_op_mod.h"
/* custom node head define define */
#include "custom/vsi_nn_custom_node_type.h"
@ -365,6 +371,12 @@ typedef union _vsi_nn_nn_param
vsi_nn_gather_elements_param gather_elements;
vsi_nn_selu_param selu;
vsi_nn_celu_param celu;
vsi_nn_max_pool3d_param max_pool3d;
vsi_nn_rcp_param rcp;
vsi_nn_sign_param sign;
vsi_nn_softsign_param softsign;
vsi_nn_cumsum_param cumsum;
vsi_nn_mod_param mod;
void* client_param;
/* custom node data struct define */

View File

@ -243,6 +243,18 @@ OVXLIB_API vsi_status vsi_nn_AddBinaryGraphInputsWithCropParam
uint32_t enable_nodes_count
);
OVXLIB_API vsi_status vsi_nn_UpdateCropParamsForBinaryGraph
(
vsi_nn_graph_t* graph,
uint32_t enabled_crop_input_idx,
uint32_t start_x,
uint32_t start_y,
uint32_t crop_w,
uint32_t crop_h,
uint32_t dst_w,
uint32_t dst_h
);
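
A hedged usage sketch based on the parameter names alone (whether the driver also rescales the cropped window to dst_w x dst_h is an assumption here):

/* Crop a 1920x1080 window at (0, 0) from binary-graph input 0 and deliver it as 1280x720. */
vsi_nn_UpdateCropParamsForBinaryGraph(graph, 0, 0, 0, 1920, 1080, 1280, 720);
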
#ifdef __cplusplus
}
#endif

View File

@ -26,7 +26,7 @@
#define _VSI_NN_PUB_H
#if !defined(OVXLIB_API)
#if defined(_WIN32)
#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
#define OVXLIB_API __declspec(dllimport)
#else
#define OVXLIB_API __attribute__((visibility("default")))

View File

@ -33,11 +33,13 @@
extern "C"{
#endif
#ifdef _WIN32
#define inline __inline
#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
#define VSI_INLINE_API __inline
#else
#define VSI_INLINE_API inline
#endif
#if (defined(_MSC_VER) || defined(__MINGW32))
#if (defined(_MSC_VER) || defined(_WIN32) || defined(__MINGW32))
#define SIZE_T_SPECIFIER "Iu"
#define SSIZE_T_SPECIFIER "Id"
#ifdef VSI_40BIT_VA_SUPPORT
@ -59,7 +61,7 @@ extern "C"{
#endif
#endif
#if defined(_MSC_VER)
#if (defined(_MSC_VER))
#include <BaseTsd.h>
typedef SSIZE_T ssize_t;
#else

View File

@ -33,7 +33,7 @@ extern "C"{
#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 1
#define VSI_NN_VERSION_PATCH 43
#define VSI_NN_VERSION_PATCH 50
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
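
For reference, the packed value for this release works out to:

/* VSI_NN_VERSION == 1 * 10000 + 1 * 100 + 50 == 10150 */
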

View File

@ -188,7 +188,7 @@ static vsi_status _query_kernel
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (input_dtype == I8)
if (input_dtype == I8 || input_dtype == I16)
{
input_dtype = I32;
}
@ -269,7 +269,6 @@ static vsi_nn_kernel_node_t _setup
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );
CHECK_STATUS_FAIL_GOTO( status, OnError );
}
}
@ -285,4 +284,3 @@ OnError:
__END_DECLS
REGISTER_BACKEND_CL( argmax, _setup )

View File

@ -188,6 +188,11 @@ static vsi_status _query_kernel
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (input_dtype == I8 || input_dtype == I16)
{
input_dtype = I32;
}
if (output_dtype == I16)
{
output_dtype = I32;
@ -264,7 +269,6 @@ static vsi_nn_kernel_node_t _setup
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );
CHECK_STATUS_FAIL_GOTO( status, OnError );
}
}

View File

@ -0,0 +1,365 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "cumsum"
#define KERNEL_SOURCE_2 "cumsum_2d"
// Add kernel hashtable here
#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \
((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))
#define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \
CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
KERNEL_SOURCE_1 },
#define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \
CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \
KERNEL_SOURCE_2 },
static const struct {
uint32_t key;
char* function_name;
const char* source_name;
} cumsum_map[] =
{
HASH_CUMSUM_KERNELS(0, U8, U8)
HASH_CUMSUM_KERNELS(0, F32, F32)
HASH_CUMSUM_KERNELS(1, U8, U8)
HASH_CUMSUM_KERNELS(1, F32, F32)
HASH_CUMSUM_KERNELS(2, U8, U8)
HASH_CUMSUM_KERNELS(2, F32, F32)
HASH_CUMSUM_KERNELS_2D(0, U8, U8)
HASH_CUMSUM_KERNELS_2D(0, F32, F32)
HASH_CUMSUM_KERNELS_2D(1, U8, U8)
HASH_CUMSUM_KERNELS_2D(1, F32, F32)
};
/*
* Kernel params
*/
static vx_param_description_t _cumsum_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _CUMSUM_PARAM_NUM _cnt_of_array( _cumsum_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_cumsum_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * input_shape = NULL;
int32_t axis = 0;
int32_t width = 0;
int32_t height = 0;
int32_t channel = 0;
int32_t w = 1;
int32_t h = 1;
int32_t c = 1;
uint32_t dim = 1;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis);
CHECK_STATUS_FAIL_GOTO(status, final );
input_shape = attr[0]->shape;
dim = (uint32_t)input_shape->size;
width = (int32_t)(input_shape->data[0]);
height = (int32_t)(input_shape->data[1]);
channel = (int32_t)(dim > 2 ? input_shape->data[2] : 1);
if (axis == 0)
{
w = 1;
h = height;
c = channel;
}
else if (axis == 1)
{
w = width;
h = 1;
c = channel;
}
else if (axis == 2)
{
w = width;
h = height;
c = 1;
}
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = w;
gpu_param.global_size[1] = h;
gpu_param.global_size[2] = c;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
} /* _cumsum_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t axis,
int32_t is_2d
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
int i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (input0_dtype == U32)
{
input0_dtype = U8;
}
if (input0_dtype == F16)
{
input0_dtype = F32;
}
if (output_dtype == U32)
{
output_dtype = U8;
}
if (output_dtype == F16)
{
output_dtype = F32;
}
key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d);
for ( i = 0; i < _cnt_of_array(cumsum_map); i ++ )
{
if ( cumsum_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(cumsum_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", cumsum_map[i].function_name );
kernel->info.parameters = _cumsum_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _cumsum_kernel_param_def );
kernel->info.initialize = _cumsum_initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
cumsum_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
cumsum_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CUMSUM_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_size_t shapes[1][VSI_NN_MAX_DIM_NUM] = {{0}};
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
int32_t exclusive = vsi_nn_kernel_param_get_int32( params, "exclusive" );
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
int32_t axis_new = 0;
int32_t is_2d = 0;
uint32_t rs_dim = 2;
int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]);
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
float in_out_scale = input_scale * output_scale;
float in_out_zp_scale = in_out_scale * input_zp;
int32_t width = 0;
int32_t height = 0;
int32_t channel = 1;
int32_t i = 0;
vsi_nn_kernel_optimize_softmax_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
shapes[0], &rs_dim, &axis_new);
if (rs_dim > 3)
{
return NULL;
}
width = (int32_t)shapes[0][0];
height = (int32_t)shapes[0][1];
if (rs_dim == 2)
{
is_2d = 1;
}
else
{
channel = (int32_t)shapes[0][2];
}
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], (vsi_size_t)rs_dim );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[0], (vsi_size_t)rs_dim );
status = _query_kernel( kernel, inputs, outputs, axis_new, is_2d );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 2;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _CUMSUM_PARAM_NUM,
reshape_tensors, 1, &reshape_tensors[1], 1 );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_new );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &exclusive );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &channel );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_out_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_out_zp_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CUMSUM_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
}
}
for (i = 0; i < 2; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( cumsum, _setup )

View File

@ -53,6 +53,9 @@ typedef enum
UNARY_HGELU,
UNARY_SELU,
UNARY_CELU,
UNARY_RCP,
UNARY_SIGN,
UNARY_SOFTSIGN,
} unary_type_e;
/*
@ -94,6 +97,13 @@ typedef enum
#define HGELU_OPERATION hard_gelu
#define SELU_OPERATION selu
#define CELU_OPERATION celu
#define RCP_OPERATION rcp
#define SIGN_OPERATION sign
#define SOFTSIGN_OPERATION softsign
#define ADD_UNARY_SH_KERNELS(name, src_type, dst_type) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, src_type, dst_type) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, src_type, dst_type)
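/* Registers both the 3D and the 2D shader variant of a unary op in one line,
 * replacing the per-variant TENSOR_UNARY_KERNELS_3D/2D pairs used previously. */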
static const struct {
uint32_t key;
@ -101,61 +111,39 @@ static const struct {
const char* source_name;
} kernel_map[] =
{
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F32, F32)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F32, F32)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F32, F32)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F32, F32)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F32, F32)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F32, F32)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F32, F32)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F32, F32)
TENSOR_UNARY_KERNELS_3D(HGELU_OPERATION, UNARY_HGELU, F32, F32)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F32, F32)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F32, F32)
ADD_UNARY_SH_KERNELS(SIN, F32, F32)
ADD_UNARY_SH_KERNELS(COS, F32, F32)
ADD_UNARY_SH_KERNELS(EXP, F32, F32)
ADD_UNARY_SH_KERNELS(LOG, F32, F32)
ADD_UNARY_SH_KERNELS(NEG, F32, F32)
ADD_UNARY_SH_KERNELS(HSIGMOID, F32, F32)
ADD_UNARY_SH_KERNELS(MISH, F32, F32)
ADD_UNARY_SH_KERNELS(ROUND, F32, F32)
ADD_UNARY_SH_KERNELS(GELU, F32, F32)
ADD_UNARY_SH_KERNELS(HGELU, F32, F32)
ADD_UNARY_SH_KERNELS(SELU, F32, F32)
ADD_UNARY_SH_KERNELS(CELU, F32, F32)
ADD_UNARY_SH_KERNELS(RCP, F32, F32)
ADD_UNARY_SH_KERNELS(SIGN, F32, F32)
ADD_UNARY_SH_KERNELS(SOFTSIGN, F32, F32)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F32, F32)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F32, F32)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F32, F32)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F32, F32)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F32, F32)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F32, F32)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F32, F32)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F32, F32)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F32, F32)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F32, F32)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F32, F32)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F32, F32)
ADD_UNARY_SH_KERNELS(SIN, U8, U8)
ADD_UNARY_SH_KERNELS(COS, U8, U8)
ADD_UNARY_SH_KERNELS(EXP, U8, U8)
ADD_UNARY_SH_KERNELS(LOG, U8, U8)
ADD_UNARY_SH_KERNELS(NEG, U8, U8)
ADD_UNARY_SH_KERNELS(HSIGMOID, U8, U8)
ADD_UNARY_SH_KERNELS(MISH, U8, U8)
ADD_UNARY_SH_KERNELS(ROUND, U8, U8)
ADD_UNARY_SH_KERNELS(GELU, U8, U8)
ADD_UNARY_SH_KERNELS(HGELU, U8, U8)
ADD_UNARY_SH_KERNELS(SELU, U8, U8)
ADD_UNARY_SH_KERNELS(CELU, U8, U8)
ADD_UNARY_SH_KERNELS(RCP, U8, U8)
ADD_UNARY_SH_KERNELS(SIGN, U8, U8)
ADD_UNARY_SH_KERNELS(SOFTSIGN, U8, U8)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, U8, U8)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, U8, U8)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, U8, U8)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, U8, U8)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, U8, U8)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, U8, U8)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, U8, U8)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, U8, U8)
TENSOR_UNARY_KERNELS_3D(HGELU_OPERATION, UNARY_HGELU, U8, U8)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, U8, U8)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, U8, U8)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I32, I32)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I32, I32)
ADD_UNARY_SH_KERNELS(NEG, I32, I32)
};
#undef SIN_OPERATION
@ -170,6 +158,9 @@ static const struct {
#undef HGELU_OPERATION
#undef SELU_OPERATION
#undef CELU_OPERATION
#undef RCP_OPERATION
#undef SIGN_OPERATION
#undef SOFTSIGN_OPERATION
/*
* Kernel params
*/
@ -458,4 +449,8 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_gelu, UNARY_HGELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( selu, UNARY_SELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( celu, UNARY_CELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( rcp, UNARY_RCP )
REGISTER_ELTWISE_UNARY_BACKEND_CL( sign, UNARY_SIGN )
REGISTER_ELTWISE_UNARY_BACKEND_CL( softsign, UNARY_SOFTSIGN )
__END_DECLS

View File

@ -123,7 +123,7 @@ static vsi_status cal_gather_tensor_reshape_size
uint32_t i = 0;
vsi_size_t elementCnt = 1;
vsi_size_t outerCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
for (i = 0; i < dims_num - batch_dims; ++i)
{
@ -365,4 +365,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( gather, _setup )

View File

@ -111,7 +111,7 @@ static vsi_status cal_gather_nd_tensor_reshape_size
vsi_size_t *input_size = inputs[0]->attr.size;
uint32_t i = 0;
vsi_size_t elementCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
for(i = 0; i < dims_num; ++i)
@ -336,4 +336,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( gather_nd, _setup )

View File

@ -22,7 +22,6 @@
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -44,21 +43,20 @@ __BEGIN_DECLS
*/
typedef enum
{
INTERNAL_KERNEL_MEAN_VARI,
INTERNAL_KERNEL_SUMS,
INTERNAL_KERNEL_NORM,
} _internal_kernel_e;
#define KERNEL_SOURCE_1 "instance_normalization_u8"
#define KERNEL_SOURCE_2 "instance_normalization_f16"
#define KERNEL_SOURCE_2 "instance_normalization_f32"
#define KERNEL_SOURCE_3 "instance_normalization_i32"
#define KERNEL_SOURCE_4 "instance_normalization_f32"
// Add kernel hashtable here
#define HASH_INSTANCENORM_MEAN_VARI_KERNEL_NAME(SRC0_TYPE) \
CVIVANTE_NAMESPACE("cl.instance_norm_meanvari_"#SRC0_TYPE)
#define HASH_INSTANCENORM_SUMS_KERNEL_NAME(SRC0_TYPE) \
CVIVANTE_NAMESPACE("cl.instance_norm_sums_"#SRC0_TYPE)
#define HASH_INSTANCENORM_MEAN_VARI_KERNEL_2D_NAME(SRC0_TYPE) \
CVIVANTE_NAMESPACE("cl.instance_norm_meanvari_"#SRC0_TYPE"_2D")
#define HASH_INSTANCENORM_SUMS_KERNEL_2D_NAME(SRC0_TYPE) \
CVIVANTE_NAMESPACE("cl.instance_norm_sums_"#SRC0_TYPE"_2D")
#define HASH_INSTANCENORM_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl.instance_norm_"#SRC0_TYPE"to"#DST_TYPE)
@ -68,17 +66,17 @@ typedef enum
// Add kernel hashtable here
// mean vari
#define HASH_INSTANCENORM_MEAN_VARI_KEY(_input0_type, _output_type, _reshape_flag) \
#define HASH_INSTANCENORM_SUMS_KEY(_input0_type, _output_type, _reshape_flag) \
((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8))
#define TENSOR_INSTANCENORM_MEAN_VARI_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_INSTANCENORM_MEAN_VARI_KEY(IN0_TYPE, OUT_TYPE, 0), \
HASH_INSTANCENORM_MEAN_VARI_KERNEL_NAME(IN0_TYPE), \
#define TENSOR_INSTANCENORM_SUMS_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_INSTANCENORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 0), \
HASH_INSTANCENORM_SUMS_KERNEL_NAME(IN0_TYPE), \
SOURCE },
#define TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_INSTANCENORM_MEAN_VARI_KEY(IN0_TYPE, OUT_TYPE, 1), \
HASH_INSTANCENORM_MEAN_VARI_KERNEL_2D_NAME(IN0_TYPE), \
#define TENSOR_INSTANCENORM_SUMS_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_INSTANCENORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 1), \
HASH_INSTANCENORM_SUMS_KERNEL_2D_NAME(IN0_TYPE), \
SOURCE },
// normalization
@ -102,17 +100,15 @@ typedef struct
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _instancenorm_mean_vari_kernel_map[] =
static const _kernel_map_type _instancenorm_sums_kernel_map[] =
{
// Register kernel here
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( U8, F32, KERNEL_SOURCE_1 )
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( U8, F32, KERNEL_SOURCE_1 )
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( F16, F32, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( F16, F32, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( I32, F32, KERNEL_SOURCE_3 )
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 )
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( F32, F32, KERNEL_SOURCE_4 )
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( F32, F32, KERNEL_SOURCE_4 )
TENSOR_INSTANCENORM_SUMS_KERNELS( U8, F32, KERNEL_SOURCE_1 )
TENSOR_INSTANCENORM_SUMS_KERNELS_2D( U8, F32, KERNEL_SOURCE_1 )
TENSOR_INSTANCENORM_SUMS_KERNELS( F32, F32, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_SUMS_KERNELS_2D( F32, F32, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_SUMS_KERNELS( I32, F32, KERNEL_SOURCE_3 )
TENSOR_INSTANCENORM_SUMS_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 )
};
static const _kernel_map_type _instancenorm_kernel_map[] =
@ -123,22 +119,19 @@ static const _kernel_map_type _instancenorm_kernel_map[] =
TENSOR_INSTANCENORM_KERNELS( U8, F16, KERNEL_SOURCE_1 )
TENSOR_INSTANCENORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_1 )
TENSOR_INSTANCENORM_KERNELS( F16, F16, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_KERNELS( F32, F32, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_KERNELS_2D( F32, F32, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_KERNELS( I32, I32, KERNEL_SOURCE_3 )
TENSOR_INSTANCENORM_KERNELS_2D( I32, I32, KERNEL_SOURCE_3 )
TENSOR_INSTANCENORM_KERNELS( I32, F32, KERNEL_SOURCE_3 )
TENSOR_INSTANCENORM_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 )
TENSOR_INSTANCENORM_KERNELS( F32, F32, KERNEL_SOURCE_4 )
TENSOR_INSTANCENORM_KERNELS_2D( F32, F32, KERNEL_SOURCE_4 )
};
/*
* Kernel params
*/
static vx_param_description_t _instancenorm_mean_vari_kernel_param_def[] =
static vx_param_description_t _instancenorm_sums_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
@ -146,12 +139,9 @@ static vx_param_description_t _instancenorm_mean_vari_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _INSTANCENORM_MEAN_VARI_PARAM_NUM _cnt_of_array( _instancenorm_mean_vari_kernel_param_def )
#define _INSTANCENORM_SUMS_PARAM_NUM _cnt_of_array( _instancenorm_sums_kernel_param_def )
static vx_param_description_t _instancenorm_kernel_param_def[] =
{
@ -168,10 +158,6 @@ static vx_param_description_t _instancenorm_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _INSTANCENORM_PARAM_NUM _cnt_of_array( _instancenorm_kernel_param_def )
@ -179,7 +165,7 @@ static vx_param_description_t _instancenorm_kernel_param_def[] =
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
@ -244,7 +230,7 @@ final:
attr[1] = NULL;
}
return status;
} /* _instance_normalization_mean_vari_initializer() */
} /* _instance_normalization_sums_initializer() */
DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
(
@ -334,12 +320,12 @@ static vsi_status _query_kernel
switch( kernel_id )
{
case INTERNAL_KERNEL_MEAN_VARI:
initializer = _instancenorm_mean_vari_initializer;
kernel_map = _instancenorm_mean_vari_kernel_map;
kernel_map_size = _cnt_of_array( _instancenorm_mean_vari_kernel_map );
param_def = _instancenorm_mean_vari_kernel_param_def;
param_size = _INSTANCENORM_MEAN_VARI_PARAM_NUM;
case INTERNAL_KERNEL_SUMS:
initializer = _instancenorm_sums_initializer;
kernel_map = _instancenorm_sums_kernel_map;
kernel_map_size = _cnt_of_array( _instancenorm_sums_kernel_map );
param_def = _instancenorm_sums_kernel_param_def;
param_size = _INSTANCENORM_SUMS_PARAM_NUM;
break;
case INTERNAL_KERNEL_NORM:
initializer = _instancenorm_initializer;
@ -392,9 +378,9 @@ static vsi_nn_kernel_node_t _setup
)
{
#define INTERNAL_KERNEL_SIZE (1)
#define MEAN_VARI_INDEX (0)
#define SUMS_INDEX (0)
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t mean_vari_node_params[_INSTANCENORM_MEAN_VARI_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_param_t sums_node_params[_INSTANCENORM_SUMS_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_param_t node_params[_INSTANCENORM_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
vsi_nn_kernel_dtype_e in0_dtype = U8;
@ -407,18 +393,17 @@ static vsi_nn_kernel_node_t _setup
uint32_t hashkey = 0;
int32_t i = 0;
uint32_t rank = outputs[0]->attr.dim_num;
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
float eps = vsi_nn_kernel_param_get_float32( params, "eps" ) /
(input_scale * input_scale);
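/* eps above is pre-divided by input_scale^2 so that it matches a variance computed
 * from raw quantized sums (assumption: the sums kernel accumulates unscaled values). */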
size_t width = inputs[0]->attr.size[0];
size_t height = inputs[0]->attr.size[1];
int32_t reshape_flg = outputs[0]->attr.size[1] * outputs[0]->attr.size[2] < GPU_TENSOR_MAX_WIDTH
&& rank > 2;
int32_t group_num = (int32_t)(width + 15) / 16;
int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]);
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
int32_t output_zp = vsi_nn_get_tensor_zero_point(outputs[0]);
float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
float in_fl_scale = 1.0f, out_fl_scale = 1.0;
float dim_ratio = (float)1.0 / (float)(width * height);
float inv_multiplier = (float)1.0 / (float)(width * height);
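/* Reciprocal of the per-instance element count (width * height); presumably used by
 * the norm kernel to turn the accumulated sums into mean and variance. */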
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
@ -443,15 +428,21 @@ static vsi_nn_kernel_node_t _setup
attr.size[2] = 1;
attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
attr.dim_num = 4;
tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr );
tensors[SUMS_INDEX] = vsi_nn_CreateTensor( graph, &attr );
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
in0_dtype = in0_dtype == F16 ? F32 : in0_dtype;
in0_dtype = in0_dtype == I8 ? I32 : in0_dtype;
in0_dtype = in0_dtype == I16 ? I32 : in0_dtype;
out_dtype = out_dtype == F16 ? F32 : out_dtype;
out_dtype = out_dtype == I8 ? I32 : out_dtype;
out_dtype = out_dtype == I16 ? I32 : out_dtype;
hashkeys[MEAN_VARI_INDEX]= HASH_INSTANCENORM_MEAN_VARI_KEY( in0_dtype, F32, reshape_flg );
hashkeys[SUMS_INDEX]= HASH_INSTANCENORM_SUMS_KEY( in0_dtype, F32, reshape_flg );
hashkey = HASH_INSTANCENORM_KEY( in0_dtype, out_dtype, reshape_flg );
status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI );
status = _query_kernel( ikernels[SUMS_INDEX], hashkeys[SUMS_INDEX], INTERNAL_KERNEL_SUMS );
if ( VSI_SUCCESS != status )
{
goto final;
@ -497,37 +488,31 @@ static vsi_nn_kernel_node_t _setup
}
// Sums
{
node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] );
node = vsi_nn_kernel_create_node( graph, ikernels[SUMS_INDEX] );
if (node)
{
uint32_t index = 0;
if (reshape_flg)
{
mean_vari_node_params[index++] = rs_input;
sums_node_params[index++] = rs_input;
}
else
{
mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t;
sums_node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t;
}
mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t;
mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg );
mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp );
mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_fl_scale );
mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
sums_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUMS_INDEX]->t;
sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg );
sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
status = vsi_nn_kernel_node_pass_param( node, mean_vari_node_params,
_INSTANCENORM_MEAN_VARI_PARAM_NUM );
status = vsi_nn_kernel_node_pass_param( node, sums_node_params,
_INSTANCENORM_SUMS_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &mean_vari_node_params[2] );
vsi_nn_kernel_scalar_release( &mean_vari_node_params[3] );
vsi_nn_kernel_scalar_release( &mean_vari_node_params[4] );
vsi_nn_kernel_scalar_release( &mean_vari_node_params[5] );
vsi_nn_kernel_scalar_release( &mean_vari_node_params[6] );
vsi_nn_kernel_scalar_release( &mean_vari_node_params[7] );
vsi_nn_kernel_scalar_release( &mean_vari_node_params[8] );
vsi_nn_kernel_scalar_release( &sums_node_params[2] );
vsi_nn_kernel_scalar_release( &sums_node_params[3] );
vsi_nn_kernel_scalar_release( &sums_node_params[4] );
vsi_nn_kernel_scalar_release( &sums_node_params[5] );
vsi_nn_kernel_node_release( &node );
}
}
@ -562,7 +547,7 @@ static vsi_nn_kernel_node_t _setup
{
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t;
}
node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t;
node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUMS_INDEX]->t;
if (reshape_flg)
{
node_params[index++] = rs_output;
@ -573,15 +558,11 @@ static vsi_nn_kernel_node_t _setup
}
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &in_fl_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &output_zp );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &out_fl_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &dim_ratio );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inv_multiplier );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &group_num );
status = vsi_nn_kernel_node_pass_param( node, node_params,
@ -595,10 +576,6 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
vsi_nn_kernel_scalar_release( &node_params[14] );
vsi_nn_kernel_scalar_release( &node_params[15] );
vsi_nn_kernel_scalar_release( &node_params[16] );
}
}

View File

@ -0,0 +1,312 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "maxpoolwithargmax"
#define KERNEL_SOURCE_2 "maxpoolwithargmax_2d"
// Add kernel hashtable here
#define MAXPOOLWITHARGMAX_HASH_KEY( IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1, _image_2d) \
(( IN_DTYPE << 24 ) | ( OUT_DTYPE0 << 20) | ( OUT_DTYPE1 << 12) | (_image_2d))
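/* The lookup key packs the input dtype, both output dtypes and the 2D flag into a
 * single 32-bit value; it only needs to be unique within this table. */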
#define HASH_MAXPOOLWITHARGMAX_KERNELS( IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1) \
{ MAXPOOLWITHARGMAX_HASH_KEY(IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1, 0), \
CVIVANTE_NAMESPACE("cl.maxpoolwithargmax_"#IN_DTYPE"to"#OUT_DTYPE0"_"#OUT_DTYPE1), \
KERNEL_SOURCE_1 },
#define HASH_MAXPOOLWITHARGMAX_KERNELS_2D( IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1) \
{ MAXPOOLWITHARGMAX_HASH_KEY(IN_DTYPE, OUT_DTYPE0, OUT_DTYPE1, 1), \
CVIVANTE_NAMESPACE("cl.maxpoolwithargmax_"#IN_DTYPE"to"#OUT_DTYPE0"_"#OUT_DTYPE1"_2D"), \
KERNEL_SOURCE_2 },
static const struct {
uint32_t key;
char* function_name;
const char* source_name;
} maxpoolwithargmax_map[] =
{
HASH_MAXPOOLWITHARGMAX_KERNELS(F32, F32, I32)
HASH_MAXPOOLWITHARGMAX_KERNELS(BF16, BF16, I32)
HASH_MAXPOOLWITHARGMAX_KERNELS(U32, U32, I32)
HASH_MAXPOOLWITHARGMAX_KERNELS(I32, I32, I32)
HASH_MAXPOOLWITHARGMAX_KERNELS_2D(F32, F32, I32)
HASH_MAXPOOLWITHARGMAX_KERNELS_2D(BF16, BF16, I32)
HASH_MAXPOOLWITHARGMAX_KERNELS_2D(U32, U32, I32)
HASH_MAXPOOLWITHARGMAX_KERNELS_2D(I32, I32, I32)
};
/*
* Kernel params
*/
static vx_param_description_t _maxpoolwithargmax_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _MAXPOOLWITHARGMAX_PARAM_NUM _cnt_of_array( _maxpoolwithargmax_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_maxpoolwithargmax_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vx_status status = VX_FAILURE;
vx_tensor output = (vx_tensor)param[1];
vsi_nn_kernel_tensor_attr_t * attr_out = NULL;
vsi_size_array_t * out_shape = NULL;
attr_out = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( attr_out, "vsi_nn_kernel_tensor_attr_create fail.", final );
out_shape = attr_out->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = out_shape->data[1];
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
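/* One work-item per output element; the x dimension is rounded up to a multiple of
 * 4 work-items by gpu_align_p2(). */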
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (attr_out)
{
vsi_nn_kernel_tensor_attr_release(&attr_out);
}
return status;
} /* _maxpoolwithargmax_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t is_2d
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input_dtype = U8;
vsi_nn_kernel_dtype_e output0_dtype = U8;
vsi_nn_kernel_dtype_e output1_dtype = I32;
uint32_t key = 0;
int32_t i = 0;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output0_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
output1_dtype = vsi_nn_kernel_map_dtype( outputs[1]->attr.dtype.vx_type );
if (input_dtype == U8)
{
input_dtype = U32;
}
if (input_dtype == I8 || input_dtype == I16)
{
input_dtype = I32;
}
if (input_dtype == F16)
{
input_dtype = F32;
}
if (output0_dtype == U8)
{
output0_dtype = U32;
}
if (output0_dtype == I8 || output0_dtype == I16)
{
output0_dtype = I32;
}
if (output0_dtype == F16)
{
output0_dtype = F32;
}
key = MAXPOOLWITHARGMAX_HASH_KEY( input_dtype, output0_dtype, output1_dtype, is_2d);
for ( i = 0; i < _cnt_of_array(maxpoolwithargmax_map); i ++ )
{
if ( maxpoolwithargmax_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(maxpoolwithargmax_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", maxpoolwithargmax_map[i].function_name );
kernel->info.parameters = _maxpoolwithargmax_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _maxpoolwithargmax_kernel_param_def );
kernel->info.initialize = _maxpoolwithargmax_initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
maxpoolwithargmax_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
maxpoolwithargmax_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_MAXPOOLWITHARGMAX_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
int32_t ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x");
int32_t ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y");
int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x");
int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y");
int32_t pad_x = vsi_nn_kernel_param_get_int32(params, "pad_left");
int32_t pad_y = vsi_nn_kernel_param_get_int32(params, "pad_top");
int32_t image_2d = inputs[0]->attr.dim_num == 2 ? 1 : 0;
int32_t width = (int32_t)inputs[0]->attr.size[0];
int32_t height = (int32_t)inputs[0]->attr.size[1];
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float scale_value = 1.0f;
float tail_value = 0.0f;
if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
inputs[0]->attr.dim_num )
|| !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num )
|| !vsi_nn_kernel_gpu_check_shape( outputs[1]->attr.size,
outputs[1]->attr.dim_num ))
{
return NULL;
}
scale_value = inputScale / outputScale;
tail_value = outputTail - inputTail * inputScale / outputScale;
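/* Requantization of the pooled values: q_out = q_in * scale_value + tail_value,
 * which follows from real = (q_in - in_zp) * in_scale and
 * q_out = real / out_scale + out_zp. */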
status = _query_kernel( kernel, inputs, outputs, image_2d );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 3;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _MAXPOOLWITHARGMAX_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_y );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale_value );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &tail_value );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _MAXPOOLWITHARGMAX_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( maxpoolwithargmax, _setup )

View File

@ -0,0 +1,303 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
#define MOD_KERNEL_SOURCE_NAME "mod"
#define MOD_HASH_KEY(_input0_type, _input1_type, _output_type, _image_2d) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d))
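/* Key layout: input0 dtype, input1 dtype, output dtype and the 2D flag, one byte each. */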
#define MOD_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
{ MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \
CVIVANTE_NAMESPACE("cl.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE), \
MOD_KERNEL_SOURCE_NAME},
#define MOD_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
{ MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \
CVIVANTE_NAMESPACE("cl.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE"_2D"), \
MOD_KERNEL_SOURCE_NAME },
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _mod_kernel_map[] =
{
// Register kernel here
MOD_KERNELS( F32, F32, F32 )
MOD_KERNELS( I32, I32, I32 )
MOD_KERNELS( I32, I32, U8 )
MOD_KERNELS( U8, U8, U8 )
MOD_KERNELS( U8, I32, U8 )
MOD_KERNELS_2D( F32, F32, F32 )
MOD_KERNELS_2D( I32, I32, I32 )
MOD_KERNELS_2D( I32, I32, U8 )
MOD_KERNELS_2D( U8, U8, U8 )
MOD_KERNELS_2D( U8, I32, U8 )
};
/*
* Kernel params
*/
static vx_param_description_t _mod_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _MOD_PARAM_NUM _cnt_of_array( _mod_kernel_param_def )
#define MOD_QUANT_PARAM_NUM _cnt_of_array( _mod_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_mod_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_status status = VSI_FAILURE;
vx_tensor output = (vx_tensor)param[2];
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
output_shape = output_attr->shape;
gpu_param.dim = output_shape->size < 3 ? 2 : 3;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1];
gpu_param.global_size[2] = output_shape->size > 2 ? output_shape->data[2] : 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (output_attr)
{
vsi_nn_kernel_tensor_attr_release(&output_attr);
}
return status;
} /* _mod_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool image_2d
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in0_dtype;
vsi_nn_kernel_dtype_e in1_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _mod_kernel_map;
size_t kernel_map_size = _cnt_of_array( _mod_kernel_map );
vx_param_description_t * param_def = _mod_kernel_param_def;
size_t param_def_size = _cnt_of_array( _mod_kernel_param_def );
vx_kernel_initialize_f initializer = _mod_initializer;
uint32_t key = 0;
uint32_t i = 0;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (F16 == in0_dtype)
{
in0_dtype = F32;
}
else if (I16 == in0_dtype || I8 == in0_dtype)
{
in0_dtype = I32;
}
if (F16 == in1_dtype)
{
in1_dtype = F32;
}
else if (I16 == in1_dtype || I8 == in1_dtype)
{
in1_dtype = I32;
}
if (F16 == out_dtype)
{
out_dtype = F32;
}
else if (I16 == out_dtype || I8 == out_dtype)
{
out_dtype = I32;
}
key = MOD_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d);
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_MOD_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_bool image_2d = FALSE;
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float input0Scale = vsi_nn_get_tensor_scale(inputs[0]);
float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
int32_t isfmod = vsi_nn_kernel_param_get_int32(params, "isfmod");
outputScale = 1.0f / outputScale;
input0Tail = -(input0Tail * input0Scale);
input1Tail = -(input1Tail * input1Scale);
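/* Zero points are folded into additive tails so the shader can dequantize with a
 * single multiply-add: real = q * scale + tail, where tail = -zp * scale; the result
 * is requantized with the inverted outputScale plus outputTail. */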
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
image_2d = (outputs[0]->attr.dim_num == 2);
status = _query_kernel( kernel, inputs, outputs, image_2d);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
size_t node_params_num = MOD_QUANT_PARAM_NUM;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _MOD_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &isfmod );
node_params[4] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale );
node_params[5] = vsi_nn_kernel_scalar_create( graph, F32, &input0Tail );
node_params[6] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale );
node_params[7] = vsi_nn_kernel_scalar_create( graph, F32, &input1Tail );
node_params[8] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
node_params[9] = vsi_nn_kernel_scalar_create( graph, F32, &outputTail );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( mod, _setup )

View File

@ -48,7 +48,7 @@ __BEGIN_DECLS
#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE ) \
{ ROI_ALIGN_HASH_KEY( IN0_DTYPE, IN1_DTYPE, IN2_DTYPE, OUT_DTYPE, 0 ), \
CVIVANTE_NAMESPACE("cl.roi_align_"STR(IN0_DTYPE)"to"STR(OUT_DTYPE)), \
CVIVANTE_NAMESPACE("cl.roi_align_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \
_ROI_ALIGN_KERNEL_SOURCE(IN0_DTYPE) }
typedef struct
@ -61,6 +61,7 @@ typedef struct
static const _kernel_map_type _roi_align_kernel_map[] =
{
PACK_KERNEL_MAP(F32, F32, I32, F32),
PACK_KERNEL_MAP(U8, U16, I32, U8),
};
@ -82,20 +83,28 @@ static vx_param_description_t _roi_align_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ROI_ALIGN_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def )
#define SCALAR_SPATIAL_X_SCALE (4)
#define SCALAR_SPATIAL_Y_SCALE (5)
#define SCALAR_INPUT_WIDTH (6)
#define SCALAR_INPUT_HEIGHT (7)
#define SCALAR_RCP_OF_OUTPUT_WIDTH (8)
#define SCALAR_RCP_OF_OUTPUT_HEIGHT (9)
#define SCALAR_SAMPLING_X_RATIO (10)
#define SCALAR_SAMPLING_Y_RATIO (11)
#define SCALAR_DEPTH (12)
#define SCALAR_INPUT_SCALE (4)
#define SCALAR_INPUT_TAIL (5)
#define SCALAR_OUTPUT_SCALE (6)
#define SCALAR_OUTPUT_ZP (7)
#define SCALAR_SPATIAL_X_SCALE (8)
#define SCALAR_SPATIAL_Y_SCALE (9)
#define SCALAR_INPUT_WIDTH (10)
#define SCALAR_INPUT_HEIGHT (11)
#define SCALAR_RCP_OF_OUTPUT_WIDTH (12)
#define SCALAR_RCP_OF_OUTPUT_HEIGHT (13)
#define SCALAR_SAMPLING_X_RATIO (14)
#define SCALAR_SAMPLING_Y_RATIO (15)
#define SCALAR_DEPTH (16)
#define ROI_ALIGN_PARAM_NUM 13
#define ROI_ALIGN_PARAM_NUM 17
#define ROI_ALIGN_QUANT_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def )
/*
@ -185,6 +194,7 @@ static vsi_status _query_kernel
in0_dtype = in0_dtype == F16 ? F32 : in0_dtype;
in1_dtype = in1_dtype == F16 ? F32 : in1_dtype;
out_dtype = out_dtype == F16 ? F32 : out_dtype;
key = ROI_ALIGN_HASH_KEY( in0_dtype, in1_dtype, in2_dtype, out_dtype, image_2d );
@ -241,8 +251,14 @@ static vsi_nn_kernel_node_t _setup
float height_ratio = vsi_nn_kernel_param_get_float32( params, "height_ratio" );
int32_t width_sample_num = vsi_nn_kernel_param_get_int32( params, "width_sample_num" );
int32_t height_sample_num = vsi_nn_kernel_param_get_int32( params, "height_sample_num" );
float width_scale = 1.0f / width_ratio;
float height_scale = 1.0f / height_ratio;
float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);
float input_tail = -(input_zp * input_scale);
float roi_scale = vsi_nn_get_tensor_scale(inputs[1]);
float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]);
float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float width_scale = roi_scale / width_ratio;
float height_scale = roi_scale / height_ratio;
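/* The ROI coordinates may themselves be quantized (see the U8/U16 kernel added above),
 * so the ROI scale is folded into the spatial scales instead of dequantizing the
 * coordinates separately. */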
float in_width = (float)(inputs[0]->attr.size[0]);
float in_height = (float)(inputs[0]->attr.size[1]);
float rcp_of_out_width = 1.0f / (float)(outputs[0]->attr.size[0]);
@ -287,6 +303,10 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_pack_io( node_params, _ROI_ALIGN_PARAM_NUM,
reshape_tensors, input_num, &reshape_tensors[_INPUT_NUM], output_num );
node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input_tail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
node_params[SCALAR_SPATIAL_X_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &width_scale );
node_params[SCALAR_SPATIAL_Y_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &height_scale );
node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &in_width );
@ -299,6 +319,10 @@ static vsi_nn_kernel_node_t _setup
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SPATIAL_X_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_SPATIAL_Y_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] );

View File

@ -115,7 +115,7 @@ static vsi_status cal_scatter_nd_tensor_reshape_size
return status;
}
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
for(i = 0; i < dims_num; ++i)
@ -333,4 +333,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( scatter_nd, _setup )

View File

@ -108,7 +108,7 @@ static vsi_status cal_scatter_nd_update_tensor_reshape_size
return status;
}
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
for(i = 0; i < dims_num; ++i)
@ -373,4 +373,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( scatter_nd_update, _setup )

View File

@ -22,7 +22,6 @@
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -49,6 +48,13 @@ __BEGIN_DECLS
CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
_TOPK_KERNEL_SOURCE }
#define TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) )
#define PACK_ODD_EVEN_SORT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
CVIVANTE_NAMESPACE("cl.topk_odd_even_sort_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
"topk_odd_even_sort" }
typedef struct
{
uint32_t key;
@ -84,6 +90,14 @@ static const _kernel_map_type _topk_kernel_map[] =
PACK_KERNEL_MAP( I32, I32, 6 ),
};
static const _kernel_map_type _topk_odd_even_sort_kernel_map[] =
{
// Register kernel here
PACK_ODD_EVEN_SORT_KERNEL_MAP( F32, F32 ),
PACK_ODD_EVEN_SORT_KERNEL_MAP( U32, U32 ),
PACK_ODD_EVEN_SORT_KERNEL_MAP( I32, I32 ),
};
/*
* Kernel params
*/
@ -99,6 +113,19 @@ static vx_param_description_t _topk_kernel_param_def[] =
#define _TOPK_PARAM_NUM _cnt_of_array( _topk_kernel_param_def )
#define SCALAR_INPUT_NUM_STAGES (3)
#define SCALAR_INPUT_WIDTH (4)
static vx_param_description_t _topk_odd_even_sort_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _TOPK_ODD_EVEN_SORT_PARAM_NUM _cnt_of_array( _topk_odd_even_sort_kernel_param_def )
#define SCALAR_INPUT_SIZE (5)
/*
* Kernel initializer
*/
@ -140,9 +167,47 @@ DEF_KERNEL_INITIALIZER(_topk_initializer)
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(input_attr);
#undef SAFE_FREE_TENSOR_ATTR
return status;
} /* _topk_initializer() */
DEF_KERNEL_INITIALIZER(_topk_odd_even_sort_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
2,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * in_shape = NULL;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
in_shape = input_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.local_size[0] = 32;
gpu_param.local_size[1] = 1;
gpu_param.global_size[0] = 32;
gpu_param.global_size[1] = in_shape->data[1];
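/* One 32-wide work-group per row: local and global x size are both 32, and the y
 * dimension iterates over the rows to be sorted. */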
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(input_attr);
#undef SAFE_FREE_TENSOR_ATTR
return status;
} /* _topk_odd_even_sort_initializer() */
/*
* Query kernel
@ -215,6 +280,72 @@ static vsi_status _query_kernel
return status;
} /* _query_kernel() */
static vsi_status _query_odd_even_sort_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _topk_odd_even_sort_kernel_map;
size_t kernel_map_size = _cnt_of_array( _topk_odd_even_sort_kernel_map );
vx_param_description_t * param_def = _topk_odd_even_sort_kernel_param_def;
vx_kernel_initialize_f initializer = _topk_odd_even_sort_initializer;
#define _PACK_SELECT_KEY( in_type, out_type ) \
( (in_type) | (out_type << 8) )
uint32_t key = 0;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
{
case _PACK_SELECT_KEY(F32, F32):
case _PACK_SELECT_KEY(F16, F16):
key = TOPK_ODD_EVEN_SORT_HASH_KEY( F32, F32 );
break;
case _PACK_SELECT_KEY(U32, U32):
case _PACK_SELECT_KEY(U16, U16):
case _PACK_SELECT_KEY(U8, U8):
key = TOPK_ODD_EVEN_SORT_HASH_KEY( U32, U32 );
break;
case _PACK_SELECT_KEY(I32, I32):
case _PACK_SELECT_KEY(I16, I16):
case _PACK_SELECT_KEY(I8, I8):
key = TOPK_ODD_EVEN_SORT_HASH_KEY( I32, I32 );
break;
default:
break;
}
#undef _PACK_SELECT_KEY
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _topk_odd_even_sort_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_odd_even_sort_kernel() */
static vsi_nn_kernel_node_t _setup
(
@ -228,16 +359,19 @@ static vsi_nn_kernel_node_t _setup
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_TOPK_PARAM_NUM];
vsi_nn_kernel_node_param_t node_params[_TOPK_ODD_EVEN_SORT_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
vsi_size_t block_size = inputs[0]->attr.size[0];
vsi_size_t block_num = 1;
uint32_t i = 0;
vsi_nn_tensor_t* rs_tensors[3] = { NULL };
vsi_nn_tensor_t* rs_tensors[5] = { NULL };
vsi_nn_tensor_attr_t attr;
vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }};
int32_t width = (int32_t)block_size;
int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k");
int32_t num_stages = (int32_t)ceil(log10(block_size / 2.0f) / log10(2.0f));
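/* num_stages = ceil(log2(block_size / 2)); inputs that would need 7 or more sorting
 * stages fall back to the odd-even sort path below. */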
vsi_bool is_odd_even_sort = FALSE;
size_t param_num = _TOPK_PARAM_NUM;
for (i = 1; i < inputs[0]->attr.dim_num; i ++)
{
@ -257,26 +391,58 @@ static vsi_nn_kernel_node_t _setup
rs_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shape[0], 2 );
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shape[1], 2 );
rs_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[1], shape[1], 2 );
status = _query_kernel( kernel, inputs, outputs, num_stages );
if (num_stages < 7)
{
status = _query_kernel( kernel, inputs, outputs, num_stages );
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shape[1], 2 );
rs_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[1], shape[1], 2 );
}
else
{
status = _query_odd_even_sort_kernel( kernel, inputs, outputs );
is_odd_even_sort = TRUE;
param_num = _TOPK_ODD_EVEN_SORT_PARAM_NUM;
memcpy( &attr, &(rs_tensors[0]->attr), sizeof(vsi_nn_tensor_attr_t) );
rs_tensors[1] = vsi_nn_CreateTensor( graph, &attr );
attr.dtype.vx_type = VSI_NN_TYPE_INT32;
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
rs_tensors[2] = vsi_nn_CreateTensor( graph, &attr );
rs_tensors[3] = vsi_nn_reshape_tensor( graph,
outputs[0], shape[1], 2 );
rs_tensors[4] = vsi_nn_reshape_tensor( graph,
outputs[1], shape[1], 2 );
input_num = 3;
}
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _TOPK_PARAM_NUM,
rs_tensors, input_num, &rs_tensors[1], output_num );
vsi_nn_kernel_node_pack_io( node_params, param_num,
rs_tensors, input_num, &rs_tensors[input_num], output_num );
/* Pass parameters to node. */
node_params[SCALAR_INPUT_NUM_STAGES] = vsi_nn_kernel_scalar_create(
graph, I32, &num_stages );
node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create(
graph, I32, &width );
status = vsi_nn_kernel_node_pass_param( node, node_params, _TOPK_PARAM_NUM );
if (is_odd_even_sort)
{
node_params[SCALAR_INPUT_SIZE] = vsi_nn_kernel_scalar_create(
graph, I32, &width );
}
else
{
node_params[SCALAR_INPUT_NUM_STAGES] = vsi_nn_kernel_scalar_create(
graph, I32, &num_stages );
node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create(
graph, I32, &width );
}
status = vsi_nn_kernel_node_pass_param( node, node_params, param_num );
CHECK_STATUS_FAIL_GOTO( status, final );
}
}
@ -284,13 +450,25 @@ final:
vsi_safe_release_tensor(rs_tensors[0]);
vsi_safe_release_tensor(rs_tensors[1]);
vsi_safe_release_tensor(rs_tensors[2]);
if (node_params[SCALAR_INPUT_NUM_STAGES])
vsi_safe_release_tensor(rs_tensors[3]);
vsi_safe_release_tensor(rs_tensors[4]);
if (is_odd_even_sort)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_NUM_STAGES] );
if (node_params[SCALAR_INPUT_SIZE])
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SIZE] );
}
}
if (node_params[SCALAR_INPUT_WIDTH])
else
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] );
if (node_params[SCALAR_INPUT_NUM_STAGES])
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_NUM_STAGES] );
}
if (node_params[SCALAR_INPUT_WIDTH])
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] );
}
}
return node;

View File

@ -0,0 +1,260 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _CPU_ARG_NUM (3)
#define _CPU_INPUT_NUM (1)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.cumsum")
DEF_KERNEL_EXECUTOR(_cumsum_exec)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VX_FAILURE;
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
float * buffer[2] = { NULL };
size_t out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
int32_t i = 0;
int32_t axisSize = 1, innerSize = 1, outerSize = 1;
int32_t axis = 0, exclusive = 0, reverse = 0;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &exclusive);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &reverse);
CHECK_STATUS_FAIL_GOTO(status, final );
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
buffer[1] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final );
memset( buffer[1], 0, out_elements * sizeof(float) );
{
int32_t dims_num = (int32_t)attr[1]->shape->size;
int32_t inner = 0;
int32_t outer = 0;
for(i = 0; i < axis; ++i)
{
innerSize *= (int32_t)attr[0]->shape->data[i];
}
axisSize = (int32_t)attr[0]->shape->data[i++];
for(; i < dims_num; ++i)
{
outerSize *= (int32_t)attr[0]->shape->data[i];
}
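/* The tensor is viewed as outerSize x axisSize x innerSize blocks, with element
(outer, i, inner) at flat index (outer * axisSize + i) * innerSize + inner.
The branches below implement the cumsum variants: exclusive shifts the running
sum by one position along the axis (the first/last output is 0), and reverse
accumulates from the end of the axis toward the start. */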
for ( outer = 0; outer < outerSize; ++outer)
{
for ( inner = 0; inner < innerSize; ++inner)
{
float sum = .0f;
if (exclusive && reverse)
{
int32_t idx_out = (outer * axisSize + axisSize - 1) * innerSize + inner;
buffer[1][idx_out] = sum;
for (i = axisSize - 1; i > 0; i--)
{
int32_t idx = (outer * axisSize + i) * innerSize + inner;
float value = buffer[0][idx];
idx_out = (outer * axisSize + i - 1) * innerSize + inner;
sum += value;
buffer[1][idx_out] = sum;
}
}
else if (exclusive)
{
int32_t idx_out = outer * axisSize * innerSize + inner;
buffer[1][idx_out] = sum;
for (i = 0; i < axisSize - 1; ++i)
{
int32_t idx = (outer * axisSize + i) * innerSize + inner;
float value = buffer[0][idx];
idx_out = (outer * axisSize + i + 1) * innerSize + inner;
sum += value;
buffer[1][idx_out] = sum;
}
}
else if (reverse)
{
for (i = axisSize - 1; i >= 0; i--)
{
int32_t idx = (outer * axisSize + i) * innerSize + inner;
float value = buffer[0][idx];
sum += value;
buffer[1][idx] = sum;
}
}
else
{
for (i = 0; i < axisSize; ++i)
{
// i * innerSize + inner + outer * innerSize * axisSize
int32_t idx = (outer * axisSize + i) * innerSize + inner;
float value = buffer[0][idx];
sum += value;
buffer[1][idx] = sum;
}
}
}
}
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
buffer[1], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
for ( i = 0; i < 2; i ++ )
{
if ( buffer[i] )
{
free( buffer[i] );
}
}
for ( i = 0; i < _CPU_IO_NUM; i ++ )
{
if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
}
return status;
} /* _cumsum_exec() */
/*
* Kernel params
*/
static vx_param_description_t _cumsum_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _CUMSUM_PARAM_NUM _cnt_of_array( _cumsum_kernel_param_def )
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel
)
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _cumsum_exec;
kernel->info.parameters = _cumsum_kernel_param_def;
kernel->info.numParams = _CUMSUM_PARAM_NUM;
return VSI_SUCCESS;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VX_FAILURE;
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
status = _query_kernel( inputs, outputs, kernel );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 2;
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
int32_t exclusive = vsi_nn_kernel_param_get_int32( params, "exclusive" );
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis );
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &exclusive );
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
CHECK_STATUS( status );
vsi_nn_kernel_scalar_release( &backend_params[2] );
vsi_nn_kernel_scalar_release( &backend_params[3] );
vsi_nn_kernel_scalar_release( &backend_params[4] );
}
else
{
status = VSI_FAILURE;
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( cumsum, _setup )

View File

@ -50,6 +50,9 @@ typedef enum
UNARY_HGELU,
UNARY_SELU,
UNARY_CELU,
UNARY_RCP,
UNARY_SIGN,
UNARY_SOFTSIGN,
} unary_type_e;
@ -145,6 +148,21 @@ static float celu_eval(float x, float alpha)
return positive + negative;
}
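/* Reference implementations for the newly added unary ops:
rcp(x) = 1 / x, sign(x) in {-1, 0, 1}, and softsign(x) = x / (1 + |x|). */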
static float rcp_eval(float x)
{
return 1.0f / x;
}
static float sign_eval(float x)
{
return x > 0 ? 1.0f : x < 0 ? -1.0f : 0.0f;
}
static float softsign_eval(float x)
{
return x / (1.0f + vsi_abs(x));
}
DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
(
vsi_nn_kernel_node_t node,
@ -227,6 +245,15 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
case UNARY_CELU:
data = celu_eval(data, alpha);
break;
case UNARY_RCP:
data = rcp_eval(data);
break;
case UNARY_SIGN:
data = sign_eval(data);
break;
case UNARY_SOFTSIGN:
data = softsign_eval(data);
break;
default:
break;
}
@ -360,4 +387,7 @@ REGISTER_ELTWISE_UNARY_BACKEND_CPU( round, UNARY_ROUND )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_gelu, UNARY_HGELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( selu, UNARY_SELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( celu, UNARY_CELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( rcp, UNARY_RCP )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( sign, UNARY_SIGN )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( softsign, UNARY_SOFTSIGN )

View File

@ -0,0 +1,284 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _CPU_ARG_NUM (8)
#define _CPU_INPUT_NUM (1)
#define _CPU_OUTPUT_NUM (2)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.maxpoolwithargmax")
#define FP32_MIN (-3.4e38f)
/*
* Kernel params
*/
static vx_param_description_t _maxpoolwithargmax_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
// Add kernel parameters here
};
#define _MAXPOOLWITHARGMAX_PARAM_NUM _cnt_of_array( _maxpoolwithargmax_kernel_param_def )
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_maxpoolwithargmax_exec)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VX_FAILURE;
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
float * buffer[_CPU_IO_NUM] = { NULL };
size_t out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
int32_t ksize_x = 0, ksize_y = 0, stride_x = 0, stride_y = 0;
int32_t pad_left = 0, pad_right = 0, pad_top = 0, pad_bottom = 0;
int32_t i = 0;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
tensors[2] = (vsi_nn_kernel_tensor_t)param[2];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &ksize_x);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &ksize_y);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &stride_x);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &stride_y);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &pad_left);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &pad_right);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &pad_top);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &pad_bottom);
CHECK_STATUS_FAIL_GOTO(status, final );
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
buffer[1] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final );
memset( buffer[1], 0, out_elements * sizeof(float) );
buffer[2] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final );
memset( buffer[2], 0, out_elements * sizeof(float) );
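/* buffer[1] receives the pooled max values and buffer[2] the argmax: the flat
index into the input tensor of the element that produced each maximum. Padding
is handled by clamping the pooling window to the valid input region. */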
{
int32_t dims_num = (int32_t)attr[1]->shape->size;
int32_t batch = dims_num > 3 ? (int32_t)attr[1]->shape->data[3] : 1;
int32_t depth = dims_num > 2 ? (int32_t)attr[1]->shape->data[2] : 1;
int32_t height_o = (int32_t)attr[1]->shape->data[1];
int32_t width_o = (int32_t)attr[1]->shape->data[0];
int32_t width = (int32_t)attr[0]->shape->data[0];
int32_t height = (int32_t)attr[0]->shape->data[1];
int32_t b = 0, d = 0, j = 0;
int32_t output_base = 0;
int32_t input_base = 0;
for (b = 0; b < batch; b++)
{
for (d = 0; d < depth; d++)
{
output_base = b * depth * height_o * width_o + d * height_o * width_o;
input_base = b * depth * height * width + d * height * width;
for (j = 0; j < height_o; j++)
{
for (i = 0; i < width_o; i++)
{
int32_t hstart = j * stride_y - pad_top;
int32_t wstart = i * stride_x - pad_left;
int32_t hend = vsi_nn_min(hstart + ksize_y, height);
int32_t wend = vsi_nn_min(wstart + ksize_x, width);
int32_t pool_index = output_base + j * width_o + i;
int32_t h = 0, w = 0;
int32_t index_max = 0;
float value_max = (float)FP32_MIN;
hstart = vsi_nn_max(hstart, 0);
wstart = vsi_nn_max(wstart, 0);
for (h = hstart; h < hend; ++ h)
{
for (w = wstart; w < wend; ++ w)
{
int32_t index = input_base + h * width + w;
float data = buffer[0][index];
if (data > value_max)
{
value_max = data;
index_max = index;
}
}
}
buffer[1][pool_index] = value_max;
buffer[2][pool_index] = (float)index_max;
}
}
}
}
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
buffer[1], out_elements );
status |= vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2],
buffer[2], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
for ( i = 0; i < _CPU_IO_NUM; i ++ )
{
if ( buffer[i] )
{
free( buffer[i] );
}
}
for ( i = 0; i < _CPU_IO_NUM; i ++ )
{
if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
}
return status;
} /* _maxpoolwithargmax_exec() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _maxpoolwithargmax_exec;
kernel->info.parameters = _maxpoolwithargmax_kernel_param_def;
kernel->info.numParams = _MAXPOOLWITHARGMAX_PARAM_NUM;
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_MAXPOOLWITHARGMAX_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
int32_t ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x");
int32_t ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y");
int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x");
int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y");
int32_t pad_left = vsi_nn_kernel_param_get_int32(params, "pad_left");
int32_t pad_right = vsi_nn_kernel_param_get_int32(params, "pad_right");
int32_t pad_top = vsi_nn_kernel_param_get_int32(params, "pad_top");
int32_t pad_bottom = vsi_nn_kernel_param_get_int32(params, "pad_bottom");
status = _query_kernel( kernel, inputs, outputs );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
int32_t index = 3;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _MAXPOOLWITHARGMAX_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_left );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_right );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_top );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_bottom );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _MAXPOOLWITHARGMAX_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( maxpoolwithargmax, _setup )

View File

@ -0,0 +1,247 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (2)
#define _OUTPUT_NUM (1)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.mod")
/*
* Kernel params
*/
static vx_param_description_t _mod_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _MOD_PARAM_NUM _cnt_of_array( _mod_kernel_param_def )
static vsi_ssize_t _expand_offset
(
vsi_ssize_t index,
vsi_size_t * shape, vsi_size_t rank,
vsi_size_t * strides, vsi_size_t * out_shape
)
{
vsi_size_t i;
vsi_ssize_t offset = 0;
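/* Broadcasting: a dimension only contributes to the offset when its size
matches the output shape; size-1 (broadcast) dimensions add nothing, so the
same input element is reused across that axis. */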
for( i = 0; i < rank && index; i ++ )
{
if( shape[i] == out_shape[i] )
{
offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] );
}
index /= out_shape[i];
}
return offset;
}
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
int32_t isfmod = 0;
vsi_nn_kernel_dtype_e input0_dtype = F16;
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float* f32_in_buffer[_INPUT_NUM] = {NULL};
float* f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t* in_attr[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t* out_attr[_OUTPUT_NUM] = {NULL};
vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
vsi_size_t out_elements[_OUTPUT_NUM] = {0};
vsi_size_t out_bytes[_OUTPUT_NUM] = {0};
uint32_t i;
/* prepare data */
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &isfmod);
for (i = 0; i < _INPUT_NUM; i++) {
input[i] = (vsi_nn_kernel_tensor_t)param[i];
in_attr[i] = vsi_nn_kernel_tensor_attr_create(input[i]);
vsi_nn_kernel_tensor_attr_get_stride(in_attr[i], in_stride_size[i]);
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer(input[i], in_attr[i], TRUE);
CHECK_PTR_FAIL_GOTO(f32_in_buffer[i], "Create input0 buffer fail.", final);
}
input0_dtype = in_attr[0]->dtype;
if (input0_dtype == F16 || input0_dtype == F32 || input0_dtype == BF16) {
isfmod = 1;
}
for (i = 0; i < _OUTPUT_NUM; i++)
{
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
out_attr[i] = vsi_nn_kernel_tensor_attr_create(output[i]);
vsi_nn_kernel_tensor_attr_get_stride(out_attr[i], out_stride_size[i]);
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size(out_attr[i]);
out_bytes[i] = out_elements[i] * sizeof(float);
f32_out_buffer[i] = (float*)malloc(out_bytes[i]);
CHECK_PTR_FAIL_GOTO(f32_out_buffer[i], "Create output buffer fail.", final);
memset(f32_out_buffer[i], 0, out_bytes[i]);
}
for (i = 0; i < out_elements[0]; i++)
{
vsi_ssize_t in0_offset = 0;
vsi_ssize_t in1_offset = 0;
float in0 = 0;
float in1 = 0;
in0_offset = _expand_offset( i, in_attr[0]->shape->data, (vsi_size_t)in_attr[0]->shape->size,
in_stride_size[0], out_attr[0]->shape->data );
in1_offset = _expand_offset( i, in_attr[1]->shape->data, (vsi_size_t)in_attr[1]->shape->size,
in_stride_size[1], out_attr[0]->shape->data );
in0 = f32_in_buffer[0][in0_offset];
in1 = f32_in_buffer[1][in1_offset];
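/* fmod keeps the sign of the dividend (C-style truncated remainder); the else
branch computes a floored modulo, whose result follows the sign of the divisor,
matching the usual integer Mod semantics. */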
if (isfmod)
{
f32_out_buffer[0][i] = (float)fmod(in0,in1);
}
else
{
f32_out_buffer[0][i] = in0 - in1 * (float)floor(in0 / in1);
}
}
/* save data */
for (i = 0; i < _OUTPUT_NUM; i++) {
status = vsi_nn_kernel_tensor_write_from_float(
output[i], out_attr[i], f32_out_buffer[i], out_elements[i]);
CHECK_STATUS_FAIL_GOTO(status, final);
}
final:
for (i = 0; i < _INPUT_NUM; i++) {
if (f32_in_buffer[i]) {
free(f32_in_buffer[i]);
f32_in_buffer[i] = NULL;
}
if (in_attr[i]) {
vsi_nn_kernel_tensor_attr_release(&in_attr[i]);
}
}
for (i = 0; i < _OUTPUT_NUM; i++) {
if (f32_out_buffer[i]) {
free(f32_out_buffer[i]);
f32_out_buffer[i] = NULL;
}
if (out_attr[i]) {
vsi_nn_kernel_tensor_attr_release(&out_attr[i]);
}
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _mod_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _mod_kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_MOD_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
int32_t isfmod = vsi_nn_kernel_param_get_int32(params, "isfmod");
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _MOD_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &isfmod );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _MOD_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[3] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( mod, _setup )

View File

@ -55,8 +55,8 @@ __BEGIN_DECLS
static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
@ -90,12 +90,16 @@ DEF_KERNEL_EXECUTOR(_compute)
uint32_t i = 0;
int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0;
float mean[3] = {0}, scale = 1;
vsi_bool is_rgb888 = tensors[1] == NULL;
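/* Intent: when the optional G and B plane inputs are absent, the input is a
packed rgb888_planar tensor, i.e. all three channels live in the first input
and are addressed further below with a per-channel offset of
src_width * src_height. */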
for (i = 0; i < _CPU_IO_NUM; i++)
{
tensors[i] = (vsi_nn_kernel_tensor_t)param[i];
attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] );
CHECK_PTR_FAIL_GOTO( attr[i], "Create tensor attr buffer fail.", final );
if (tensors[i])
{
attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] );
CHECK_PTR_FAIL_GOTO( attr[i], "Create tensor attr buffer fail.", final );
}
}
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] );
@ -113,8 +117,11 @@ DEF_KERNEL_EXECUTOR(_compute)
for (i = 0; i < 3; i++)
{
buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[i], "Create input0 buffer fail.", final );
if (tensors[i])
{
buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[i], "Create input0 buffer fail.", final );
}
buffer[i + 3] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[i + 3], "Create output buffer fail.", final );
@ -125,12 +132,17 @@ DEF_KERNEL_EXECUTOR(_compute)
int32_t line1[2], line2[2];
int32_t dx = 0, dy = 0, idx = 0;
int32_t src_width = (int32_t)attr[0]->shape->data[0];
int32_t src_height = (int32_t)attr[0]->shape->data[1];
int32_t dst_width = (int32_t)attr[3]->shape->data[0];
int32_t dst_height = (int32_t)attr[3]->shape->data[1];
uint8_t result = 0;
int32_t offset = 0;
int32_t index = 0;
for ( idx = 0; idx < 3; idx ++)
{
offset = is_rgb888 ? idx * src_width * src_height : 0;
index = is_rgb888 ? 0 : idx;
for ( dy = 0; dy < (int32_t)dst_height; dy ++)
{
for ( dx = 0; dx < (int32_t)dst_width; dx ++)
@ -170,10 +182,10 @@ DEF_KERNEL_EXECUTOR(_compute)
sy += yOffset;
source_index = (sx + sy * src_width);
line1[0] = (int32_t)buffer[idx][source_index];
line1[1] = (int32_t)buffer[idx][source_index + 1];
line2[0] = (int32_t)buffer[idx][source_index + src_width];
line2[1] = (int32_t)buffer[idx][source_index + src_width + 1];
line1[0] = (int32_t)buffer[index][source_index + offset];
line1[1] = (int32_t)buffer[index][source_index + 1 + offset];
line2[0] = (int32_t)buffer[index][source_index + src_width + offset];
line2[1] = (int32_t)buffer[index][source_index + src_width + 1 + offset];
temp1 = fx * (line1[1] - line1[0]) + (line1[0] << 10);
temp2 = fx * (line2[1] - line2[0]) + (line2[0] << 10);
@ -184,10 +196,10 @@ DEF_KERNEL_EXECUTOR(_compute)
}
else
{
int32_t offset = xOffset + yOffset * src_width;
source_index = dx + dy * src_width + offset;
finalVal = (buffer[0][source_index] - mean[idx]) * scale;
buffer[1][output_index] = finalVal;
int32_t ofset = xOffset + yOffset * src_width;
source_index = dx + dy * src_width + ofset + offset;
finalVal = (buffer[index][source_index] - mean[idx]) * scale;
buffer[idx + 3][output_index] = finalVal;
}
}
}

View File

@ -209,16 +209,15 @@ DEF_KERNEL_EXECUTOR(_compute)
for (n = 0; n < num_rois; n++)
{
uint32_t batchId = (uint32_t)f32_in_buffer[2][n];
float scale = (in_attr[1]->dtype == U16) ? 0.125f : 1.0f;
float qx1 = f32_in_buffer[1][n * kRoiDim];
float qy1 = f32_in_buffer[1][n * kRoiDim + 1];
float qx2 = f32_in_buffer[1][n * kRoiDim + 2];
float qy2 = f32_in_buffer[1][n * kRoiDim + 3];
float x1 = qx1 * scale;
float x2 = qx2 * scale;
float y1 = qy1 * scale;
float y2 = qy2 * scale;
float x1 = qx1;
float x2 = qx2;
float y1 = qy1;
float y2 = qy2;
float roi_anchor_x = x1 * width_scale;
float roi_anchor_y = y1 * height_scale;
float roi_dims_x = vsi_nn_max((x2 - x1) * width_scale, 1.0f);

View File

@ -0,0 +1,770 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "cumsum"
#define KERNEL_SOURCE_2 "cumsum_2d"
#define KERNEL_SOURCE_3 "cumsum_bf16"
#define KERNEL_SOURCE_4 "cumsum_f16_u8"
// Add kernel hashtable here
#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \
((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))
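/* The lookup key packs the cumsum axis, input dtype, output dtype and the 2D
flag into disjoint bit fields, so each (axis, dtypes, layout) combination maps
to exactly one shader entry in cumsum_map[]. */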
#define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \
CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \
SOURCE },
#define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \
CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
const char* source_name;
} cumsum_map[] =
{
HASH_CUMSUM_KERNELS(0, U8, U8, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(0, I8, I8, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(0, I16, I16, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(0, F16, F16, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(0, BF16, BF16, KERNEL_SOURCE_3)
HASH_CUMSUM_KERNELS(1, U8, U8, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(1, I8, I8, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(1, I16, I16, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(1, F16, F16, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(1, BF16, BF16, KERNEL_SOURCE_3)
HASH_CUMSUM_KERNELS(2, U8, U8, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(2, I8, I8, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(2, I16, I16, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(2, F16, F16, KERNEL_SOURCE_1)
HASH_CUMSUM_KERNELS(2, BF16, BF16, KERNEL_SOURCE_3)
HASH_CUMSUM_KERNELS_2D(0, U8, U8, KERNEL_SOURCE_2)
HASH_CUMSUM_KERNELS_2D(0, I8, I8, KERNEL_SOURCE_2)
HASH_CUMSUM_KERNELS_2D(0, I16, I16, KERNEL_SOURCE_2)
HASH_CUMSUM_KERNELS_2D(0, F16, F16, KERNEL_SOURCE_2)
HASH_CUMSUM_KERNELS_2D(0, BF16, BF16, KERNEL_SOURCE_3)
HASH_CUMSUM_KERNELS_2D(1, U8, U8, KERNEL_SOURCE_2)
HASH_CUMSUM_KERNELS_2D(1, I8, I8, KERNEL_SOURCE_2)
HASH_CUMSUM_KERNELS_2D(1, I16, I16, KERNEL_SOURCE_2)
HASH_CUMSUM_KERNELS_2D(1, F16, F16, KERNEL_SOURCE_2)
HASH_CUMSUM_KERNELS_2D(1, BF16, BF16, KERNEL_SOURCE_3)
HASH_CUMSUM_KERNELS(0, F16, U8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS(0, F16, I8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS(0, F16, I16, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS(1, F16, U8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS(1, F16, I8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS(1, F16, I16, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS(2, F16, U8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS(2, F16, I8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS(2, F16, I16, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS_2D(0, F16, U8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS_2D(0, F16, I8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS_2D(0, F16, I16, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS_2D(1, F16, U8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS_2D(1, F16, I8, KERNEL_SOURCE_4)
HASH_CUMSUM_KERNELS_2D(1, F16, I16, KERNEL_SOURCE_4)
};
/*
* Kernel params
*/
static vx_param_description_t _cumsum_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _CUMSUM_PARAM_NUM _cnt_of_array( _cumsum_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_cumsum_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: control the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in threads
{0, 0, 0}}; // globalWorkSize: image size in threads
int32_t axis = 0;
int32_t width = 0;
int32_t height = 0;
int32_t channel = 0;
int32_t w = 1;
int32_t h = 1;
int32_t c = 1;
uint32_t dim = 1;
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
vsi_size_array_t * input_shape = NULL;
int32_t input_zp = 0;
float input_scale = 1.0f;
float output_zp = 0;
float output_scale = 1.0f;
float in_out_zp_scale = 1.0f;
float in_out_scale = 1.0f;
uint32_t pack_key = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis);
CHECK_STATUS_FAIL_GOTO(status, OnError );
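/* Dynamic fixed point: real value = stored value * 2^-fl, so a positive fl
scales the stored integer down and a negative fl scales it up. Asymmetric
quantization uses the usual (value - zero_point) * scale mapping instead. */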
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
input_scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
input_scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input_scale = attr[0]->asymm.scale;
input_zp = attr[0]->asymm.zero_point;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[1]->dfp.fl > 0)
{
output_scale = (float)((int64_t)1 << attr[1]->dfp.fl);
}
else
{
output_scale = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl));
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
output_scale = 1.0f / attr[1]->asymm.scale;
output_zp = (float)attr[1]->asymm.zero_point;
}
in_out_scale = input_scale * output_scale;
in_out_zp_scale = (float)in_out_scale * input_zp;
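/* in_out_scale folds the input dequantize and output quantize steps into one
multiplier, and in_out_zp_scale pre-scales the input zero point; the shader
presumably uses them to requantize the running sum without extra divisions. */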
input_shape = attr[0]->shape;
dim = (uint32_t)input_shape->size;
width = (int32_t)(input_shape->data[0]);
height = (int32_t)(input_shape->data[1]);
channel = (int32_t)(dim > 2 ? input_shape->data[2] : 1);
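/* The work size along the accumulation axis is collapsed to 1: each thread
walks the whole axis itself, so only the remaining two dimensions are spread
across the global work size. */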
if (axis == 0)
{
w = 1;
h = height;
c = channel;
}
else if (axis == 1)
{
w = width;
h = 1;
c = channel;
}
else if (axis == 2)
{
w = width;
h = height;
c = 1;
}
shaderParam.global_scale[0] = 8;
if ((attr[0]->dtype == U8 || attr[0]->dtype == I8)
&& (axis > 0))
{
shaderParam.global_scale[0] = 16;
}
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = (w + shaderParam.global_scale[0] - 1) / shaderParam.global_scale[0];
shaderParam.global_size[1] = h;
shaderParam.global_size[2] = c;
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, AXIS, DIM) \
(IN0_TYPE | (OUT_TYPE << 8) | (AXIS << 16) | (DIM << 24))
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, axis, dim);
{
uint16_t M0 = 0;
int32_t postShift = 0;
uint32_t multAndoutZP0[2] = {0};
gpu_dp_inst_t uniU8MulAndPostShift_0_Lo_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x13121110, 0x17161514, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAccSumVertF16toF16_2x8 = {{
0x55555555, // TCfg
0x44444444, // ASelt
0x33221100, 0x77665544, // ABin
0xaaaaaaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001,
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAccSumVertU8toI32A_4x4 = {{
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00110000, 0x00330022, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAccSumVertU8toI32B_4x4 = {{
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00150004, 0x00370026, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAccSumVertU8toI32C_4x4 = {{
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00190008, 0x003b002a, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAccSumVertU8toI32D_4x4 = {{
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x001d000c, 0x003f002e, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniSumHorzF16toF16A_4x4 = {{
0x55150501, // TCfg
0x00000000, // ASelt
0x00100000, 0x32100210, // ABin
0xaa2a0a02, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x3c003c00, 0x00000000,
0x3c003c00, 0x00003c00, 0x3c003c00, 0x3c003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniSumHorzF16toF16B_4x4 = {{
0x55150501, // TCfg
0x00000000, // ASelt
0x00540004, 0x76540654, // ABin
0xaa2a0a02, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x3c003c00, 0x00000000,
0x3c003c00, 0x00003c00, 0x3c003c00, 0x3c003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniSumHorzF16toF16C_2x8 = {{
0x55551111, // TCfg
0x00000000, // ASelt
0x03020100, 0x37363534, // ABin
0xaaaa2222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAccSumHorzF16toF16_2x8 = {{
0x55555555, // TCfg
0x44444444, // ASelt
0x73727170, 0x77767574, // ABin
0xaaaaaaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00,
0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniSumHorzU8toI16A_4x4 = {{
0x55150501, // TCfg
0x00000000, // ASelt
0x00100000, 0x32100210, // ABin
0xaa2a0a02, // BSelt
0x00000000, 0x00000000, // BBin
0x00000700, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000001, 0x00010001, 0x00010001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniSumHorzU8toI16B_8x4 = {{
0x05550155, 0x55551555, // TCfg
0x00418820, 0x41882000, 0x8820000a, 0x20018a41, 0x398a4188, // BinSelect
0x00000700, // AccumType, ConstantType, and PostShift
0x01010101, 0x00000001, 0x01010101, 0x00000101,
0x01010101, 0x00010101, 0x01010101, 0x01010101 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniSubZpI16toI16_2x8 = {{
0x99999999, // TCfg
0x44444444, // ASelt
0x03020100, 0x07060504, // ABin
0xaaaaaaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00020001, 0x00030001, 0x00040001,
0x00050001, 0x00060001, 0x00070001, 0x00080001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAccSumHorzI16toI32A_4x4 = {{
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00310030, 0x00330032, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniAccSumHorzI16toI32B_4x4 = {{
0x0d0d0d0d, // TCfg
0x04040404, // ASelt
0x00350034, 0x00370036, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniSetZeroF16_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_quantize_multiplier_16bit( (double)input_scale * output_scale, &M0, &postShift);
multAndoutZP0[0] = (uint32_t)(M0);
multAndoutZP0[1] = (uint32_t)((attr[1]->asymm.zero_point << postShift) - input_zp * M0);
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift );
status = vsi_nn_kernel_gpu_add_param(node, "width", &width);
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
CHECK_STATUS_FAIL_GOTO(status, OnError );
switch( pack_key )
{
case _PACK_SELECT_KEY( U8, U8, 2, 3):
case _PACK_SELECT_KEY( I8, I8, 2, 3):
case _PACK_SELECT_KEY( I16, I16, 2, 3):
case _PACK_SELECT_KEY( F16, F16, 2, 3):
{
status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel);
status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "in_out_scale", &in_out_scale);
status |= vsi_nn_kernel_gpu_add_param(node, "in_out_zp_scale", &in_out_zp_scale);
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertF16toF16_2x8", &uniAccSumVertF16toF16_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertU8toI32A_4x4", &uniAccSumVertU8toI32A_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertU8toI32B_4x4", &uniAccSumVertU8toI32B_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertU8toI32C_4x4", &uniAccSumVertU8toI32C_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertU8toI32D_4x4", &uniAccSumVertU8toI32D_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzF16toF16A_4x4", &uniSumHorzF16toF16A_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzF16toF16B_4x4", &uniSumHorzF16toF16B_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzF16toF16C_2x8", &uniSumHorzF16toF16C_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumHorzF16toF16_2x8", &uniAccSumHorzF16toF16_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzU8toI16A_4x4", &uniSumHorzU8toI16A_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzU8toI16B_8x4", &uniSumHorzU8toI16B_8x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSubZpI16toI16_2x8", &uniSubZpI16toI16_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumHorzI16toI32A_4x4", &uniAccSumHorzI16toI32A_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumHorzI16toI32B_4x4", &uniAccSumHorzI16toI32B_4x4 );
status |= vsi_nn_kernel_gpu_add_param(
node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U8, U8, 0, 2):
case _PACK_SELECT_KEY( U8, U8, 1, 2):
case _PACK_SELECT_KEY( U8, U8, 0, 3):
case _PACK_SELECT_KEY( U8, U8, 1, 3):
case _PACK_SELECT_KEY( I8, I8, 0, 2):
case _PACK_SELECT_KEY( I8, I8, 1, 2):
case _PACK_SELECT_KEY( I8, I8, 0, 3):
case _PACK_SELECT_KEY( I8, I8, 1, 3):
case _PACK_SELECT_KEY( I16, I16, 0, 2):
case _PACK_SELECT_KEY( I16, I16, 1, 2):
case _PACK_SELECT_KEY( I16, I16, 0, 3):
case _PACK_SELECT_KEY( I16, I16, 1, 3):
case _PACK_SELECT_KEY( F16, F16, 0, 2):
case _PACK_SELECT_KEY( F16, F16, 1, 2):
case _PACK_SELECT_KEY( F16, F16, 0, 3):
case _PACK_SELECT_KEY( F16, F16, 1, 3):
{
status = vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "in_out_scale", &in_out_scale);
status |= vsi_nn_kernel_gpu_add_param(node, "in_out_zp_scale", &in_out_zp_scale);
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertF16toF16_2x8", &uniAccSumVertF16toF16_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertU8toI32A_4x4", &uniAccSumVertU8toI32A_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertU8toI32B_4x4", &uniAccSumVertU8toI32B_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertU8toI32C_4x4", &uniAccSumVertU8toI32C_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumVertU8toI32D_4x4", &uniAccSumVertU8toI32D_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzF16toF16A_4x4", &uniSumHorzF16toF16A_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzF16toF16B_4x4", &uniSumHorzF16toF16B_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzF16toF16C_2x8", &uniSumHorzF16toF16C_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumHorzF16toF16_2x8", &uniAccSumHorzF16toF16_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzU8toI16A_4x4", &uniSumHorzU8toI16A_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumHorzU8toI16B_8x4", &uniSumHorzU8toI16B_8x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSubZpI16toI16_2x8", &uniSubZpI16toI16_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumHorzI16toI32A_4x4", &uniAccSumHorzI16toI32A_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniAccSumHorzI16toI32B_4x4", &uniAccSumHorzI16toI32B_4x4 );
status |= vsi_nn_kernel_gpu_add_param(
node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, 0, 2):
case _PACK_SELECT_KEY( BF16, BF16, 1, 2):
case _PACK_SELECT_KEY( BF16, BF16, 0, 3):
case _PACK_SELECT_KEY( BF16, BF16, 1, 3):
case _PACK_SELECT_KEY( BF16, BF16, 2, 3):
{
status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniExtractOddData_2x8", &uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( F16, U8, 0, 2):
case _PACK_SELECT_KEY( F16, U8, 1, 2):
case _PACK_SELECT_KEY( F16, U8, 0, 3):
case _PACK_SELECT_KEY( F16, U8, 1, 3):
case _PACK_SELECT_KEY( F16, U8, 2, 3):
case _PACK_SELECT_KEY( F16, I8, 0, 2):
case _PACK_SELECT_KEY( F16, I8, 1, 2):
case _PACK_SELECT_KEY( F16, I8, 0, 3):
case _PACK_SELECT_KEY( F16, I8, 1, 3):
case _PACK_SELECT_KEY( F16, I8, 2, 3):
case _PACK_SELECT_KEY( F16, I16, 0, 2):
case _PACK_SELECT_KEY( F16, I16, 1, 2):
case _PACK_SELECT_KEY( F16, I16, 0, 3):
case _PACK_SELECT_KEY( F16, I16, 1, 3):
case _PACK_SELECT_KEY( F16, I16, 2, 3):
{
status = vsi_nn_kernel_gpu_add_param(node, "channel", &channel);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniAccSumVertF16toF16_2x8", &uniAccSumVertF16toF16_2x8);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniSumHorzF16toF16A_4x4", &uniSumHorzF16toF16A_4x4);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniSumHorzF16toF16B_4x4", &uniSumHorzF16toF16B_4x4);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniSumHorzF16toF16C_2x8", &uniSumHorzF16toF16C_2x8);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniAccSumHorzF16toF16_2x8", &uniAccSumHorzF16toF16_2x8);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8);
status |= vsi_nn_kernel_gpu_add_param(
node, "multAndoutZP0", &multAndoutZP0);
status |= vsi_nn_kernel_gpu_add_param(
node, "uniSetZeroF16_2x8", &uniSetZeroF16_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
default:
break;
}
}
#undef _PACK_SELECT_KEY
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
}
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel,
const vsi_nn_kernel_param_t * params,
int32_t axis,
int32_t is_2d
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
int i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d);
for( i = 0; i < _cnt_of_array(cumsum_map); i ++ )
{
if ( cumsum_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(cumsum_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", cumsum_map[i].function_name );
kernel->info.parameters = _cumsum_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _cumsum_kernel_param_def );
kernel->info.initialize = _cumsum_initializer;
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
cumsum_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
cumsum_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_CUMSUM_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
vsi_size_t shapes[1][VSI_NN_MAX_DIM_NUM] = {{0}};
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
int32_t exclusive = vsi_nn_kernel_param_get_int32( params, "exclusive" );
int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" );
int32_t axis_new = 0;
int32_t is_2d = 0;
uint32_t rs_dim = 2;
int32_t i = 0;
vsi_nn_kernel_optimize_softmax_shape(
inputs[0]->attr.size, inputs[0]->attr.dim_num, axis,
shapes[0], &rs_dim, &axis_new);
if (exclusive || reverse || rs_dim > 3)
{
return NULL;
}
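/* exclusive/reverse and collapsed ranks above 3 are not covered by the shader
table, so returning NULL here presumably lets the framework fall back to
another backend (for example the CPU cumsum kernel). */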
if (rs_dim == 2)
{
is_2d = 1;
}
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], (vsi_size_t)rs_dim );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[0], (vsi_size_t)rs_dim );
status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 2;
/* Pass parameters to node. */
vsi_nn_kernel_node_pack_io( tmp_params, _CUMSUM_PARAM_NUM,
reshape_tensors, 1, &reshape_tensors[1], 1 );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_new );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &exclusive );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _CUMSUM_PARAM_NUM );
vsi_nn_kernel_scalar_release( &tmp_params[2] );
vsi_nn_kernel_scalar_release( &tmp_params[3] );
vsi_nn_kernel_scalar_release( &tmp_params[4] );
}
}
for (i = 0; i < 2; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( cumsum, _setup )

View File

@ -53,6 +53,9 @@ typedef enum
UNARY_HGELU,
UNARY_SELU,
UNARY_CELU,
UNARY_RCP,
UNARY_SIGN,
UNARY_SOFTSIGN,
} unary_type_e;
/*
@ -94,6 +97,34 @@ typedef enum
#define HGELU_OPERATION hard_gelu
#define SELU_OPERATION selu
#define CELU_OPERATION celu
#define RCP_OPERATION rcp
#define SIGN_OPERATION sign
#define SOFTSIGN_OPERATION softsign
#define ADD_UNARY_SH_KERNELS(name, source) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, BF16, BF16, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, BF16, BF16, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16, F16, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16, F16, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16, I16, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16, I16, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16, U8, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16, U8, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F16, I8, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, F16, I8, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I16, I16, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I16, I16, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I16, F16, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I16, F16, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I8, I8, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I8, I8, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, I8, F16, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, I8, F16, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, U8, U8, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, U8, U8, source##_2D) \
TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, U8, F16, source##_3D) \
TENSOR_UNARY_KERNELS_2D(name##_OPERATION, UNARY_##name, U8, F16, source##_2D) \
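/* One ADD_UNARY_SH_KERNELS(SIN, KERNEL_SOURCE1) invocation expands to the 22
 * map entries previously written out by hand: 3D and 2D variants for
 * BF16->BF16, F16->{F16,I16,U8,I8}, I16->{I16,F16}, I8->{I8,F16} and
 * U8->{U8,F16}, each referencing KERNEL_SOURCE1_3D or KERNEL_SOURCE1_2D.
 * The entry layout itself comes from the TENSOR_UNARY_KERNELS_3D/_2D macros
 * defined earlier in this file. */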
static const struct {
uint32_t key;
@ -101,269 +132,22 @@ static const struct {
const char* source_name;
} _eltwise_unary_evis_kernel_map[] =
{
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, F16, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, U8, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, U8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I8, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, I8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SIN_OPERATION, UNARY_SIN, BF16, BF16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, F16, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, U8, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, U8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I8, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, I8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(COS_OPERATION, UNARY_COS, BF16, BF16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, F16, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, U8, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, U8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I8, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, I8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(EXP_OPERATION, UNARY_EXP, BF16, BF16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, F16, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, U8, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, U8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I8, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, I8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(LOG_OPERATION, UNARY_LOG, BF16, BF16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, F16, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, U8, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, U8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I8, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, I8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(SELU_OPERATION, UNARY_SELU, BF16, BF16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, F16, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, U8, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, U8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I8, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, BF16, BF16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, F16, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I16, I16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I16, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, U8, U8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, U8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I8, I8, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, I8, F16, KERNEL_SOURCE1_3D)
TENSOR_UNARY_KERNELS_3D(CELU_OPERATION, UNARY_CELU, BF16, BF16, KERNEL_SOURCE1_3D)
ADD_UNARY_SH_KERNELS(SIN, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(COS, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(EXP, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(LOG, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(SELU, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(CELU, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(NEG, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(RCP, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(SIGN, KERNEL_SOURCE1)
ADD_UNARY_SH_KERNELS(SOFTSIGN, KERNEL_SOURCE1)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, F16, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, BF16, BF16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, BF16, BF16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I8, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, I8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, BF16, BF16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, F16, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I8, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, I8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, BF16, BF16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, F16, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, U8, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, U8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I8, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, I8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(SELU_OPERATION, UNARY_SELU, BF16, BF16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, F16, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I8, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, BF16, BF16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, F16, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I16, I16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I16, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, U8, U8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, U8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I8, I8, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, I8, F16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_2D(CELU_OPERATION, UNARY_CELU, BF16, BF16, KERNEL_SOURCE1_2D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, U8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, I16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, I8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(HSIGMOID_OPERATION, UNARY_HSIGMOID, BF16, BF16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, I8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, BF16, BF16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, I16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, U8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, F16, I8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I16, I16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I16, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, U8, U8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, U8, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I8, I8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, I8, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(MISH_OPERATION, UNARY_MISH, BF16, BF16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, F16, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, BF16, BF16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, I16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, U8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, F16, I8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I16, I16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I16, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, U8, U8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, U8, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I8, I8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, I8, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, I16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, U8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, F16, I8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I16, I16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I16, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, U8, U8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, U8, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I8, I8, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, I8, F16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_3D(GELU_OPERATION, UNARY_GELU, BF16, BF16, KERNEL_SOURCE0_3D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, BF16, BF16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, I16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, I8, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, F16, KERNEL_SOURCE0_2D)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, BF16, BF16, KERNEL_SOURCE0_2D)
ADD_UNARY_SH_KERNELS(HSIGMOID, KERNEL_SOURCE0)
ADD_UNARY_SH_KERNELS(MISH, KERNEL_SOURCE0)
ADD_UNARY_SH_KERNELS(ROUND, KERNEL_SOURCE0)
ADD_UNARY_SH_KERNELS(GELU, KERNEL_SOURCE0)
ADD_UNARY_SH_KERNELS(HGELU, KERNEL_SOURCE0)
};
#undef SIN_OPERATION
@ -378,6 +162,9 @@ static const struct {
#undef GELU_OPERATION
#undef HGELU_OPERATION
#undef CELU_OPERATION
#undef RCP_OPERATION
#undef SIGN_OPERATION
#undef SOFTSIGN_OPERATION
/*
* Kernel params
*/
@ -509,6 +296,9 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
case _PACK_SELECT_KEY( UNARY_GELU, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_HGELU, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_CELU, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_RCP, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_SIGN, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_SOFTSIGN, BF16, BF16 ):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
@ -815,5 +605,8 @@ REGISTER_ELTWISE_UNARY_BACKEND_EVIS( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_gelu, UNARY_HGELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( selu, UNARY_SELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( celu, UNARY_CELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( rcp, UNARY_RCP )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( sign, UNARY_SIGN )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( softsign, UNARY_SOFTSIGN )
__END_DECLS

View File

@ -222,7 +222,7 @@ static vsi_status get_gather_tensor_reshape_size
uint32_t i = 0;
vsi_size_t elementCnt = 1;
vsi_size_t outerCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
for(i = 0; i < dims_num - batch_dims; ++i)
{
@ -751,7 +751,7 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_t * kernel
)
{
#define VSI_NN_MAX_BLOCK_SIZE (65536)
#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_GATHER_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
@ -795,12 +795,6 @@ static vsi_nn_kernel_node_t _setup
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[2], rs_dim );
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params, axis0_flg, is_array, is_batch);
if ( VSI_SUCCESS == status)
{

View File

@ -136,7 +136,7 @@ static vsi_status get_gather_nd_tensor_reshape_size
vsi_size_t *input_size = inputs[0]->attr.size;
uint32_t i = 0;
vsi_size_t elementCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
for(i = 0; i < dims_num; ++i)

View File

@ -44,7 +44,7 @@ __BEGIN_DECLS
typedef enum _grucell_nn_activation_type_e
{
SIGMOID = VSI_NN_ACT_SIGMOID,
HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID,
HSIGMOID = VSI_NN_ACT_HARD_SIGMOID,
}grucell_nn_activation_type_e;
#define _GRUCELL_ACTIVATION_Z_H_KERNEL_SOURCE "grucell_activation_z_h"
@ -72,6 +72,10 @@ static const _kernel_map_type _grucell_activation_z_h_kernel_map[] =
PACK_KERNEL_MAP( I8, F16, I8, SIGMOID ),
PACK_KERNEL_MAP( I16, F16, I16, SIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( U8, F16, U8, HSIGMOID ),
PACK_KERNEL_MAP( I8, F16, I8, HSIGMOID ),
PACK_KERNEL_MAP( I16, F16, I16, HSIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ),
};
/*

View File

@ -22,7 +22,6 @@
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -45,7 +44,7 @@ __BEGIN_DECLS
typedef enum _grucell_nn_activation_type_e
{
SIGMOID = VSI_NN_ACT_SIGMOID,
HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID,
HSIGMOID = VSI_NN_ACT_HARD_SIGMOID,
}grucell_nn_activation_type_e;
#define _GRUCELL_H_TIMES_ACTIVATION_R_KERNEL_SOURCE "grucell_h_times_activation_r"
@ -72,9 +71,12 @@ static const _kernel_map_type _grucell_h_times_activation_r_kernel_map[] =
PACK_KERNEL_MAP( I8, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( I16, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ),
PACK_KERNEL_MAP( U8, F16, F16, HSIGMOID ),
PACK_KERNEL_MAP( I8, F16, F16, HSIGMOID ),
PACK_KERNEL_MAP( I16, F16, F16, HSIGMOID ),
PACK_KERNEL_MAP( F16, F16, F16, HSIGMOID ),
};
/*
* Kernel params
*/
@ -256,8 +258,6 @@ final:
return status;
} /* _grucell_h_times_activation_r_initializer() */
/*
* Query kernel
*/
@ -313,7 +313,6 @@ static vsi_status _query_kernel
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,

View File

@ -38,16 +38,24 @@
__BEGIN_DECLS
#define HASH_L2NORMALIZESCALE_HASH_KEY(AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, _image_2d) \
((AXIS << 28) | (IN1_DTYPE << 20) | (IN0_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d))
#define HASH_L2NORMALIZESCALE_KERNEL_SOURCE_NAME(AXIS) \
"l2normalizescale_axis"#AXIS
#define KERNEL_SOURCE_1 "l2normalizescale_axis0"
#define KERNEL_SOURCE_2 "l2normalizescale_axis0_2d"
#define KERNEL_SOURCE_3 "l2normalizescale_axis1"
#define HASH_L2NORMALIZESCALE_KERNELS_2D( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
#define HASH_L2NORMALIZESCALE_KERNELS_2D( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_L2NORMALIZESCALE_HASH_KEY(AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1), \
CVIVANTE_NAMESPACE("evis.l2normalizescale_axis"#AXIS"_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_2D"), \
HASH_L2NORMALIZESCALE_KERNEL_SOURCE_NAME(AXIS) },
SOURCE },
#define HASH_L2NORMALIZESCALE_KERNELS( AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE) \
{ HASH_L2NORMALIZESCALE_HASH_KEY(AXIS, IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0), \
CVIVANTE_NAMESPACE("evis.l2normalizescale_axis"#AXIS"_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE), \
SOURCE },
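/* The hash key packs axis, both input dtypes, the output dtype and the 2D flag
 * into a single 32-bit value; for example, axis 0 with U8 data, F16 scale and
 * U8 output in 2D mode maps to
 *     (0 << 28) | (F16 << 20) | (U8 << 12) | (U8 << 4) | 1,
 * which _query_kernel is expected to rebuild from the runtime tensors when it
 * scans _l2normalizescale_kernel_map. */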
typedef struct
{
@ -58,20 +66,27 @@ typedef struct
static const _kernel_map_type _l2normalizescale_kernel_map[] =
{
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, F16, F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, I8 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, U8 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, I16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, F16, F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, I8 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, U8 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, I16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, F16 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, F16, F16, F16, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, I8, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I8 , F16, F16, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, U8, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, U8 , F16, F16, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, I16, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 0, I16, F16, F16, KERNEL_SOURCE_2 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, F16, F16, F16, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, I8, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I8 , F16, F16, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, U8, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, U8 , F16, F16, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, I16, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, F16, KERNEL_SOURCE_3 )
HASH_L2NORMALIZESCALE_KERNELS( 0, F16, F16, F16, KERNEL_SOURCE_1 )
HASH_L2NORMALIZESCALE_KERNELS( 0, I8 , F16, I8, KERNEL_SOURCE_1 )
HASH_L2NORMALIZESCALE_KERNELS( 0, I8 , F16, F16, KERNEL_SOURCE_1 )
HASH_L2NORMALIZESCALE_KERNELS( 0, U8 , F16, U8, KERNEL_SOURCE_1 )
HASH_L2NORMALIZESCALE_KERNELS( 0, U8 , F16, F16, KERNEL_SOURCE_1 )
HASH_L2NORMALIZESCALE_KERNELS( 0, I16, F16, I16, KERNEL_SOURCE_1 )
HASH_L2NORMALIZESCALE_KERNELS( 0, I16, F16, F16, KERNEL_SOURCE_1 )
};
/*
@ -119,6 +134,10 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
int32_t outputZP = 0;
float outputScale = 1.0f;
float r_inputScale = 1.0f;
float e2InScale = 1.0f;
float inOutScale = 1.0f;
int32_t axis2Dflg = 0;
int32_t inputWidth = 0;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
@ -168,7 +187,10 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
outputScale = 1.0f / output_attr->asymm.scale;
}
e2InScale = inputScale * inputScale;
r_inputScale = 1.0f / inputScale;
inOutScale = inputScale * outputScale;
inputWidth = (int32_t)(output_shape->data[0]);
if (1 == axis)
{
@ -190,6 +212,13 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
gpu_param.local_size[1] = 1;
gpu_param.global_size[0] = 16;
gpu_param.global_size[1] = output_shape->data[1];
if (output_shape->data[0] < GPU_TENSOR_MAX_WIDTH
&& output_shape->data[1] < GPU_TENSOR_MAX_WIDTH
&& (output_shape->size == 2 || (output_shape->size == 3 && output_shape->data[2] == 1)))
{
axis2Dflg = 1;
}
}
else
{
@ -257,8 +286,105 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{
0x55555555, // TCfg
0x00000000, // ASelt
0x76543210, 0x76543210, // ABin
0x5555aaaa, // BSelt
0x00000000, 0x76543210, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertSecFp16Fp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{
0x05050505, // TCfg
0x04040404, // ASelt
0x00050004, 0x00070006, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{
0x55555555, // TCfg
0x00000000, // ASelt
0x76543210, 0x76543210, // ABin
0x5555aaaa, // BSelt
0x00000000, 0x76543210, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
if (1 == axis)
if (axis2Dflg)
{
float zP2x = 2 * (float)inputZP;
float zpSqr8x = 8 * (float)inputZP * (float)inputZP;
float output_ZP = (float)outputZP;
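/* Sketch of the intended algebra: for quantized inputs the squared L2 norm
 * sum(((q - zp) * s)^2) expands to
 *     e2InScale * (sum(q^2) - zP2x * sum(q) + n8 * zpSqr8x)
 * with e2InScale = s^2, zP2x = 2*zp and zpSqr8x = 8*zp^2 per 8-element
 * dot-product group; inOutScale = inScale * outScale then requantizes the
 * normalized output. This describes how the shader presumably consumes the
 * constants set below. */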
status = vsi_nn_kernel_gpu_add_param( node, "inputWidth", &inputWidth);
status |= vsi_nn_kernel_gpu_add_param( node, "zP2x", &zP2x);
status |= vsi_nn_kernel_gpu_add_param( node, "zpSqr8x", &zpSqr8x);
status |= vsi_nn_kernel_gpu_add_param( node, "e2InScale", &e2InScale);
status |= vsi_nn_kernel_gpu_add_param( node, "inOutScale", &inOutScale);
status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &output_ZP);
status |= vsi_nn_kernel_gpu_add_param( node, "inputZP", &inputZP);
status |= vsi_nn_kernel_gpu_add_param( node, "uniInt16SumSqr_dp8x2", &uniInt16SumSqr_dp8x2);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", &uniConvertSecFp16Fp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4",
&uniConvert1stUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4",
&uniConvert2ndUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", &uniConvertHalfToFp16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (1 == axis)
{
int32_t L2NorS_depth = (int32_t)(output_shape->data[1]);
status = vsi_nn_kernel_gpu_add_param( node, "L2NorS_depth", &L2NorS_depth);
@ -277,8 +403,7 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
}
else if (0 == axis)
{
int32_t inputWidth, inputWidthCount, inputWidthRemain256;
inputWidth = (int32_t)(output_shape->data[0]);
int32_t inputWidthCount, inputWidthRemain256;
inputWidthRemain256 = (int32_t)(output_shape->data[0] % 256);
inputWidthCount = (int32_t)(output_shape->data[0] / 256);
vsi_nn_kernel_gpu_add_param( node, "inputWidth", &inputWidth);
@ -298,7 +423,8 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer)
}
}
{
if (axis2Dflg == 0)
{
float IntergerScale = inputScale;
float output_ZP = (float)outputZP;
gpu_dp_inst_t uniExtact8Bin_2x8 = {{
@ -473,7 +599,8 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1) &&
(inputs[0]->attr.size[0] < GPU_TENSOR_MAX_WIDTH && inputs[0]->attr.size[1] < GPU_TENSOR_MAX_WIDTH);
status = _query_kernel( kernel, inputs, outputs, axis, image_2d );
if ( VSI_SUCCESS == status)
{

File diff suppressed because it is too large

View File

@ -910,6 +910,7 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
status |= vsi_nn_kernel_gpu_add_param( node,
"uniGemmU8U8MulZptoFp32_8x4", &uniGemmU8U8MulZptoFp32_8x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "input01Scale", &inScaleMul );
status |= vsi_nn_kernel_gpu_add_param( node, "mulKIn0In1Zp", &mulKIn0In1Zp );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;

View File

@ -202,7 +202,7 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = (uint8_t)attr[2]->dfp.fl;
int32_t fl = attr[2]->dfp.fl;
if (fl > 0)
{
output_scale = (float) ((int64_t)1 << fl);

View File

@ -202,7 +202,7 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = (uint8_t)attr[2]->dfp.fl;
int32_t fl = attr[2]->dfp.fl;
if (fl > 0)
{
output_scale = (float) ((int64_t)1 << fl);

View File

@ -0,0 +1,444 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
#define MOD_HASH_KEY(_input0_type, _input1_type, _output_type, _image_2d) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d))
#define MOD_KERNEL_SOURCE_NAME "mod"
#define MOD_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
{ MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \
CVIVANTE_NAMESPACE("evis.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE), \
MOD_KERNEL_SOURCE_NAME },
#define MOD_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
{ MOD_HASH_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \
CVIVANTE_NAMESPACE("evis.mod_"#IN0_TYPE#IN1_TYPE"to"#OUT_TYPE"_2D"), \
MOD_KERNEL_SOURCE_NAME },
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _mod_kernel_map[] =
{
// Register kernel here
MOD_KERNELS( F16, F16, F16 )
MOD_KERNELS( F16, F16, I16 )
MOD_KERNELS( F16, F16, I8 )
MOD_KERNELS( F16, F16, U8 )
MOD_KERNELS( I16, I16, I16 )
MOD_KERNELS( I8, I8, I8 )
MOD_KERNELS( U8, U8, U8 )
MOD_KERNELS( I16, I16, F16 )
MOD_KERNELS( I8, I8, F16 )
MOD_KERNELS( U8, U8, F16 )
MOD_KERNELS( BF16, BF16, BF16 )
MOD_KERNELS_2D( F16, F16, F16 )
MOD_KERNELS_2D( F16, F16, I16 )
MOD_KERNELS_2D( F16, F16, I8 )
MOD_KERNELS_2D( F16, F16, U8 )
MOD_KERNELS_2D( I16, I16, I16 )
MOD_KERNELS_2D( I8, I8, I8 )
MOD_KERNELS_2D( U8, U8, U8 )
MOD_KERNELS_2D( I16, I16, F16 )
MOD_KERNELS_2D( I8, I8, F16 )
MOD_KERNELS_2D( U8, U8, F16 )
MOD_KERNELS_2D( BF16, BF16, BF16 )
};
/*
* Kernel params
*/
static vx_param_description_t _mod_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _MOD_PARAM_NUM _cnt_of_array( _mod_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_mod_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vx_status status = VX_FAILURE;
vx_tensor input0 = (vx_tensor)param[0];
vx_tensor input1 = (vx_tensor)param[1];
vx_tensor output = (vx_tensor)param[2];
vsi_nn_kernel_tensor_attr_t *input0_attr = NULL;
vsi_nn_kernel_tensor_attr_t *input1_attr = NULL;
vsi_nn_kernel_tensor_attr_t *output_attr = NULL;
vsi_size_array_t *output_shape = NULL;
vsi_nn_kernel_dtype_e input0_dtype = F16;
int32_t input0_fl = 0;
int32_t input1_fl = 0;
int32_t output_fl = 0;
float inScale0 = 1.0f;
float inScale1 = 1.0f;
float outScale = 1.0f;
float in0Tail = 0;
float in1Tail = 0;
float outZp = 0;
input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0 );
CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input1 );
CHECK_PTR_FAIL_GOTO( input1_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output );
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
output_shape = output_attr->shape;
input0_dtype = input0_attr->dtype;
gpu_param.dim = output_shape->size < 3 ? 2 : 3;
gpu_param.global_offset[0] = 0;
gpu_param.global_offset[1] = 0;
gpu_param.global_offset[2] = 0;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1];
gpu_param.global_size[2] = output_shape->size > 2 ?
(output_shape->data[2] + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2] : 1;
if (input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
input0_fl = input0_attr->dfp.fl;
if (input0_fl > 0)
{
inScale0 = 1.0f / (float) ((int64_t)1 << input0_fl);
}
else
{
inScale0 = (float)((int64_t)1 << -input0_fl);
}
}
else if (input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
inScale0 = input0_attr->asymm.scale;
in0Tail = -inScale0 * ((float)input0_attr->asymm.zero_point);
}
if (input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
input1_fl = input1_attr->dfp.fl;
if (input1_fl > 0)
{
inScale1 = 1.0f / (float) ((int64_t)1 << input1_fl);
}
else
{
inScale1 = (float)((int64_t)1 << -input1_fl);
}
}
else if (input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
inScale1 = input1_attr->asymm.scale;
in1Tail = -inScale1 * ((float)input1_attr->asymm.zero_point);
}
if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP)
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
{
outScale = (float) ((int64_t)1 << output_fl);
}
else
{
outScale = 1.0f / (float)((int64_t)1 << -output_fl);
}
}
else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outScale = 1.0f / output_attr->asymm.scale;
outZp = (float)(output_attr->asymm.zero_point);
}
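/* The shader is expected to dequantize each input as real = q * in_scale + tail
 * (tail is -scale * zero_point for asymmetric tensors and 0 for DFP) and to
 * requantize the result as q_out = real_out * out_scale + out_zp. For DFP
 * inputs the scale is 2^-fl when fl > 0 and 2^|fl| otherwise, matching the
 * branches above. */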
if (BF16 == input0_dtype)
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvertFstToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvertSecToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertFstToFp32_4x4", &uniConvertFstToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvertSecToFp32_4x4", &uniConvertSecToFp32_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node, "in_scale0", &inScale0 );
status |= vsi_nn_kernel_gpu_add_param( node, "in0Tail", &in0Tail );
status |= vsi_nn_kernel_gpu_add_param( node, "in_scale1", &inScale1 );
status |= vsi_nn_kernel_gpu_add_param( node, "in1Tail", &in1Tail );
status |= vsi_nn_kernel_gpu_add_param( node, "out_scale", &outScale );
status |= vsi_nn_kernel_gpu_add_param( node, "out_zp", &outZp );
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (input0_attr)
{
vsi_nn_kernel_tensor_attr_release(&input0_attr);
}
if (input1_attr)
{
vsi_nn_kernel_tensor_attr_release(&input1_attr);
}
if (output_attr)
{
vsi_nn_kernel_tensor_attr_release(&output_attr);
}
return status;
} /* _mod_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool image_2d
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in0_dtype;
vsi_nn_kernel_dtype_e in1_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _mod_kernel_map;
size_t kernel_map_size = _cnt_of_array( _mod_kernel_map );
vx_param_description_t * param_def = _mod_kernel_param_def;
size_t param_def_size = _cnt_of_array( _mod_kernel_param_def );
vx_kernel_initialize_f initializer = _mod_initializer;
uint32_t key = 0;
uint32_t i = 0;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = MOD_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d);
for (i = 0; i < kernel_map_size; i ++)
{
if (kernel_map[i].key == key)
{
break;
}
}
if (i < kernel_map_size)
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_MOD_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_bool image_2d = FALSE;
int32_t isfmod = vsi_nn_kernel_param_get_int32(params, "isfmod");
if (!vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ))
{
return NULL;
}
image_2d = (outputs[0]->attr.dim_num == 2 || outputs[0]->attr.size[2] == 1);
if (vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type) == F16 ||
vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type) == F16 ||
vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type) == BF16 ||
vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type) == BF16)
{
isfmod = 1;
}
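/* Floating-point inputs (F16/BF16) are forced into fmod mode, i.e. the
 * C-style truncated remainder; this mirrors the ONNX Mod convention that
 * fmod must be 1 for floating-point tensors. */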
status = _query_kernel( kernel, inputs, outputs, image_2d);
if (VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if (node)
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _MOD_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &isfmod );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _MOD_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[3] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( mod, _setup )

View File

@ -38,69 +38,20 @@
__BEGIN_DECLS
#define VX_KERNEL_NAME_POW_F16F16TOF16 CVIVANTE_NAMESPACE("evis.pow_F16F16toF16")
#define VX_KERNEL_NAME_POW_F16F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toF16_2D")
#define VX_KERNEL_NAME_POW_F16F16TOU8 CVIVANTE_NAMESPACE("evis.pow_F16F16toU8")
#define VX_KERNEL_NAME_POW_F16F16TOU8_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toU8_2D")
#define VX_KERNEL_NAME_POW_F16F16TOI8 CVIVANTE_NAMESPACE("evis.pow_F16F16toI8")
#define VX_KERNEL_NAME_POW_F16F16TOI8_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toI8_2D")
#define VX_KERNEL_NAME_POW_F16F16TOI16 CVIVANTE_NAMESPACE("evis.pow_F16F16toI16")
#define VX_KERNEL_NAME_POW_F16F16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_F16F16toI16_2D")
#define VX_KERNEL_NAME_POW_F16U8TOF16 CVIVANTE_NAMESPACE("evis.pow_F16U8toF16")
#define VX_KERNEL_NAME_POW_F16U8TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16U8toF16_2D")
#define VX_KERNEL_NAME_POW_F16I8TOF16 CVIVANTE_NAMESPACE("evis.pow_F16I8toF16")
#define VX_KERNEL_NAME_POW_F16I8TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16I8toF16_2D")
#define VX_KERNEL_NAME_POW_F16I16TOF16 CVIVANTE_NAMESPACE("evis.pow_F16I16toF16")
#define VX_KERNEL_NAME_POW_F16I16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_F16I16toF16_2D")
#define VX_KERNEL_NAME_POW_F16U8TOU8 CVIVANTE_NAMESPACE("evis.pow_F16U8toU8")
#define VX_KERNEL_NAME_POW_F16U8TOU8_2D CVIVANTE_NAMESPACE("evis.pow_F16U8toU8_2D")
#define VX_KERNEL_NAME_POW_F16I8TOI8 CVIVANTE_NAMESPACE("evis.pow_F16I8toI8")
#define VX_KERNEL_NAME_POW_F16I8TOI8_2D CVIVANTE_NAMESPACE("evis.pow_F16I8toI8_2D")
#define VX_KERNEL_NAME_POW_F16I16TOI16 CVIVANTE_NAMESPACE("evis.pow_F16I16toI16")
#define VX_KERNEL_NAME_POW_F16I16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_F16I16toI16_2D")
#define VX_KERNEL_NAME_POW_U8F16TOF16 CVIVANTE_NAMESPACE("evis.pow_U8F16toF16")
#define VX_KERNEL_NAME_POW_U8F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_U8F16toF16_2D")
#define VX_KERNEL_NAME_POW_I8F16TOF16 CVIVANTE_NAMESPACE("evis.pow_I8F16toF16")
#define VX_KERNEL_NAME_POW_I8F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_I8F16toF16_2D")
#define VX_KERNEL_NAME_POW_I16F16TOF16 CVIVANTE_NAMESPACE("evis.pow_I16F16toF16")
#define VX_KERNEL_NAME_POW_I16F16TOF16_2D CVIVANTE_NAMESPACE("evis.pow_I16F16toF16_2D")
#define VX_KERNEL_NAME_POW_U8F16TOU8 CVIVANTE_NAMESPACE("evis.pow_U8F16toU8")
#define VX_KERNEL_NAME_POW_U8F16TOU8_2D CVIVANTE_NAMESPACE("evis.pow_U8F16toU8_2D")
#define VX_KERNEL_NAME_POW_I8F16TOI8 CVIVANTE_NAMESPACE("evis.pow_I8F16toI8")
#define VX_KERNEL_NAME_POW_I8F16TOI8_2D CVIVANTE_NAMESPACE("evis.pow_I8F16toI8_2D")
#define VX_KERNEL_NAME_POW_I16F16TOI16 CVIVANTE_NAMESPACE("evis.pow_I16F16toI16")
#define VX_KERNEL_NAME_POW_I16F16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_I16F16toI16_2D")
#define VX_KERNEL_NAME_POW_U8U8TOU8 CVIVANTE_NAMESPACE("evis.pow_U8U8toU8")
#define VX_KERNEL_NAME_POW_U8U8TOU8_2D CVIVANTE_NAMESPACE("evis.pow_U8U8toU8_2D")
#define VX_KERNEL_NAME_POW_I8I8TOI8 CVIVANTE_NAMESPACE("evis.pow_I8I8toI8")
#define VX_KERNEL_NAME_POW_I8I8TOI8_2D CVIVANTE_NAMESPACE("evis.pow_I8I8toI8_2D")
#define VX_KERNEL_NAME_POW_I16I16TOI16 CVIVANTE_NAMESPACE("evis.pow_I16I16toI16")
#define VX_KERNEL_NAME_POW_I16I16TOI16_2D CVIVANTE_NAMESPACE("evis.pow_I16I16toI16_2D")
#define VX_KERNEL_NAME_POW_BF16BF16TOBF16 CVIVANTE_NAMESPACE("evis.pow_BF16BF16toBF16")
#define VX_KERNEL_NAME_POW_BF16BF16TOBF16_2D CVIVANTE_NAMESPACE("evis.pow_BF16BF16toBF16_2D")
#define VX_KERNEL_NAME_POW_U8U8TOF16 CVIVANTE_NAMESPACE("evis.pow_U8U8toF16")
#define VX_KERNEL_NAME_POW_U8U8TOF16_2D CVIVANTE_NAMESPACE("evis.pow_U8U8toF16_2D")
#define KERNEL_SOURCE_1 "pow_fp16",
#define KERNEL_SOURCE_2 "pow_fp16_i8",
#define KERNEL_SOURCE_3 "pow_fp16_i16",
#define KERNEL_SOURCE_4 "pow_u8",
#define KERNEL_SOURCE_5 "pow_i8",
#define KERNEL_SOURCE_6 "pow_i16"
#define KERNEL_SOURCE "pow",
#define HASH_POW_KEY(_input0_type, _input1_type, _output_type, _image_2d) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d))
#define TENSOR_POW_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
#define TENSOR_POW_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
{ HASH_POW_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \
VX_KERNEL_NAME_POW_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE, \
SOURCE },
CVIVANTE_NAMESPACE("evis.pow_"#IN0_TYPE"_"#IN1_TYPE"to"#OUT_TYPE), \
KERNEL_SOURCE },
#define TENSOR_POW_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \
#define TENSOR_POW_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE) \
{ HASH_POW_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \
VX_KERNEL_NAME_POW_##IN0_TYPE##IN1_TYPE##TO##OUT_TYPE##_2D, \
SOURCE },
CVIVANTE_NAMESPACE("evis.pow_"#IN0_TYPE"_"#IN1_TYPE"to"#OUT_TYPE"_2D"), \
KERNEL_SOURCE },
static const struct {
uint32_t key;
@ -108,59 +59,59 @@ static const struct {
const char* source_name;
} pow_map[] =
{
TENSOR_POW_KERNELS(F16, F16, F16, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS(F16, F16, U8, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS(F16, U8, F16, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS(F16, U8, U8, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS(F16, F16, F16)
TENSOR_POW_KERNELS(F16, F16, U8)
TENSOR_POW_KERNELS(F16, U8, F16)
TENSOR_POW_KERNELS(F16, U8, U8)
TENSOR_POW_KERNELS(F16, F16, I8, KERNEL_SOURCE_2)
TENSOR_POW_KERNELS(F16, I8, F16, KERNEL_SOURCE_2)
TENSOR_POW_KERNELS(F16, I8, I8, KERNEL_SOURCE_2)
TENSOR_POW_KERNELS(F16, F16, I8)
TENSOR_POW_KERNELS(F16, I8, F16)
TENSOR_POW_KERNELS(F16, I8, I8)
TENSOR_POW_KERNELS(F16, F16, I16, KERNEL_SOURCE_3)
TENSOR_POW_KERNELS(F16, I16, F16, KERNEL_SOURCE_3)
TENSOR_POW_KERNELS(F16, I16, I16, KERNEL_SOURCE_3)
TENSOR_POW_KERNELS(F16, F16, I16)
TENSOR_POW_KERNELS(F16, I16, F16)
TENSOR_POW_KERNELS(F16, I16, I16)
TENSOR_POW_KERNELS(U8, F16, F16, KERNEL_SOURCE_4)
TENSOR_POW_KERNELS(U8, F16, U8, KERNEL_SOURCE_4)
TENSOR_POW_KERNELS(U8, U8, U8, KERNEL_SOURCE_4)
TENSOR_POW_KERNELS(U8, U8, F16, KERNEL_SOURCE_4)
TENSOR_POW_KERNELS(U8, F16, F16)
TENSOR_POW_KERNELS(U8, F16, U8)
TENSOR_POW_KERNELS(U8, U8, U8)
TENSOR_POW_KERNELS(U8, U8, F16)
TENSOR_POW_KERNELS(I8, F16, F16, KERNEL_SOURCE_5)
TENSOR_POW_KERNELS(I8, F16, I8, KERNEL_SOURCE_5)
TENSOR_POW_KERNELS(I8, I8, I8, KERNEL_SOURCE_5)
TENSOR_POW_KERNELS(I8, F16, F16)
TENSOR_POW_KERNELS(I8, F16, I8)
TENSOR_POW_KERNELS(I8, I8, I8)
TENSOR_POW_KERNELS(I16, F16, F16, KERNEL_SOURCE_6)
TENSOR_POW_KERNELS(I16, F16, I16, KERNEL_SOURCE_6)
TENSOR_POW_KERNELS(I16, I16, I16, KERNEL_SOURCE_6)
TENSOR_POW_KERNELS(BF16, BF16, BF16, KERNEL_SOURCE_3)
TENSOR_POW_KERNELS(I16, F16, F16)
TENSOR_POW_KERNELS(I16, F16, I16)
TENSOR_POW_KERNELS(I16, I16, I16)
TENSOR_POW_KERNELS(BF16, BF16, BF16)
TENSOR_POW_KERNELS_2D(F16, F16, F16, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS_2D(F16, U8, F16, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS_2D(F16, U8, U8, KERNEL_SOURCE_1)
TENSOR_POW_KERNELS_2D(F16, F16, F16)
TENSOR_POW_KERNELS_2D(F16, F16, U8)
TENSOR_POW_KERNELS_2D(F16, U8, F16)
TENSOR_POW_KERNELS_2D(F16, U8, U8)
TENSOR_POW_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_2)
TENSOR_POW_KERNELS_2D(F16, I8, F16, KERNEL_SOURCE_2)
TENSOR_POW_KERNELS_2D(F16, I8, I8, KERNEL_SOURCE_2)
TENSOR_POW_KERNELS_2D(F16, F16, I8)
TENSOR_POW_KERNELS_2D(F16, I8, F16)
TENSOR_POW_KERNELS_2D(F16, I8, I8)
TENSOR_POW_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_3)
TENSOR_POW_KERNELS_2D(F16, I16, F16, KERNEL_SOURCE_3)
TENSOR_POW_KERNELS_2D(F16, I16, I16, KERNEL_SOURCE_3)
TENSOR_POW_KERNELS_2D(F16, F16, I16)
TENSOR_POW_KERNELS_2D(F16, I16, F16)
TENSOR_POW_KERNELS_2D(F16, I16, I16)
TENSOR_POW_KERNELS_2D(U8, F16, F16, KERNEL_SOURCE_4)
TENSOR_POW_KERNELS_2D(U8, F16, U8, KERNEL_SOURCE_4)
TENSOR_POW_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_4)
TENSOR_POW_KERNELS_2D(U8, U8, F16, KERNEL_SOURCE_4)
TENSOR_POW_KERNELS_2D(U8, F16, F16)
TENSOR_POW_KERNELS_2D(U8, F16, U8)
TENSOR_POW_KERNELS_2D(U8, U8, U8)
TENSOR_POW_KERNELS_2D(U8, U8, F16)
TENSOR_POW_KERNELS_2D(I8, F16, F16, KERNEL_SOURCE_5)
TENSOR_POW_KERNELS_2D(I8, F16, I8, KERNEL_SOURCE_5)
TENSOR_POW_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_5)
TENSOR_POW_KERNELS_2D(I8, F16, F16)
TENSOR_POW_KERNELS_2D(I8, F16, I8)
TENSOR_POW_KERNELS_2D(I8, I8, I8)
TENSOR_POW_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_6)
TENSOR_POW_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_6)
TENSOR_POW_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_6)
TENSOR_POW_KERNELS_2D(BF16, BF16, BF16, KERNEL_SOURCE_3)
TENSOR_POW_KERNELS_2D(I16, F16, F16)
TENSOR_POW_KERNELS_2D(I16, F16, I16)
TENSOR_POW_KERNELS_2D(I16, I16, I16)
TENSOR_POW_KERNELS_2D(BF16, BF16, BF16)
};
static vx_param_description_t vxPowKernel_param_def[] =
@ -186,24 +137,13 @@ DEF_KERNEL_INITIALIZER(_pow_initializer)
{0, 0, 0}, // localWorkSize: local group size in thread
{0, 0, 0}}; // globalWorkSize: image size in thread
int8_t in0_fl = 0;
int32_t src0ZP = 0;
float src0Scale = 1.0f;
int8_t in1_fl = 0;
int32_t src1ZP = 0;
float src1Scale = 1.0f;
int8_t out_fl = 0;
float dstZP = 0;
float dstScale = 1.0f;
float input0_scale = 1.0f;
float input1_scale = 1.0f;
float input0_tail = 0;
float input1_tail = 0;
float output_scale = 1.0f;
float output_zp = 0;
int32_t postshift0 = 0;
int32_t postshift1 = 0;
float outScale_fl = 1;
uint16_t M0 = 0;
uint16_t M1 = 0;
vsi_size_t zAx = 1;
uint32_t pack_key = 0;
// dim number ???
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL };
@ -220,58 +160,59 @@ DEF_KERNEL_INITIALIZER(_pow_initializer)
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
in0_fl = (int8_t)attr[0]->dfp.fl;
postshift0 = in0_fl - 0;
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
input0_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input0_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
src0ZP = attr[0]->asymm.zero_point;
src0Scale = attr[0]->asymm.scale;
gpu_quantize_multiplier_16bit(src0Scale / 1.0f, &M0, &postshift0);
input0_scale = attr[0]->asymm.scale;
input0_tail = 0 - (float)attr[0]->asymm.zero_point * input0_scale;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
in1_fl = (int8_t)attr[1]->dfp.fl;
postshift1 = in1_fl - 0;
int32_t fl = attr[1]->dfp.fl;
if (fl > 0)
{
input1_scale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
input1_scale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM)
{
src1ZP = attr[1]->asymm.zero_point;
src1Scale = attr[1]->asymm.scale;
gpu_quantize_multiplier_16bit(src1Scale / 1.0f, &M1, &postshift1);
input1_scale = attr[1]->asymm.scale;
input1_tail = 0 - (float)attr[1]->asymm.zero_point * input1_scale;
}
if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
out_fl = (int8_t)attr[2]->dfp.fl;
if (out_fl > 0)
int32_t fl = attr[2]->dfp.fl;
if (fl > 0)
{
outScale_fl = (vx_float32)((int64_t)1 << out_fl);
output_scale = (float) ((int64_t)1 << fl);
}
else
{
outScale_fl = (1.0f / (vx_float32)((int64_t)1 << -out_fl));
output_scale = 1.0f / (float)((int64_t)1 << -fl);
}
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM )
{
dstZP = (float)attr[2]->asymm.zero_point;
dstScale = 1.0f / attr[2]->asymm.scale;
}
if ( out_shape->size < 3 )
{
zAx = 1;
}
else
{
zAx = out_shape->data[2];
output_zp = (float)attr[2]->asymm.zero_point;
output_scale = 1.0f / attr[2]->asymm.scale;
}
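/*
 * Note (illustrative, not part of this patch): the branches above reduce every
 * quantization flavor to a scale/tail pair. For dynamic fixed point with
 * fractional length fl, the stored value is real * 2^fl, so the input
 * dequantize scale is 2^-fl and the output requantize scale is 2^fl; for
 * asymmetric/symmetric affine tensors the inputs use
 * input_tail = -zero_point * scale, and the output uses 1/scale plus the
 * zero point. A small helper pair expressing the same arithmetic:
 *
 *   #include <stdint.h>
 *
 *   static float dfp_input_scale(int32_t fl)   // dequantize: q -> real
 *   {
 *       return (fl > 0) ? 1.0f / (float)((int64_t)1 << fl)
 *                       : (float)((int64_t)1 << -fl);
 *   }
 *
 *   static float dfp_output_scale(int32_t fl)  // requantize: real -> q
 *   {
 *       return (fl > 0) ? (float)((int64_t)1 << fl)
 *                       : 1.0f / (float)((int64_t)1 << -fl);
 *   }
 */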
#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \
@ -287,269 +228,122 @@ DEF_KERNEL_INITIALIZER(_pow_initializer)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = gpu_align_p2((out_shape->data[1] + shaderParam.global_scale[1] - 1)
/ shaderParam.global_scale[1], 2);
shaderParam.global_size[2] = gpu_align_p2((zAx + shaderParam.global_scale[2] - 1)
/ shaderParam.global_scale[2], 1);
shaderParam.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
switch( pack_key )
{
gpu_dp_inst_t uniConvertFstDataToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertSecDataToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertFstDataToFp32_4x4_2 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertSecDataToFp32_4x4_2 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUint8SubZpToFp32_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertSecUint8SubZpToFp32_4x4 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00050004, 0x00070006, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertUint8SubZpToFp32_4x4_2 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00010000, 0x00030002, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertSecUint8SubZpToFp32_4x4_2 = {{
0x09090909, // TCfg
0x04040404, // ASelt
0x00050004, 0x00070006, // ABin
0x0a0a0a0a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00000000, 0x00010001, 0x00000000,
0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
uint32_t multiplierA = (M0 << 16) | M0;
uint32_t multiplierB = (M1 << 16) | M1;
int32_t i = 8;
uniConvertUint8SubZpToFp32_4x4.data[7] |= (postshift0 & 0x1F);
uniConvertSecUint8SubZpToFp32_4x4.data[7] |= (postshift0 & 0x1F);
uniConvertUint8SubZpToFp32_4x4_2.data[7] |= (postshift1 & 0x1F);
uniConvertSecUint8SubZpToFp32_4x4_2.data[7] |= (postshift1 & 0x1F);
for ( i = 8; i < 16; i += 2 )
case _PACK_SELECT_KEY( BF16, BF16, BF16 ):
{
uniConvertUint8SubZpToFp32_4x4.data[i] = multiplierA;
uniConvertSecUint8SubZpToFp32_4x4.data[i] = multiplierA;
uniConvertUint8SubZpToFp32_4x4_2.data[i] = multiplierB;
uniConvertSecUint8SubZpToFp32_4x4_2.data[i] = multiplierB;
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8",
&uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8",
&uniConvBF16toF32_Part1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8",
&uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
if ( attr[0]->dtype == I8 || attr[0]->dtype == I16 )
break;
default:
{
gpu_dp_inst_update_postshfit( &uniConvertFstDataToFp32_4x4, postshift0 );
gpu_dp_inst_update_postshfit( &uniConvertSecDataToFp32_4x4, postshift0 );
}
gpu_dp_inst_t uniConvertFstDataToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvertSecDataToFp32_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtact8Bit_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtactHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16};
if ( attr[1]->dtype == I8 || attr[1]->dtype == I16 )
{
gpu_dp_inst_update_postshfit( &uniConvertFstDataToFp32_4x4_2, postshift1 );
gpu_dp_inst_update_postshfit( &uniConvertSecDataToFp32_4x4_2, postshift1 );
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4",
&uniConvertFstDataToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4",
&uniConvertSecDataToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "input0_scale", &input0_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "input1_scale", &input1_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "input0_tail", &input0_tail);
status |= vsi_nn_kernel_gpu_add_param( node, "input1_tail", &input1_tail);
status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale);
status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp);
if (attr[2]->dtype == F16)
{
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtact8Bit_2x8",
&uniExtactHalf8_2x8);
}
else
{
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtact8Bit_2x8",
&uniExtact8Bit_2x8);
}
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
switch( pack_key )
{
case _PACK_SELECT_KEY( F16, F16, I8 ):
case _PACK_SELECT_KEY( F16, I8, F16 ):
case _PACK_SELECT_KEY( F16, I8, I8 ):
case _PACK_SELECT_KEY( F16, F16, I16 ):
case _PACK_SELECT_KEY( F16, I16, F16 ):
case _PACK_SELECT_KEY( F16, I16, I16 ):
case _PACK_SELECT_KEY( I8, F16, F16 ):
case _PACK_SELECT_KEY( I8, F16, I8 ):
case _PACK_SELECT_KEY( I8, I8, I8 ):
case _PACK_SELECT_KEY( I16, F16, F16 ):
case _PACK_SELECT_KEY( I16, F16, I16 ):
case _PACK_SELECT_KEY( I16, I16, I16 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4",
&uniConvertFstDataToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4",
&uniConvertSecDataToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4_2",
&uniConvertFstDataToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4_2",
&uniConvertSecDataToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "outScale_fl", &outScale_fl);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U8, F16, F16 ):
case _PACK_SELECT_KEY( U8, F16, U8 ):
case _PACK_SELECT_KEY( U8, U8, U8 ):
case _PACK_SELECT_KEY( U8, U8, F16 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertUint8SubZpToFp32_4x4",
&uniConvertUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecUint8SubZpToFp32_4x4",
&uniConvertSecUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4_2",
&uniConvertFstDataToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4_2",
&uniConvertSecDataToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUint8SubZpToFp32_4x4_2",
&uniConvertUint8SubZpToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecUint8SubZpToFp32_4x4_2",
&uniConvertSecUint8SubZpToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
&uniConvertHalftoFp16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP0", &src0ZP);
status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP1", &src1ZP);
status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &dstZP);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( F16, F16, F16 ):
case _PACK_SELECT_KEY( F16, F16, U8 ):
case _PACK_SELECT_KEY( F16, U8, F16 ):
case _PACK_SELECT_KEY( F16, U8, U8 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertFstDataToFp32_4x4",
&uniConvertFstDataToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecDataToFp32_4x4",
&uniConvertSecDataToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUint8SubZpToFp32_4x4_2",
&uniConvertUint8SubZpToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecUint8SubZpToFp32_4x4_2",
&uniConvertSecUint8SubZpToFp32_4x4_2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8",
&uniConvertHalfToFp16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "input_ZP1", &src1ZP);
status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &dstZP);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, BF16 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8",
&uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8",
&uniConvBF16toF32_Part1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8",
&uniExtractOddData_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
default:
break;
}
#undef _PACK_SELECT_KEY
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
break;
}
#undef _PACK_SELECT_KEY
OnError:
if ( attr[0] )
@ -646,7 +440,6 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_POW_PARAM_NUM,
inputs, 2, outputs, 1 );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_POW_PARAM_NUM );
}
}
return node;
@ -655,4 +448,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( pow, _setup )
View File
@ -126,8 +126,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
outputScale = attr[0]->asymm.scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
@ -152,7 +150,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer)
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f/outputScale;
outputScale = 1.0f / attr[0]->asymm.scale;
dstZP = attr[0]->asymm.zero_point;
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
View File
@ -128,8 +128,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer)
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
out_shape = attr[0]->shape;
dstZP = (float)attr[0]->asymm.zero_point;
outputScale = attr[0]->asymm.scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
@ -147,7 +145,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer)
}
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f/outputScale;
outputScale = 1.0f / attr[0]->asymm.scale;
dstZP = (float)attr[0]->asymm.zero_point;
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
View File
@ -148,8 +148,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
dstScale = attr[0]->asymm.scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
@ -161,7 +159,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f / dstScale;
dstScale = 1.0f / attr[0]->asymm.scale;
dstZP = attr[0]->asymm.zero_point;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
View File
@ -35,13 +35,15 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
#define KERNEL_SOURCE_0 "pre_process_rgb888_planar_0",
#define KERNEL_SOURCE_1 "pre_process_rgb888_planar_1",
#define KERNEL_SOURCE_2 "pre_process_rgb888_planar_2",
#define RGB888_SEP_SOURCE_0 "pre_process_rgb888_planar_sep_0",
#define RGB888_SEP_SOURCE_1 "pre_process_rgb888_planar_sep_1",
#define RGB888_SEP_SOURCE_2 "pre_process_rgb888_planar_sep_2",
#define RGB888_SOURCE_0 "pre_process_rgb888_planar_0",
#define RGB888_SOURCE_1 "pre_process_rgb888_planar_1",
#define RGB888_SOURCE_2 "pre_process_rgb888_planar_2",
#define STR(a) #a
@ -53,28 +55,48 @@ typedef enum
HALF
} _internal_scale_e;
// Add kernel hashtable here
#define PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, SCALE_FLAG ) \
(( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8 ) | (SCALE_FLAG))
#define PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, SEP, SCALE_FLAG ) \
(( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8 ) | ( SEP << 4 ) | (SCALE_FLAG))
#define PACK_KERNEL_SCALE_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, SCALE ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
KERNEL_SOURCE_0 }
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, SCALE ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SOURCE_0 }
#define PACK_KERNEL_SEP_SCALE_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, SCALE ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_scale_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SEP_SOURCE_0 }
#define PACK_KERNEL_COPY_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, COPY ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
KERNEL_SOURCE_1 }
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, COPY ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SOURCE_1 }
#define PACK_KERNEL_SEP_COPY_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, COPY ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_copy_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SEP_SOURCE_1 }
#define PACK_KERNEL_4_OVER_3_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, FOUR_OVER_THREE ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_4over3_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
KERNEL_SOURCE_2 }
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, FOUR_OVER_THREE ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_4over3_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SOURCE_2 }
#define PACK_KERNEL_SEP_4_OVER_3_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, FOUR_OVER_THREE ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_4over3_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SEP_SOURCE_2 }
#define PACK_KERNEL_HALF_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, HALF ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
KERNEL_SOURCE_2 }
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, HALF ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SOURCE_2 }
#define PACK_KERNEL_SEP_HALF_MAP( IN_DTYPE, OUT_DTYPE ) \
{ PRE_PROCESS_RGB888_PLANAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1, HALF ), \
CVIVANTE_NAMESPACE("evis.pre_process_rgb888_planar_sep_half_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \
RGB888_SEP_SOURCE_2 }
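/*
 * Note (illustrative, not part of this patch): the hash key gains a SEP bit so
 * one table can hold both the packed rgb888_planar kernels and the new *_sep
 * variants that read three separate R/G/B plane tensors; the caller derives
 * SEP from whether a second input plane was supplied. A minimal sketch of that
 * selection, with hypothetical argument names:
 *
 *   #include <stddef.h>
 *   #include <stdint.h>
 *
 *   #define HASH(in, out, sep, scale) \
 *       (((uint32_t)(in) << 16) | ((out) << 8) | ((sep) << 4) | (scale))
 *
 *   static uint32_t make_key(uint32_t in_dt, uint32_t out_dt,
 *                            uint32_t scale_type, const void *second_plane)
 *   {
 *       uint32_t sep = (second_plane != NULL) ? 1u : 0u;
 *       return HASH(in_dt, out_dt, sep, scale_type);
 *   }
 */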
typedef struct
{
@ -98,6 +120,19 @@ static const _kernel_map_type pre_process_rgb888_planar_kernel_map[] =
PACK_KERNEL_4_OVER_3_MAP( U8, U8 ),
PACK_KERNEL_HALF_MAP( U8, U8 ),
PACK_KERNEL_SEP_SCALE_MAP( U8, F16 ),
PACK_KERNEL_SEP_SCALE_MAP( U8, I16 ),
PACK_KERNEL_SEP_SCALE_MAP( U8, I8 ),
PACK_KERNEL_SEP_SCALE_MAP( U8, U8 ),
PACK_KERNEL_SEP_COPY_MAP( U8, F16 ),
PACK_KERNEL_SEP_COPY_MAP( U8, I16 ),
PACK_KERNEL_SEP_COPY_MAP( U8, I8 ),
PACK_KERNEL_SEP_COPY_MAP( U8, U8 ),
PACK_KERNEL_SEP_4_OVER_3_MAP( U8, U8 ),
PACK_KERNEL_SEP_HALF_MAP( U8, U8 ),
};
@ -105,6 +140,23 @@ static const _kernel_map_type pre_process_rgb888_planar_kernel_map[] =
* Kernel params
*/
static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def )
static vx_param_description_t _pre_process_rgb888_planar_sep_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
@ -121,7 +173,7 @@ static vx_param_description_t _pre_process_rgb888_planar_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def )
#define _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def )
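/*
 * Note (illustrative, not part of this patch): the sep parameter table carries
 * three input plane tensors, so its first output sits at param[3]; the packed
 * table has a single input and its first output sits at param[1]. The
 * initializers below use param_size to pick the right index before reading the
 * output tensor attribute and the trailing scale scalar. Sketch of that rule:
 *
 *   #include <stddef.h>
 *
 *   static size_t first_output_index(size_t param_count, size_t sep_param_count)
 *   {
 *       return (param_count == sep_param_count) ? 3u : 1u;
 *   }
 */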
/*
* Kernel initializer
@ -149,9 +201,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ))
{
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
}
else
{
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
}
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &output_scale);
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[param_size - 1], &output_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
@ -310,9 +369,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer)
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_size_array_t * out_shape = NULL;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ))
{
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
}
else
{
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
}
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[13], &output_scale);
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[param_size - 1], &output_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
@ -406,7 +472,14 @@ DEF_KERNEL_INITIALIZER(_resize_rgb888_planar_initializer)
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
if (param_size == _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def ))
{
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
}
else
{
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
}
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
out_shape = attr[1]->shape;
@ -540,6 +613,7 @@ static vsi_status _query_kernel
vsi_bool is_4_over_3 = FALSE;
vsi_bool is_half_scale = FALSE;
vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" );
vsi_bool is_rgb888_sep = (vsi_bool)(inputs[1] != NULL);
is_4_over_3 = (width * 3 == (int32_t)outputs[0]->attr.size[0] * 4) &&
(height * 3 == (int32_t)outputs[0]->attr.size[1] * 4);
@ -568,7 +642,7 @@ static vsi_status _query_kernel
}
}
key = PRE_PROCESS_RGB888_PLANAR_HASH_KEY( input0_dtype, output_dtype, scale_type);
key = PRE_PROCESS_RGB888_PLANAR_HASH_KEY( input0_dtype, output_dtype, is_rgb888_sep, scale_type);
for ( i = 0; i < _cnt_of_array(pre_process_rgb888_planar_kernel_map); i ++ )
{
@ -581,8 +655,17 @@ static vsi_status _query_kernel
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s",
pre_process_rgb888_planar_kernel_map[i].function_name );
kernel->info.parameters = _pre_process_rgb888_planar_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def );
if (is_rgb888_sep)
{
kernel->info.parameters = _pre_process_rgb888_planar_sep_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_sep_kernel_param_def );
}
else
{
kernel->info.parameters = _pre_process_rgb888_planar_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _pre_process_rgb888_planar_kernel_param_def );
}
if (enable_copy)
{
@ -620,8 +703,9 @@ static vsi_nn_kernel_node_t _setup
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_PRE_PROCESS_RGB888_PLANAR_PARAM_NUM];
vsi_nn_kernel_node_param_t* node_params = NULL;
vsi_nn_kernel_node_t node = NULL;
int32_t param_count = _PRE_PROCESS_RGB888_PLANAR_SEP_PARAM_NUM;
int32_t width = vsi_nn_kernel_param_get_int32( params, "width" );
int32_t height = vsi_nn_kernel_param_get_int32( params, "height" );
float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" );
@ -630,7 +714,10 @@ static vsi_nn_kernel_node_t _setup
float scale = vsi_nn_kernel_param_get_float32( params, "scale" );
vsi_bool is_no_range_change = FALSE;
if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
input_num = inputs[1] == NULL ? 1 : input_num;
param_count = inputs[1] == NULL ? _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM : param_count;
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
@ -648,17 +735,19 @@ static vsi_nn_kernel_node_t _setup
status = _query_kernel( inputs, outputs, kernel, params, is_no_range_change, width, height );
if ( VSI_SUCCESS == status)
{
node_params = (vsi_nn_kernel_node_param_t *)malloc(sizeof(vsi_nn_kernel_node_param_t) * param_count);
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 6;
uint32_t index = inputs[1] == NULL ? 4 : 6;
uint32_t scalar_index = index;
int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" );
int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" );
int32_t left = vsi_nn_kernel_param_get_int32( params, "left" );
int32_t top = vsi_nn_kernel_param_get_int32( params, "top" );
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM,
vsi_nn_kernel_node_pack_io( node_params, param_count,
inputs, input_num, outputs, output_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x );
@ -670,17 +759,21 @@ static vsi_nn_kernel_node_t _setup
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &scale );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _PRE_PROCESS_RGB888_PLANAR_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
status = vsi_nn_kernel_node_pass_param( node, node_params, param_count );
index = scalar_index;
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
vsi_nn_kernel_scalar_release( &node_params[index++] );
}
}
vsi_nn_safe_free(node_params);
return node;
} /* _setup() */
View File
@ -150,8 +150,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
outputZP = (float)attr[0]->asymm.zero_point;
outputScale = attr[0]->asymm.scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
@ -176,7 +174,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
outputScale = 1.0f / outputScale;
outputScale = 1.0f / attr[0]->asymm.scale;
outputZP = (float)attr[0]->asymm.zero_point;
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
View File
@ -135,8 +135,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
dstScale = attr[0]->asymm.scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
@ -151,9 +149,22 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
width = width / 3;
}
if (attr[0]->dtype == U8)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f / dstScale;
dstScale = 1.0f / attr[0]->asymm.scale;
dstZP = attr[0]->asymm.zero_point;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
dstZP = 0;
}
shaderParam.global_scale[0] = 16;
View File
@ -130,8 +130,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
out_shape = attr[0]->shape;
dstZP = attr[0]->asymm.zero_point;
dstScale = attr[0]->asymm.scale;
width = (uint32_t)(out_shape->data[0]);
height = (uint32_t)(out_shape->data[1]);
@ -141,9 +139,22 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
order1 = 0;
}
if (attr[0]->dtype == U8)
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
dstScale = 1.0f / dstScale;
dstScale = 1.0f / attr[0]->asymm.scale;
dstZP = attr[0]->asymm.zero_point;
}
else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[0]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[0]->dfp.fl);
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl));
}
dstZP = 0;
}
shaderParam.global_scale[0] = 16;
View File
@ -51,6 +51,7 @@ typedef enum
UP_3X_HALF,
UP_4X_HALF,
UP_8X_HALF,
UP_8X_ALIGN,
} _internal_scale_e;
#define _RESIZE_BILINEAR_KERNEL_SOURCE(_input_type) "resize_bilinear_"#_input_type
@ -102,6 +103,12 @@ typedef enum
"_SAME_3x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_8X_ALIGN( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_ALIGN ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_8x_upsample_align_corners"), \
"resize_bilinear_align_corners" }
typedef struct
{
uint32_t key;
@ -128,6 +135,7 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] =
PACK_KERNEL_MAP_UP_3X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_4X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_8X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_8X_ALIGN(U8, U8),
};
@ -228,11 +236,6 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
uint32_t out_height;
float half_pixel_value = 0.0f;
vsi_bool is_use_scale_kernel = (vsi_bool)(_RESIZE_BILINEAR_PARAM_NUM == param_size);
vsi_bool is_half_pixel_centers = FALSE;
vsi_bool is_2x_up_kernel = FALSE;
vsi_bool is_3x_up_kernel = FALSE;
vsi_bool is_4x_up_kernel = FALSE;
vsi_bool is_8x_up_kernel = FALSE;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
@ -257,20 +260,20 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
if (align_corners && out_width > 1)
{
scale_factor[0] = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1);
scale_factor[0] = ((float)(in_width - 1) * 1.0f) / (float)(out_width - 1);
}
else
{
scale_factor[0] = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width;
scale_factor[0] = ((float)in_width * 1.0f) / (float)out_width;
}
if (align_corners && out_height > 1)
{
scale_factor[1] = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1);
scale_factor[1] = ((float)(in_height - 1) * 1.0f) / (float)(out_height - 1);
}
else
{
scale_factor[1] = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height;
scale_factor[1] = ((float)in_height * 1.0f) / (float)out_height;
}
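/*
 * Note (illustrative, not part of this patch): with align_corners the first and
 * last output samples land exactly on the first and last input pixels, so the
 * step is (in - 1) / (out - 1); otherwise the step is in / out, and
 * half_pixel_centers additionally offsets the sample position by half a pixel
 * (the half_pixel_value set just below). A standalone sketch of the source
 * coordinate computation under those conventions:
 *
 *   static float resize_src_coord(int dst, int in, int out,
 *                                 int align_corners, int half_pixel_centers)
 *   {
 *       float scale = (align_corners && out > 1)
 *                   ? (float)(in - 1) / (float)(out - 1)
 *                   : (float)in / (float)out;
 *       return half_pixel_centers ? ((float)dst + 0.5f) * scale - 0.5f
 *                                 : (float)dst * scale;
 *   }
 */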
if (half_pixel_centers)
@ -282,16 +285,6 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
half_pixel_value = 0.0f;
}
is_half_pixel_centers = (!align_corners) && (half_pixel_centers);
if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr)) && is_half_pixel_centers)
{
is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height);
is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height);
is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height);
is_8x_up_kernel = (8 * in_width == out_width) && (8 * in_height == out_height);
}
if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input_scale = input_attr->asymm.scale;
@ -302,11 +295,11 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
srcFixPointPos = input_attr->dfp.fl;
if (srcFixPointPos >= 0)
{
input_scale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos);
input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos);
}
else if (srcFixPointPos < 0)
{
input_scale = (vx_float32)((int64_t)1 << -srcFixPointPos);
input_scale = (float)((int64_t)1 << -srcFixPointPos);
}
inputZP = 0;
}
@ -326,11 +319,11 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
dstFixPointPos = output_attr->dfp.fl;
if (dstFixPointPos >= 0)
{
output_scale = (vx_float32) ((int64_t)1 << dstFixPointPos);
output_scale = (float) ((int64_t)1 << dstFixPointPos);
}
else if (dstFixPointPos < 0)
{
output_scale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos);
output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos);
}
outputZP = 0;
}
@ -340,226 +333,11 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
outputZP = 0;
}
if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel)
{
gpu_param.global_scale[0] = 16;
gpu_param.global_scale[1] = 1;
}
else if (is_3x_up_kernel)
{
gpu_param.global_scale[0] = 15;
gpu_param.global_scale[1] = 6;
gpu_param.global_scale[2] = 1;
}
else
{
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
if (is_2x_up_kernel)
{
gpu_dp_inst_t uniResize2xUp_0_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize2xUp_1_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_3x_up_kernel)
{
gpu_dp_inst_t uniResize3xUp_l00_2x8 = {{
0x15515515, // TCfg
0x00000000, // ASelt
0x21210110, 0x03323202, // ABin
0x2aa2aa2a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555,
0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l01_2x8 = {{
0x05155155, // TCfg
0x00000000, // ASelt
0x54044343, 0x00650554, // ABin
0x0a2aa2aa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa,
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l10_4x4 = {{
0x55551155, // TCfg
0x50501050, // ASelt
0x01011010, 0x21212121, // ABin
0xaaaa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l11_4x4 = {{
0x11555511, // TCfg
0x10505010, // ASelt
0x32320202, 0x03033232, // ABin
0x22aaaa22, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72,
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l12_4x4 = {{
0x55115555, // TCfg
0x50105050, // ASelt
0x43434343, 0x54540404, // ABin
0xaa22aaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39,
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l13_4x4 = {{
0x00551155, // TCfg
0x00501050, // ASelt
0x05055454, 0x00006565, // ABin
0x00aa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_4x_up_kernel)
{
gpu_dp_inst_t uniResize4xUp_l00_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l01_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l10_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l11_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_8x_up_kernel)
{
gpu_dp_inst_t uniResize8xUp_l00_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l01_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l10_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l11_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l20_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l21_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l30_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l31_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
float dfpScale = input_scale * output_scale;
gpu_dp_inst_t uniConvertDFP2FP32_4x4 = {{
@ -840,7 +618,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
else if (F16 == input_dtype && (U8 == output_dtype || F16 == output_dtype))
{
float uint8Scale = 1.0f / output_scale;
float uint8ZP_out = (vx_float32)outputZP;
float uint8ZP_out = (float)outputZP;
gpu_dp_inst_t uniExtact8Bit_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
@ -1045,11 +823,299 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
goto final;
}
if (!is_2x_up_kernel && !is_3x_up_kernel && !is_4x_up_kernel&& !is_8x_up_kernel)
status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value);
CHECK_STATUS_FAIL_GOTO(status, final );
gpu_param.global_size[0] = gpu_align_p2((out_width + \
gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1];
gpu_param.global_size[2] = depth / gpu_param.global_scale[2];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr );
if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr );
return status;
} /* _resize_bilinear_initializer() */
DEF_KERNEL_INITIALIZER(_bilinear_half_pixel_centers_opt_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * out_shape = NULL;
vsi_size_array_t * in_shape = NULL;
vsi_nn_kernel_dtype_e input_dtype = F16;
uint32_t depth = 0;
uint32_t in_width = 0;
uint32_t in_height = 0;
uint32_t out_width = 0;
uint32_t out_height = 0;
vsi_bool is_2x_up_kernel = FALSE;
vsi_bool is_3x_up_kernel = FALSE;
vsi_bool is_4x_up_kernel = FALSE;
vsi_bool is_8x_up_kernel = FALSE;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
out_shape = output_attr->shape;
in_shape = input_attr->shape;
input_dtype = input_attr->dtype;
in_width = (uint32_t)(in_shape->data[0]);
in_height = (uint32_t)(in_shape->data[1]);
depth = (uint32_t)(in_shape->data[2]);
out_width = (uint32_t)(out_shape->data[0]);
out_height = (uint32_t)(out_shape->data[1]);
if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr)))
{
status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value);
is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height);
is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height);
is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height);
is_8x_up_kernel = (8 * in_width == out_width) && (8 * in_height == out_height);
}
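/*
 * Note (illustrative, not part of this patch): this optimized path only fires
 * for U8 tensors whose input and output quantization match and whose spatial
 * dims are exact 2x/3x/4x/8x upsamples, so the bilinear weights can be baked
 * into the DP instructions below. A compact form of the same factor check:
 *
 *   #include <stddef.h>
 *   #include <stdint.h>
 *
 *   static int integer_upscale_factor(uint32_t in_w, uint32_t in_h,
 *                                     uint32_t out_w, uint32_t out_h)
 *   {
 *       static const uint32_t factors[] = { 2, 3, 4, 8 };
 *       size_t i;
 *       for (i = 0; i < sizeof(factors) / sizeof(factors[0]); i++) {
 *           if (factors[i] * in_w == out_w && factors[i] * in_h == out_h) {
 *               return (int)factors[i];
 *           }
 *       }
 *       return 0;  /* no dedicated fast path */
 *   }
 */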
if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel)
{
gpu_param.global_scale[0] = 16;
gpu_param.global_scale[1] = 1;
}
else if (is_3x_up_kernel)
{
gpu_param.global_scale[0] = 15;
gpu_param.global_scale[1] = 6;
gpu_param.global_scale[2] = 1;
}
else
{
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
if (is_2x_up_kernel)
{
gpu_dp_inst_t uniResize2xUp_0_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize2xUp_1_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_3x_up_kernel)
{
gpu_dp_inst_t uniResize3xUp_l00_2x8 = {{
0x15515515, // TCfg
0x00000000, // ASelt
0x21210110, 0x03323202, // ABin
0x2aa2aa2a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555,
0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l01_2x8 = {{
0x05155155, // TCfg
0x00000000, // ASelt
0x54044343, 0x00650554, // ABin
0x0a2aa2aa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa,
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l10_4x4 = {{
0x55551155, // TCfg
0x50501050, // ASelt
0x01011010, 0x21212121, // ABin
0xaaaa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l11_4x4 = {{
0x11555511, // TCfg
0x10505010, // ASelt
0x32320202, 0x03033232, // ABin
0x22aaaa22, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72,
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l12_4x4 = {{
0x55115555, // TCfg
0x50105050, // ASelt
0x43434343, 0x54540404, // ABin
0xaa22aaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39,
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l13_4x4 = {{
0x00551155, // TCfg
0x00501050, // ASelt
0x05055454, 0x00006565, // ABin
0x00aa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_4x_up_kernel)
{
gpu_dp_inst_t uniResize4xUp_l00_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l01_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l10_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l11_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_8x_up_kernel)
{
gpu_dp_inst_t uniResize8xUp_l00_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l01_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l10_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l11_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l20_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l21_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l30_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l31_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
VSILOGE("input or output's format is not support");
status = VSI_FAILURE;
goto final;
}
if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel)
{
@ -1071,7 +1137,168 @@ final:
if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr );
if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr );
return status;
} /* _resize_bilinear_initializer() */
} /* _bilinear_half_pixel_centers_opt_initializer() */
DEF_KERNEL_INITIALIZER(_bilinear_align_corners_opt_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * out_shape = NULL;
vsi_size_array_t * in_shape = NULL;
vsi_nn_kernel_dtype_e input_dtype = F16;
uint32_t depth = 0;
float scale_factor[2] = {0};
uint32_t in_width = 0;
uint32_t in_height = 0;
uint32_t out_width = 0;
uint32_t out_height = 0;
vsi_bool is_8x_align_corners = FALSE;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
out_shape = output_attr->shape;
in_shape = input_attr->shape;
input_dtype = input_attr->dtype;
in_width = (uint32_t)(in_shape->data[0]);
in_height = (uint32_t)(in_shape->data[1]);
depth = (uint32_t)(in_shape->data[2]);
out_width = (uint32_t)(out_shape->data[0]);
out_height = (uint32_t)(out_shape->data[1]);
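    /* Per-axis scale factor with align_corners semantics:
       (in - 1) / (out - 1) when the output size is greater than 1, otherwise in / out. */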
if (out_width > 1)
{
scale_factor[0] = ((float)(in_width - 1) * 1.0f) / (float)(out_width - 1);
}
else
{
scale_factor[0] = ((float)in_width * 1.0f) / (float)out_width;
}
if (out_height > 1)
{
scale_factor[1] = ((float)(in_height - 1) * 1.0f) / (float)(out_height - 1);
}
else
{
scale_factor[1] = ((float)in_height * 1.0f) / (float)out_height;
}
if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr)))
{
        is_8x_align_corners = (scale_factor[0] == scale_factor[1]) && (scale_factor[0] == 0.125f);
}
if (is_8x_align_corners)
{
gpu_param.global_scale[0] = 2;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
if (is_8x_align_corners)
{
gpu_dp_inst_t uniBilinear_8x_l10_4x8 = {{
0x55555505, 0x55555555, // TCfg
0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x00000838, 0x01070731, 0x02060e2a, 0x03051523,
0x04041c1c, 0x05032315, 0x06022a0e, 0x07013107 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniBilinear_8x_l11_4x8 = {{
0x55555505, 0x55555555, // TCfg
0x44100221, 0x19444194, 0x44419444, 0x41944419, 0x94441944, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x00000838, 0x01070731, 0x02060e2a, 0x03051523,
0x04041c1c, 0x05032315, 0x06022a0e, 0x07013107 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniBilinear_8x_l20_4x8 = {{
0x55555505, 0x55555555, // TCfg
0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x00001030, 0x020e062a, 0x040c0c24, 0x060a121e,
0x08081818, 0x0a061e12, 0x0c04240c, 0x0e022a06 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniBilinear_8x_l21_4x8 = {{
0x55555505, 0x55555555, // TCfg
0x44100221, 0x19444194, 0x44419444, 0x41944419, 0x94441944, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x00001030, 0x020e062a, 0x040c0c24, 0x060a121e,
0x08081818, 0x0a061e12, 0x0c04240c, 0x0e022a06 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniBilinear_8x_l30_4x8 = {{
0x55555505, 0x55555555, // TCfg
0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x00001828, 0x03150523, 0x06120a1e, 0x090f0f19,
0x0c0c1414, 0x0f09190f, 0x12061e0a, 0x15032305 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniBilinear_8x_l31_4x8 = {{
0x55555505, 0x55555555, // TCfg
0x44100221, 0x19444194, 0x44419444, 0x41944419, 0x94441944, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x00001828, 0x03150523, 0x06120a1e, 0x090f0f19,
0x0c0c1414, 0x0f09190f, 0x12061e0a, 0x15032305 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniBilinear_8x_l40_4x8 = {{
0x55555505, 0x55555555, // TCfg
0x02000200, 0x08c0208c, 0xc0208c02, 0x208c0208, 0x8c0208c0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x00002020, 0x041c041c, 0x08180818, 0x0c140c14,
0x10101010, 0x140c140c, 0x18081808, 0x1c041c04 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniBilinear_8x_l41_4x8 = {{
0x55555505, 0x55555555, // TCfg
0x44100221, 0x19444194, 0x44419444, 0x41944419, 0x94441944, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x00002020, 0x041c041c, 0x08180818, 0x0c140c14,
0x10101010, 0x140c140c, 0x18081808, 0x1c041c04 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l10_4x8", &uniBilinear_8x_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l11_4x8", &uniBilinear_8x_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l20_4x8", &uniBilinear_8x_l20_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l21_4x8", &uniBilinear_8x_l21_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l30_4x8", &uniBilinear_8x_l30_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l31_4x8", &uniBilinear_8x_l31_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l40_4x8", &uniBilinear_8x_l40_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniBilinear_8x_l41_4x8", &uniBilinear_8x_l41_4x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
VSILOGE("input or output's format is not support");
status = VSI_FAILURE;
goto final;
}
gpu_param.global_size[0] = gpu_align_p2((in_width + \
gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (in_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1];
gpu_param.global_size[2] = depth / gpu_param.global_scale[2];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr );
if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr );
return status;
} /* _bilinear_align_corners_opt_initializer() */
/*
* Query kernel
@ -1098,19 +1325,46 @@ static vsi_status _query_kernel
vx_kernel_initialize_f initializer = _resize_bilinear_initializer;
uint32_t key;
uint32_t i;
vsi_bool is_2x_upsample =(2 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
&& (2 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
vsi_bool is_3x_upsample =(3 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
&& (3 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
vsi_bool is_4x_upsample =(4 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
&& (4 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
vsi_bool is_8x_upsample =(8 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
&& (8 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
float width_scale = 0;
float height_scale = 0;
vsi_size_t input_width = inputs[0]->attr.size[0];
vsi_size_t input_height = inputs[0]->attr.size[1];
vsi_size_t output_width = outputs[0]->attr.size[0];
vsi_size_t output_height = outputs[0]->attr.size[1];
vsi_bool is_2x_upsample =(2 * input_width == output_width) \
&& (2 * input_height == output_height);
vsi_bool is_3x_upsample =(3 * input_width == output_width) \
&& (3 * input_height == output_height);
vsi_bool is_4x_upsample =(4 * input_width == output_width) \
&& (4 * input_height == output_height);
vsi_bool is_8x_upsample =(8 * input_width == output_width) \
&& (8 * input_height == output_height);
vsi_bool is_8x_align_corners = FALSE;
_internal_scale_e scale_flag = UP;
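    /* Compute the effective width/height scales used to detect the 8x align_corners fast path below. */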
    if (align_corners && output_width > 1)
{
width_scale = ((float)(input_width - 1) * 1.0f) / (float)(output_width - 1);
}
else
{
width_scale = ((float)input_width * 1.0f) / (float)output_width;
}
if (align_corners && output_height > 1)
{
height_scale = ((float)(input_height - 1) * 1.0f) / (float)(output_height - 1);
}
else
{
height_scale = ((float)input_height * 1.0f) / (float)output_height;
}
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
is_8x_align_corners = (vsi_bool)( width_scale == 0.125f && height_scale == 0.125f && in_dtype == U8 );
is_2x_upsample &= (in_dtype == U8);
is_3x_upsample &= (in_dtype == U8);
is_4x_upsample &= (in_dtype == U8);
@ -1121,18 +1375,27 @@ static vsi_status _query_kernel
if (is_same_type && (!align_corners) && (half_pixel_centers) && is_2x_upsample)
{
scale_flag = UP_2X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer;
}
else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_3x_upsample)
{
scale_flag = UP_3X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer;
}
else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_4x_upsample)
{
scale_flag = UP_4X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer;
}
else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_8x_upsample)
{
scale_flag = UP_8X_HALF;
initializer = _bilinear_half_pixel_centers_opt_initializer;
}
else if (is_same_type && (align_corners) && (!half_pixel_centers) && is_8x_align_corners)
{
scale_flag = UP_8X_ALIGN;
initializer = _bilinear_align_corners_opt_initializer;
}
else if (is_same_type && is_evis2)
{
@ -1240,20 +1503,20 @@ static vsi_nn_tensor_t* _create_scale_tensor
if (align_corners && width > 1)
{
width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(width - 1);
width_scale = ((float)(input_width - 1) * 1.0f) / (float)(width - 1);
}
else
{
width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)width;
width_scale = ((float)input_width * 1.0f) / (float)width;
}
if (align_corners && height > 1)
{
height_scale = ((vx_float32)(input_height - 1) * 1.0f) / (vx_float32)(height - 1);
height_scale = ((float)(input_height - 1) * 1.0f) / (float)(height - 1);
}
else
{
height_scale = ((vx_float32)input_height * 1.0f) / (vx_float32)height;
height_scale = ((float)input_height * 1.0f) / (float)height;
}
@ -1273,7 +1536,7 @@ static vsi_nn_tensor_t* _create_scale_tensor
int32_t h0 = 0;
if (half_pixel_centers)
{
input_h = ((vx_float32)y + 0.5f) * height_scale - 0.5f;
input_h = ((float)y + 0.5f) * height_scale - 0.5f;
}
else
{
@ -1291,7 +1554,7 @@ static vsi_nn_tensor_t* _create_scale_tensor
float br = 0.0f;
if (half_pixel_centers)
{
input_w = ((vx_float32)x + 0.5f) * width_scale - 0.5f;
input_w = ((float)x + 0.5f) * width_scale - 0.5f;
}
else
{

View File

@ -51,6 +51,15 @@ __BEGIN_DECLS
"_"STR(UP_SCALE)"x_upsample_half_pixel_centers"), \
"resize_bilinear_nhwc" }
#define BILINEAR_NHWC_BOUND_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_SCALE ) \
(( IN_DTYPE ) | ( OUT_DTYPE << 8) | (UP_SCALE << 16))
#define BILINEAR_NHWC_BOUND_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, UP_SCALE ) \
{ BILINEAR_NHWC_BOUND_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_SCALE ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_nhwc_bound_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_"STR(UP_SCALE)"x"), \
"resize_bilinear_nhwc_bound" }
typedef struct
{
uint32_t key;
@ -65,6 +74,12 @@ static const _kernel_map_type _resize_bilinear_nhwc_kernel_map[] =
BILINEAR_NHWC_PACK_KERNEL_MAP_UP_SCALE(U8, U8, 1, 0, 4),
};
static const _kernel_map_type _bilinear_nhwc_bound_kernel_map[] =
{
BILINEAR_NHWC_BOUND_KERNEL_MAP(U8, U8, 2),
BILINEAR_NHWC_BOUND_KERNEL_MAP(U8, U8, 3),
BILINEAR_NHWC_BOUND_KERNEL_MAP(U8, U8, 4),
};
/*
* Kernel params
@ -81,6 +96,14 @@ static vx_param_description_t _resize_bilinear_nhwc_kernel_param_def[] =
#define SCALAR_ALIGN_CORNERS (2)
#define SCALAR_HALF_PIXEL (3)
static vx_param_description_t _bilinear_nhwc_bound_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
};
#define _BILINEAR_NHWC_BOUND_PARAM_NUM _cnt_of_array( _bilinear_nhwc_bound_kernel_param_def )
/*
* Kernel initializer
*/
@ -382,50 +405,193 @@ final:
return status;
} /* _resize_bilinear_initializer() */
DEF_KERNEL_INITIALIZER(_bilinear_nhwc_bound_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * in_shape = NULL;
vsi_size_array_t * out_shape = NULL;
uint32_t x_coord[2] = {0};
uint32_t in_width;
uint32_t in_height;
uint32_t out_width;
uint32_t out_height;
vsi_bool is_2x_up_kernel = FALSE;
vsi_bool is_3x_up_kernel = FALSE;
vsi_bool is_4x_up_kernel = FALSE;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
in_shape = input_attr->shape;
out_shape = output_attr->shape;
in_width = (uint32_t)(in_shape->data[0]);
in_height = (uint32_t)(in_shape->data[1]);
out_width = (uint32_t)(out_shape->data[0]);
out_height = (uint32_t)(out_shape->data[1]);
is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height);
is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height);
is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height);
if (is_2x_up_kernel)
{
gpu_dp_inst_t uniResize_x2_nhwc2_0_4x8 = {{
0x55555511, 0x55555555, // TCfg
0x46104000, 0x3a48829c, 0x4882acca, 0xc4acca3a, 0xbd4e5b50, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x000c0004, 0x09030301, 0x03090103, 0x03090103,
0x09030301, 0x09030301, 0x03090103, 0x03090103 // Constant
}, GPU_DP_TYPE_16};
gpu_param.global_scale[0] = 2;
gpu_param.global_scale[1] = 1;
x_coord[1] = (uint32_t)(out_shape->data[0]) - 2;
x_coord[0] = (x_coord[1] * 2 - 1) >> 2;
status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x2_nhwc2_0_4x8", &uniResize_x2_nhwc2_0_4x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_3x_up_kernel)
{
gpu_dp_inst_t uniResize_x3_nhwc2_l10_4x4 = {{
0x05055511, // TCfg
0x04045010, // ASelt
0x31310000, 0x00330022, // ABin
0x0a0aaa22, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x00005556, 0x00002aab, 0x38e41c72, 0x1c720e39,
0x2aab5556, 0x00000000, 0x2aab5556, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_param.global_scale[0] = 3;
gpu_param.global_scale[1] = 1;
x_coord[1] = (uint32_t)(out_shape->data[0]) - 2;
x_coord[0] = (x_coord[1] - 1) / 6 * 2;
status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l10_4x4", &uniResize_x3_nhwc2_l10_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_4x_up_kernel)
{
gpu_dp_inst_t uniResize_x4_nhwc2_l00_4x8 = {{
0x55555511, 0x55555555, // TCfg
0x46104000, 0x1940409c, 0x48829c46, 0x82acca3a, 0xacca3a48, // BinSelect
0x00000706, // AccumType, ConstantType, and PostShift
0x00280018, 0x190f0f09, 0x23051503, 0x23051503,
0x05230315, 0x05230315, 0x0f19090f, 0x0f19090f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize_x4_nhwc2_l10_4x8 = {{
0x55555511, 0x55555555, // TCfg
0x46104000, 0x1940409c, 0x48829c46, 0x82acca3a, 0xacca3a48, // BinSelect
0x00000706, // AccumType, ConstantType, and PostShift
0x00380008, 0x23150503, 0x31070701, 0x31070701,
0x07310107, 0x07310107, 0x15230305, 0x15230305 // Constant
}, GPU_DP_TYPE_16};
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
x_coord[1] = (uint32_t)(out_shape->data[0]) - 2;
x_coord[0] = ((x_coord[1] - 3) >> 3) * 2;
status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x4_nhwc2_l00_4x8", &uniResize_x4_nhwc2_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x4_nhwc2_l10_4x8", &uniResize_x4_nhwc2_l10_4x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else
{
VSILOGE("input or output's format is not support");
status = VSI_FAILURE;
goto final;
}
gpu_param.global_size[0] = gpu_align_p2((out_height + \
gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = 1;
gpu_param.dim = 2;
status |= vsi_nn_kernel_gpu_add_param( node, "x_coord", &x_coord);
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr );
if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr );
return status;
} /* _bilinear_nhwc_bound_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t align_corners,
int32_t half_pixel_centers,
uint32_t up_scale
const uint32_t hashkey,
uint32_t kernel_id
)
{
vx_kernel_initialize_f initializer = NULL;
vx_param_description_t * param_def;
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _resize_bilinear_nhwc_kernel_map;
size_t kernel_map_size = _cnt_of_array( _resize_bilinear_nhwc_kernel_map );
vx_param_description_t * param_def = _resize_bilinear_nhwc_kernel_param_def;
size_t param_def_size = _cnt_of_array( _resize_bilinear_nhwc_kernel_param_def );
vx_kernel_initialize_f initializer = _resize_bilinear_nhwc_initializer;
uint32_t key;
uint32_t i;
const _kernel_map_type* kernel_map;
size_t kernel_map_size;
size_t param_size;
uint32_t i = 0;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
in_dtype = in_dtype == I8 ? U8 : in_dtype;
out_dtype = out_dtype == I8 ? U8 : out_dtype;
key = RESIZE_BILINEAR_NHWC_HASH_KEY( in_dtype, out_dtype, half_pixel_centers, align_corners, up_scale );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
switch( kernel_id )
{
if ( kernel_map[i].key == key )
case 0:
initializer = _resize_bilinear_nhwc_initializer;
kernel_map = _resize_bilinear_nhwc_kernel_map;
kernel_map_size = _cnt_of_array( _resize_bilinear_nhwc_kernel_map );
param_def = _resize_bilinear_nhwc_kernel_param_def;
param_size = _RESIZE_BILINEAR_NHWC_PARAM_NUM;
break;
case 1:
initializer = _bilinear_nhwc_bound_initializer;
kernel_map = _bilinear_nhwc_bound_kernel_map;
kernel_map_size = _cnt_of_array( _bilinear_nhwc_bound_kernel_map );
param_def = _bilinear_nhwc_bound_kernel_param_def;
param_size = _BILINEAR_NHWC_BOUND_PARAM_NUM;
break;
default:
VSI_ASSERT( FALSE );
return VSI_FAILURE;
}
for( i = 0; i < kernel_map_size; i ++ )
{
if( kernel_map[i].key == hashkey )
{
break;
}
}
if ( i < kernel_map_size )
if( i < kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.numParams = (uint32_t)param_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
@ -453,7 +619,8 @@ static vsi_nn_kernel_node_t _setup
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_RESIZE_BILINEAR_NHWC_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_param_t node0_params[_RESIZE_BILINEAR_NHWC_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_param_t node1_params[_BILINEAR_NHWC_BOUND_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
@ -463,8 +630,14 @@ static vsi_nn_kernel_node_t _setup
float scale_y = (float)outputs[0]->attr.size[2] / (float)inputs[0]->attr.size[2];
float up_scale = scale_x == scale_y ? scale_x : 0;
uint32_t rank = inputs[0]->attr.dim_num;
vsi_nn_tensor_t* reshape_tensors[2] = { NULL };
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{ 1 }};
vsi_nn_kernel_t * ikernels[2] = { NULL };
uint32_t hashkeys[2] = {0};
uint32_t i = 0;
vsi_nn_tensor_attr_t attr;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
if (!is_same_type || depth != 2 || rank < 3 ||
(up_scale != 2.0f && up_scale != 3.0f && up_scale != 4.0f))
@ -472,8 +645,24 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
status = _query_kernel( kernel, inputs, outputs,
align_corners, half_pixel_centers, (uint32_t)up_scale);
ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
// Assign unique_id
ikernels[0]->unique_id = kernel->unique_id;
ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
// Assign unique_id
ikernels[1]->unique_id = kernel->unique_id;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
hashkeys[0] = RESIZE_BILINEAR_NHWC_HASH_KEY( in_dtype, out_dtype, half_pixel_centers,
align_corners, (vsi_size_t)up_scale );
hashkeys[1] = BILINEAR_NHWC_BOUND_HASH_KEY( in_dtype, out_dtype, (vsi_size_t)up_scale );
status = _query_kernel( ikernels[0], hashkeys[0], 0);
CHECK_STATUS_FAIL_GOTO(status, final );
status = _query_kernel( kernel, hashkeys[1], 1);
CHECK_STATUS_FAIL_GOTO(status, final );
shapes[0][0] = depth * inputs[0]->attr.size[1];
shapes[0][1] = inputs[0]->attr.size[2];
@ -491,26 +680,41 @@ static vsi_nn_kernel_node_t _setup
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[1], rank );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM,
reshape_tensors, input_num, &reshape_tensors[1], output_num );
node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners );
node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers );
// resize bilinear
node = vsi_nn_kernel_create_node( graph, ikernels[0] );
VSI_ASSERT( node != NULL );
vsi_nn_kernel_node_pack_io( node0_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM,
reshape_tensors, input_num, &reshape_tensors[1], output_num );
node0_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners );
node0_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers );
status = vsi_nn_kernel_node_pass_param( node, node0_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node0_params[SCALAR_ALIGN_CORNERS] );
vsi_nn_kernel_scalar_release( &node0_params[SCALAR_HALF_PIXEL] );
vsi_nn_kernel_node_release( &node );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] );
// update bound for output tensor
memcpy( &attr, &(reshape_tensors[1]->attr), sizeof(vsi_nn_tensor_attr_t) );
attr.size[0] = 1;
attr.size[1] = 1;
attr.dim_num = 2;
reshape_tensors[2] = vsi_nn_CreateTensor( graph, &attr );
node = vsi_nn_kernel_create_node( graph, kernel );
VSI_ASSERT( node != NULL );
vsi_nn_kernel_node_pack_io( node1_params, _BILINEAR_NHWC_BOUND_PARAM_NUM,
reshape_tensors, 2, &reshape_tensors[2], 1 );
status = vsi_nn_kernel_node_pass_param( node, node1_params, _BILINEAR_NHWC_BOUND_PARAM_NUM );
final:
for( i = 0; i < 2; i ++ )
{
if( ikernels[i] )
{
vsi_nn_kernel_release( &ikernels[i] );
}
}
vsi_safe_release_tensor(reshape_tensors[0]);
vsi_safe_release_tensor(reshape_tensors[1]);
vsi_safe_release_tensor(reshape_tensors[2]);
return node;
} /* _setup() */

View File

@ -118,7 +118,7 @@ static vsi_status get_scatter_nd_tensor_reshape_size
return status;
}
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
for(i = 0; i < dims_num; ++i)

View File

@ -207,7 +207,7 @@ static vsi_status get_scatter_nd_update_tensor_reshape_size
return status;
}
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH
newDim[0] = 0;
for(i = 0; i < dims_num; ++i)

View File

@ -75,10 +75,24 @@ static const _kernel_map_type _select_kernel_map[] =
PACK_KERNEL_MAP(I8, U8, U8, U8),
PACK_KERNEL_MAP(I8, I16, I16, I16),
PACK_KERNEL_MAP(I8, F16, F16, F16),
PACK_KERNEL_MAP(I8, F16, U8, F16),
PACK_KERNEL_MAP(I8, U8, F16, F16),
PACK_KERNEL_MAP(I8, F16, I8, F16),
PACK_KERNEL_MAP(I8, I8, F16, F16),
PACK_KERNEL_MAP(I8, F16, I16, F16),
PACK_KERNEL_MAP(I8, I16, F16, F16),
PACK_KERNEL_MAP(I8, F16, F16, U8),
PACK_KERNEL_MAP_2D(I8, I8, I8, I8),
PACK_KERNEL_MAP_2D(I8, U8, U8, U8),
PACK_KERNEL_MAP_2D(I8, I16, I16, I16),
PACK_KERNEL_MAP_2D(I8, F16, F16, F16),
PACK_KERNEL_MAP_2D(I8, U8, F16, F16),
PACK_KERNEL_MAP_2D(I8, F16, U8, F16),
PACK_KERNEL_MAP_2D(I8, F16, I8, F16),
PACK_KERNEL_MAP_2D(I8, I8, F16, F16),
PACK_KERNEL_MAP_2D(I8, F16, I16, F16),
PACK_KERNEL_MAP_2D(I8, I16, F16, F16),
PACK_KERNEL_MAP_2D(I8, F16, F16, U8),
};
/*
@ -142,7 +156,7 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output);
CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final );
if( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
if ( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input0_fl = input0_attr->dfp.fl;
if (input0_fl > 0)
@ -154,13 +168,13 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
input0Scale = (float)((int64_t)1 << -input0_fl);
}
}
else if( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
else if ( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input0Scale = input0_attr->asymm.scale;
input0Zp = input0_attr->asymm.zero_point;
}
if( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
input1_fl = input1_attr->dfp.fl;
if (input1_fl > 0)
@ -172,13 +186,13 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
input1Scale = (float)((int64_t)1 << -input1_fl);
}
}
else if( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
else if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
input1Scale = input1_attr->asymm.scale;
input1Zp = input1_attr->asymm.zero_point;
}
if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
if ( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP )
{
output_fl = output_attr->dfp.fl;
if (output_fl > 0)
@ -190,7 +204,7 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
outputScale = (float)((int64_t)1 << -output_fl);
}
}
else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
else if ( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = output_attr->asymm.scale;
outputZP = output_attr->asymm.zero_point;
@ -203,13 +217,10 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
output_shape = output_attr->shape;
gpu_param.dim = output_shape->size < 3 ? 2 : 3;
gpu_param.global_offset[0] = 0;
gpu_param.global_offset[1] = 0;
gpu_param.global_offset[2] = 0;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1)
@ -218,83 +229,8 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
(output_shape->data[2] + gpu_param.global_scale[2] - 1)
/ gpu_param.global_scale[2] : 1;
switch( pack_key )
{
case _PACK_SELECT_KEY( I8, I8, I8 ):
case _PACK_SELECT_KEY( I16, I16, I16 ):
{
gpu_dp_inst_t uniConvConditiontoDst_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvIntIn0toDst_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvIntIn1toDst_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
if (input0_fl >= output_fl)
{
uint8_t postshift = (uint8_t)gpu_min(input0_fl - output_fl, MAX_POST_SHIFT_BITS);
uniConvIntIn0toDst_2x8.data[7] = uniConvIntIn0toDst_2x8.data[7] | (postshift & 0x1F);
}
else
{
uint32_t idx = 0;
uint32_t multiplier = gpu_min((int64_t)1 << (output_fl - input0_fl), MAX_MULTIPLIER_NUM);
for (idx = 8; idx < 16; idx ++)
{
uniConvIntIn0toDst_2x8.data[idx] = (uint32_t)(multiplier << 16) | (multiplier & 0xffff);
}
}
if (input1_fl >= output_fl)
{
uint8_t postshift = (uint8_t)gpu_min(input1_fl - output_fl, MAX_POST_SHIFT_BITS);
uniConvIntIn1toDst_2x8.data[7] = uniConvIntIn1toDst_2x8.data[7] | (postshift & 0x1F);
}
else
{
uint32_t idx = 0;
uint32_t multiplier = gpu_min((int64_t)1 << (output_fl - input1_fl), MAX_MULTIPLIER_NUM);
for (idx = 8; idx < 16; idx ++)
{
uniConvIntIn1toDst_2x8.data[idx] = (uint32_t)(multiplier << 16) | (multiplier & 0xffff);
}
}
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvIntIn0toDst_2x8", &uniConvIntIn0toDst_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvIntIn1toDst_2x8", &uniConvIntIn1toDst_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvConditiontoDst_2x8", &uniConvConditiontoDst_2x8 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_SELECT_KEY( F16, F16, F16 ):
{
gpu_dp_inst_t uniConvConditiontoDst_2x8 = {{
@ -312,61 +248,66 @@ DEF_KERNEL_INITIALIZER(_select_initializer)
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case _PACK_SELECT_KEY( I8, I8, I8 ):
case _PACK_SELECT_KEY( I16, I16, I16 ):
case _PACK_SELECT_KEY( U8, U8, U8 ):
case _PACK_SELECT_KEY( I8, F16, F16 ):
case _PACK_SELECT_KEY( U8, F16, F16 ):
case _PACK_SELECT_KEY( I16, F16, F16 ):
case _PACK_SELECT_KEY( F16, U8, F16 ):
case _PACK_SELECT_KEY( F16, I8, F16 ):
case _PACK_SELECT_KEY( F16, I16, F16 ):
case _PACK_SELECT_KEY( F16, F16, U8 ):
{
uint32_t idx = 0;
gpu_dp_inst_t uniU8SubZP_MulM_PStoF16In0_2x8 = {{
0x99999999, // TCfg
0x44444444, // ASelt
uint32_t multAndoutZP0[2] = {0};
uint32_t multAndoutZP1[2] = {0};
gpu_dp_inst_t uniConvConditiontoDst_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0xaaaaaaaa, // BSelt
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001,
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniU8SubZP_MulM_PStoF16In1_2x8 = {{
0x99999999, // TCfg
gpu_dp_inst_t uniU8MulAndPostShift0_Lo_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x03020100, 0x07060504, // ABin
0xaaaaaaaa, // BSelt
0x13121110, 0x17161514, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001,
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniU8AddZP_2x8 = {{
0x55555555, // TCfg
gpu_dp_inst_t uniU8MulAndPostShift1_Lo_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x03020100, 0x07060504, // ABin
0xaaaaaaaa, // BSelt
0x13121110, 0x17161514, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001,
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
uniU8SubZP_MulM_PStoF16In0_2x8.data[7] |= (in0_postShift & 0x1F);
uniU8SubZP_MulM_PStoF16In1_2x8.data[7] |= (in1_postShift & 0x1F);
multAndoutZP0[0] = (uint32_t)(in0_M0);
multAndoutZP0[1] = (uint32_t)((outputZP << in0_postShift) - input0Zp * in0_M0);
multAndoutZP1[0] = (uint32_t)(in1_M0);
multAndoutZP1[1] = (uint32_t)((outputZP << in1_postShift) - input1Zp * in1_M0);
for (idx = 8; idx < 16; idx ++)
{
uniU8SubZP_MulM_PStoF16In0_2x8.data[idx] = (vx_uint32)(in0_M0 << 16) | in0_M0;
uniU8SubZP_MulM_PStoF16In1_2x8.data[idx] = (vx_uint32)(in1_M0 << 16) | in1_M0;
}
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift0_Lo_2x8, in0_postShift );
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift1_Lo_2x8, in1_postShift );
status = vsi_nn_kernel_gpu_add_param( node,
"uniU8SubZP_MulM_PStoF16In0_2x8", &uniU8SubZP_MulM_PStoF16In0_2x8 );
status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 );
status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8SubZP_MulM_PStoF16In1_2x8", &uniU8SubZP_MulM_PStoF16In1_2x8 );
"uniConvConditiontoDst_2x8", &uniConvConditiontoDst_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8AddZP_2x8", &uniU8AddZP_2x8 );
"uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift0_Lo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"input0Zp", &input0Zp );
status |= vsi_nn_kernel_gpu_add_param( node,
"input1Zp", &input1Zp );
status |= vsi_nn_kernel_gpu_add_param( node,
"outputZP", &outputZP );
"uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift1_Lo_2x8 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
@ -501,4 +442,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( select, _setup )

View File

@ -39,7 +39,6 @@
__BEGIN_DECLS
#define _SLICE_KERNEL_SOURCE "slice"
#define _SLICE_KERNEL_NAME CVIVANTE_NAMESPACE("evis.slice")
// Add kernel hashtable here
@ -50,30 +49,30 @@ __BEGIN_DECLS
#define SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE , _IMAGE_2D, _SAMEFL) \
(( IN1_DTYPE << 18 ) | ( IN0_DTYPE << 10 ) | ( OUT_DTYPE << 2 ) | (_IMAGE_2D << 1) | (_SAMEFL))
#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \
#define PACK_KERNEL_MAP_3D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 0 ), \
SLICE_SH_KERNEL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE }
SLICE_SH_KERNEL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" }
#define SLICE_SH_KERNEL_2D_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_2D")
#define PACK_KERNEL_MAP_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \
#define PACK_KERNEL_MAP_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 0 ), \
SLICE_SH_KERNEL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE }
SLICE_SH_KERNEL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" }
#define SLICE_SH_KERNEL_SAMEFL_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_SAMEFL")
#define PACK_KERNEL_MAP_SAMEFL( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \
#define PACK_KERNEL_MAP_SAMEFL( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 1 ), \
SLICE_SH_KERNEL_SAMEFL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE }
SLICE_SH_KERNEL_SAMEFL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" }
#define SLICE_SH_KERNEL_SAMEFL_2D_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_SAMEFL_2D")
#define PACK_KERNEL_MAP_SAMEFL_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \
#define PACK_KERNEL_MAP_SAMEFL_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \
{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 1 ), \
SLICE_SH_KERNEL_SAMEFL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE }
SLICE_SH_KERNEL_SAMEFL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), "slice" }
typedef struct
{
@ -85,21 +84,33 @@ __BEGIN_DECLS
static const _kernel_map_type _slice_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( F16, I32, F16, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP( I16, I32, I16, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP( U8, I32, U8, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP( I8, I32, I8, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_3D( F16, I32, F16 ),
PACK_KERNEL_MAP_3D( F16, I32, I8 ),
PACK_KERNEL_MAP_3D( F16, I32, U8 ),
PACK_KERNEL_MAP_3D( F16, I32, I16 ),
PACK_KERNEL_MAP_3D( I8, I32, F16 ),
PACK_KERNEL_MAP_3D( U8, I32, F16 ),
PACK_KERNEL_MAP_3D( I16, I32, F16 ),
PACK_KERNEL_MAP_3D( I16, I32, I16 ),
PACK_KERNEL_MAP_3D( U8, I32, U8 ),
PACK_KERNEL_MAP_3D( I8, I32, I8 ),
PACK_KERNEL_MAP_2D( F16, I32, F16, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_2D( I16, I32, I16, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_2D( U8, I32, U8, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_2D( I8, I32, I8, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_2D( F16, I32, F16 ),
PACK_KERNEL_MAP_2D( I16, I32, I16 ),
PACK_KERNEL_MAP_2D( F16, I32, I8 ),
PACK_KERNEL_MAP_2D( F16, I32, U8 ),
PACK_KERNEL_MAP_2D( F16, I32, I16 ),
PACK_KERNEL_MAP_2D( I8, I32, F16 ),
PACK_KERNEL_MAP_2D( U8, I32, F16 ),
PACK_KERNEL_MAP_2D( I16, I32, F16 ),
PACK_KERNEL_MAP_2D( U8, I32, U8 ),
PACK_KERNEL_MAP_2D( I8, I32, I8 ),
PACK_KERNEL_MAP_SAMEFL( I16, I32, I16, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_SAMEFL( U8, I32, U8, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_SAMEFL( I16, I32, I16 ),
PACK_KERNEL_MAP_SAMEFL( U8, I32, U8 ),
PACK_KERNEL_MAP_SAMEFL_2D( I16, I32, I16, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_SAMEFL_2D( U8, I32, U8, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_SAMEFL_2D( I16, I32, I16 ),
PACK_KERNEL_MAP_SAMEFL_2D( U8, I32, U8 ),
};
#define _INPUT_NUM (2)
@ -201,18 +212,16 @@ DEF_KERNEL_INITIALIZER(_slice_initializer)
scaleOut = output_attr->asymm.scale;
}
if ((F16 == input_dtype)
|| (I16 == input_dtype)
|| (BF16 == input_dtype)
)
if ((I8 == input_dtype && input_dtype == output_dtype ) ||
(U8 == input_dtype && input_dtype == output_dtype ) )
{
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[0] = 16;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}
else
{
gpu_param.global_scale[0] = 16;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
}

View File

@ -1416,31 +1416,42 @@ vsi_nn_kernel_tensor_attr_t * vsi_nn_kernel_tensor_attr_create
switch( attr->quant )
{
case VSI_NN_KERNEL_QUANT_DFP:
{
{
int8_t fl = 0;
status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_FIXED_POINT_POS,
&fl, sizeof(int8_t));
CHECK_STATUS( status );
attr->dfp.fl = (int32_t)fl;
if (fl >= 0) {
attr->scale = 1.0f / ((float)((int64_t)1 << fl));
} else {
attr->scale = (float)((int64_t)1 << -fl);
}
break;
} break;
case VSI_NN_KERNEL_QUANT_ASYMM:
{
status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_ZERO_POINT,
&(attr->asymm.zero_point), sizeof(int32_t));
CHECK_STATUS( status );
status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_SCALE,
&(attr->asymm.scale), sizeof(float));
CHECK_STATUS( status );
{
status = vxQueryTensor((vx_tensor)tensor,
VX_TENSOR_ZERO_POINT,
&(attr->asymm.zero_point),
sizeof(int32_t));
CHECK_STATUS(status);
status = vxQueryTensor((vx_tensor)tensor,
VX_TENSOR_SCALE,
&(attr->asymm.scale),
sizeof(float));
CHECK_STATUS(status);
// Reset scale to 1e-8
if( (attr->asymm.scale - 0.f) < 1e-8 )
{
if ((attr->asymm.scale - 0.f) < 1e-8)
{
attr->asymm.scale = (float)1e-8;
attr->asymm.zero_point = 0;
}
}
break;
attr->scale = attr->asymm.scale;
attr->zero_point = attr->asymm.zero_point;
}
break;
default:
attr->scale = 1.0f;
break;
}
return attr;

View File

@ -189,6 +189,16 @@ static float celu_eval(float x, vsi_nn_kernel_lut_params *lut_param)
return positive + negative;
}
static float rcp_eval(float x)
{
return 1.0f / x;
}
static float softsign_eval(float x)
{
return x / (1 + vsi_abs(x));
}
static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *lut_param)
{
float result = 0;
@ -245,6 +255,12 @@ static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *
case VSI_NN_KERNEL_LUT_CELU:
result = celu_eval(data, lut_param);
break;
case VSI_NN_KERNEL_LUT_RCP:
result = rcp_eval(data);
break;
case VSI_NN_KERNEL_LUT_SOFTSIGN:
result = softsign_eval(data);
break;
default:
VSILOGE( "unsupported activation function:%d", lut_param->act_type );
break;

View File

@ -133,5 +133,9 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(gelu)
REGISTER_VX_FIRST_KERNEL_SELECTOR(hard_gelu)
REGISTER_VX_FIRST_KERNEL_SELECTOR(matrixmul)
REGISTER_VX_FIRST_KERNEL_SELECTOR(celu)
REGISTER_VX_FIRST_KERNEL_SELECTOR(rcp)
REGISTER_VX_FIRST_KERNEL_SELECTOR(softsign)
REGISTER_VX_FIRST_KERNEL_SELECTOR(resize_bilinear)
REGISTER_VX_FIRST_KERNEL_SELECTOR(resize_nearest)
__END_DECLS

View File

@ -146,6 +146,8 @@ REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( erf, VSI_NN_KERNEL_LUT_ERF )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( relu_keras, VSI_NN_KERNEL_LUT_RELU_KERAS )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( clip, VSI_NN_KERNEL_LUT_CLIP )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( celu, VSI_NN_KERNEL_LUT_CELU )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( rcp, VSI_NN_KERNEL_LUT_RCP )
REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( softsign, VSI_NN_KERNEL_LUT_SOFTSIGN )
#undef REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL

View File

@ -0,0 +1,152 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
#define REGISTER_SOFTMAX_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vx_node node = NULL;
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" );
int32_t type = vsi_nn_kernel_param_get_int32( params, "type" );
#ifdef VX_SCALE_EXTRA_PARAMETER_SUPPORT
vx_nn_scale_params_ext_t param;
param.align_corners = align_corners;
param.half_pixel_centers = half_pixel_centers;
switch (type)
{
case VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR:
param.base.type = VX_INTERPOLATION_NEAREST_NEIGHBOR;
break;
case VSI_NN_INTERPOLATION_BILINEAR:
param.base.type = VX_INTERPOLATION_BILINEAR;
break;
case VSI_NN_INTERPOLATION_AREA:
param.base.type = VX_INTERPOLATION_AREA;
break;
default:
param.base.type = VX_INTERPOLATION_NEAREST_NEIGHBOR;
}
node = vxTensorScaleNode( graph->g,
inputs[0]->t,
(vx_nn_scale_params)(&param),
sizeof(vx_nn_scale_params_ext_t),
outputs[0]->t );
#else
vx_nn_scale_params_t param;
if (align_corners || half_pixel_centers)
{
return NULL;
}
switch (type)
{
case VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR:
param.type = VX_INTERPOLATION_NEAREST_NEIGHBOR;
break;
case VSI_NN_INTERPOLATION_BILINEAR:
param.type = VX_INTERPOLATION_BILINEAR;
break;
case VSI_NN_INTERPOLATION_AREA:
param.type = VX_INTERPOLATION_AREA;
break;
default:
param.type = VX_INTERPOLATION_NEAREST_NEIGHBOR;
break;
}
node = vxTensorScaleNode( graph->g,
inputs[0]->t,
&param,
sizeof(param),
outputs[0]->t );
#endif
if ( NULL == node )
{
VSILOGI("Call vxTensorScaleNode fail.(resize)");
}
return (vsi_nn_kernel_node_t)node;
} /* _setup() */
#define REGISTER_RESIZE_OPENVX_KERNEL(KERNEL_NAME) \
static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num, \
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
) \
{ \
return _setup(graph, inputs, input_num, outputs, output_num, \
params, kernel); \
} \
REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup )
REGISTER_RESIZE_OPENVX_KERNEL( resize_nearest )
REGISTER_RESIZE_OPENVX_KERNEL( resize_bilinear )
#undef REGISTER_RESIZE_OPENVX_KERNEL

View File

@ -32,7 +32,6 @@
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_lut.h"
static vsi_nn_kernel_node_t _setup
(
@ -46,57 +45,7 @@ static vsi_nn_kernel_node_t _setup
)
{
vx_node node = NULL;
#ifdef VX_USER_LOOKUP_TABLE_SUPPORT
vx_lut lut1 = NULL;
vx_lut lut2 = NULL;
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_lut_params lut_param;
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ||
outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 )
{
return NULL;
}
lut_param.act_type = VSI_NN_KERNEL_LUT_SQUARE;
lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_KERNEL_LUT_MAX_SIZE);
lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_KERNEL_LUT_MAX_SIZE);
if( NULL == lut1 || NULL == lut2 )
{
VSILOGE("create lut object fail.");
goto final;
}
status = vsi_nn_kernel_lut(lut1, lut2, &lut_param);
CHECK_STATUS_FAIL_GOTO(status, final);
node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t);
if ( NULL == node )
{
node = vxActivationLayer(
graph->g,
inputs[0]->t,
VX_NN_ACTIVATION_SQUARE,
0,
0,
outputs[0]->t
);
}
final:
if (lut1)
{
vxReleaseLUT(&lut1);
lut1 = NULL;
}
if (lut2)
{
vxReleaseLUT(&lut2);
lut2 = NULL;
}
return (vsi_nn_kernel_node_t)node;
#else
node = vxActivationLayer(
graph->g,
inputs[0]->t,
@ -107,7 +56,6 @@ final:
);
return (vsi_nn_kernel_node_t)node;
#endif
} /* _setup() */
#define REGISTER_SQUARE_OPENVX_KERNEL(KERNEL_NAME) \

View File

@ -0,0 +1,478 @@
__kernel void cumsum_F32toF32_axis2(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int channel,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
float4 sum = (float4)(0);
if(exclusive && rev)
{
coord_out.z = channel - 1;
write_imagef(output, coord_out, sum);
for(coord.z = channel - 1; coord.z > 0; coord.z--)
{
float4 data = read_imagef(input, coord);
coord_out.z--;
sum += data;
write_imagef(output, coord_out, sum);
}
}
else if(exclusive)
{
coord_out.z = 0;
write_imagef(output, coord_out, sum);
for(coord.z = 0; coord.z < channel - 1; coord.z++)
{
float4 data = read_imagef(input, coord);
coord_out.z++;
sum += data;
write_imagef(output, coord_out, sum);
}
}
else if(rev)
{
for(coord.z = channel - 1; coord.z >= 0; coord.z--)
{
float4 data = read_imagef(input, coord);
sum += data;
write_imagef(output, coord, sum);
}
}
else
{
for(coord.z = 0; coord.z < channel; coord.z++)
{
float4 data = read_imagef(input, coord);
sum += data;
write_imagef(output, coord, sum);
}
}
}
__kernel void cumsum_U8toU8_axis2(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int channel,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
uint4 sum = (uint4)(0);
uint4 dst = (uint4)(0);
float cnt = 0.0f;
if(exclusive && rev)
{
coord_out.z = channel - 1;
write_imageui(output, coord_out, dst);
for(coord.z = channel - 1; coord.z > 0; coord.z--)
{
uint4 data = read_imageui(input, coord);
coord_out.z--;
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord_out, dst);
}
}
else if(exclusive)
{
coord_out.z = 0;
write_imageui(output, coord_out, dst);
for(coord.z = 0; coord.z < channel - 1; coord.z++)
{
uint4 data = read_imageui(input, coord);
coord_out.z++;
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord_out, dst);
}
}
else if(rev)
{
for(coord.z = channel - 1; coord.z >= 0; coord.z--)
{
uint4 data = read_imageui(input, coord);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord, dst);
}
}
else
{
for(coord.z = 0; coord.z < channel; coord.z++)
{
uint4 data = read_imageui(input, coord);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord, dst);
}
}
}
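/* Axis-1 variants: identical accumulation pattern, scanning along the
 * height (y) dimension instead of the channel. */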
__kernel void cumsum_F32toF32_axis1(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int channel,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
float4 sum = (float4)(0);
if(exclusive && rev)
{
coord_out.y = height - 1;
write_imagef(output, coord_out, sum);
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
float4 data = read_imagef(input, coord);
coord_out.y--;
sum += data;
write_imagef(output, coord_out, sum);
}
}
else if(exclusive)
{
coord_out.y = 0;
write_imagef(output, coord_out, sum);
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
float4 data = read_imagef(input, coord);
coord_out.y++;
sum += data;
write_imagef(output, coord_out, sum);
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
float4 data = read_imagef(input, coord);
sum += data;
write_imagef(output, coord, sum);
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
float4 data = read_imagef(input, coord);
sum += data;
write_imagef(output, coord, sum);
}
}
}
__kernel void cumsum_U8toU8_axis1(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int channel,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
uint4 sum = (uint4)(0);
uint4 dst = (uint4)(0);
float cnt = 0;
if(exclusive && rev)
{
coord_out.y = height - 1;
write_imageui(output, coord_out, dst);
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
uint4 data = read_imageui(input, coord);
cnt += 1.0f;
coord_out.y--;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord_out, dst);
}
}
else if(exclusive)
{
coord_out.y = 0;
write_imageui(output, coord_out, dst);
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
uint4 data = read_imageui(input, coord);
cnt += 1.0f;
coord_out.y++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord_out, dst);
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
uint4 data = read_imageui(input, coord);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord, dst);
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
uint4 data = read_imageui(input, coord);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord, dst);
}
}
}
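/* Axis-0 variants: identical accumulation pattern, scanning along the
 * width (x) dimension. */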
__kernel void cumsum_F32toF32_axis0(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int channel,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
float4 sum = (float4)(0);
if(exclusive && rev)
{
coord_out.x = width - 1;
write_imagef(output, coord_out, sum);
for(coord.x = width - 1; coord.x > 0; coord.x--)
{
float4 data = read_imagef(input, coord);
coord_out.x--;
sum += data;
write_imagef(output, coord_out, sum);
}
}
else if(exclusive)
{
coord_out.x = 0;
write_imagef(output, coord_out, sum);
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
float4 data = read_imagef(input, coord);
coord_out.x++;
sum += data;
write_imagef(output, coord_out, sum);
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
float4 data = read_imagef(input, coord);
sum += data;
write_imagef(output, coord, sum);
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
float4 data = read_imagef(input, coord);
sum += data;
write_imagef(output, coord, sum);
}
}
}
__kernel void cumsum_U8toU8_axis0(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int channel,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_out = coord;
uint4 sum = (uint4)(0);
uint4 dst = (uint4)(0);
float cnt = 0;
if(exclusive && rev)
{
coord_out.x = width - 1;
write_imageui(output, coord_out, dst);
for(coord.x = width - 1; coord.x > 0; coord.x--)
{
uint4 data = read_imageui(input, coord);
coord_out.x--;
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord_out, dst);
}
}
else if(exclusive)
{
coord_out.x = 0;
write_imageui(output, coord_out, dst);
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
uint4 data = read_imageui(input, coord);
coord_out.x++;
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord_out, dst);
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
uint4 data = read_imageui(input, coord);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord, dst);
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
uint4 data = read_imageui(input, coord);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord, dst);
}
}
}

View File

@ -0,0 +1,314 @@
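/* 2D cumulative-sum kernels for plain image2d_t tensors. coord.xy is the
 * read position and coord.zw the write position, so the exclusive modes can
 * write one element ahead of (or behind) the element being read. */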
__kernel void cumsum_F32toF32_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
float4 sum = (float4)(0);
if(exclusive && rev)
{
coord.w = height - 1;
write_imagef(output, coord.zw, sum);
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
float4 data = read_imagef(input, coord.xy);
coord.w--;
sum += data;
write_imagef(output, coord.zw, sum);
}
}
else if(exclusive)
{
write_imagef(output, coord.zw, sum);
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
float4 data = read_imagef(input, coord.xy);
coord.w++;
sum += data;
write_imagef(output, coord.zw, sum);
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
float4 data = read_imagef(input, coord.xy);
sum += data;
write_imagef(output, coord.xy, sum);
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
float4 data = read_imagef(input, coord.xy);
sum += data;
write_imagef(output, coord.xy, sum);
}
}
}
__kernel void cumsum_U8toU8_axis1_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
uint4 sum = (uint4)(0);
uint4 dst = (uint4)(0);
float cnt = 0;
if(exclusive && rev)
{
coord.w = height - 1;
write_imageui(output, coord.zw, sum);
for(coord.y = height - 1; coord.y > 0; coord.y--)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
coord.w--;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(exclusive)
{
write_imageui(output, coord.zw, sum);
for(coord.y = 0; coord.y < height - 1; coord.y++)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
coord.w++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(rev)
{
for(coord.y = height - 1; coord.y >= 0; coord.y--)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
else
{
for(coord.y = 0; coord.y < height; coord.y++)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
}
__kernel void cumsum_F32toF32_axis0_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
float4 sum = (float4)(0);
if(exclusive && rev)
{
coord.x = width - 1;
coord.z = coord.x;
write_imagef(output, coord.zw, sum);
for(; coord.x > 0; coord.x--)
{
float4 data = read_imagef(input, coord.xy);
coord.z--;
sum += data;
write_imagef(output, coord.zw, sum);
}
}
else if(exclusive)
{
coord.z = 0;
write_imagef(output, coord.zw, sum);
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
float4 data = read_imagef(input, coord.xy);
coord.z++;
sum += data;
write_imagef(output, coord.zw, sum);
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
float4 data = read_imagef(input, coord.xy);
sum += data;
write_imagef(output, coord.xy, sum);
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
float4 data = read_imagef(input, coord.xy);
sum += data;
write_imagef(output, coord.xy, sum);
}
}
}
__kernel void cumsum_U8toU8_axis0_2D(
__read_only image2d_t input,
__write_only image2d_t output,
int axis,
int exclusive,
int rev,
int width,
int height,
int chn,
int input_zp,
float in_out_scale,
float in_out_zp_scale,
float output_zp
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
uint4 sum = (uint4)(0);
uint4 dst = (uint4)(0);
float cnt = 0.0f;
if(exclusive && rev)
{
coord.x = width - 1;
coord.z = coord.x;
write_imageui(output, coord.zw, sum);
for(; coord.x > 0; coord.x--)
{
uint4 data = read_imageui(input, coord.xy);
coord.z--;
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(exclusive)
{
coord.z = 0;
write_imageui(output, coord.zw, sum);
for(coord.x = 0; coord.x < width - 1; coord.x++)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
coord.z++;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.zw, dst);
}
}
else if(rev)
{
for(coord.x = width - 1; coord.x >= 0; coord.x--)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
else
{
for(coord.x = 0; coord.x < width; coord.x++)
{
uint4 data = read_imageui(input, coord.xy);
cnt += 1.0f;
sum += data;
float tmpAlpha = cnt * in_out_zp_scale + output_zp;
float tmpSum = sum.x * in_out_scale + tmpAlpha;
dst.x = (uint)convert_int_rte(tmpSum);
write_imageui(output, coord.xy, dst);
}
}
}

Some files were not shown because too many files have changed in this diff.