Update internal to 1.1.32

SHA: 9aa0b0f
Signed-off-by: Kainan Cha <kainan.zha@verisilicon.com>
Parent: 98b9759663
Commit: 3c59694025

@@ -69,7 +69,9 @@ filegroup(
name = "custom_srcs",
srcs = glob([
"src/custom/ops/*.c",
"src/custom/ops/kernel/*.c",
"src/custom/ops/kernel/evis/*.c",
"src/custom/ops/kernel/cl/*.c",
"src/custom/ops/kernel/cpu/*.c",
])
)

@@ -128,7 +130,6 @@ cc_library(
"include/quantization/vsi_nn_asymmetric_affine.h",
"include/quantization/vsi_nn_dynamic_fixed_point.h",
"include/quantization/vsi_nn_perchannel_symmetric_affine.h",
"include/client/vsi_nn_vxkernel.h",
"include/interface/ops.def",
"include/kernel/vsi_nn_kernel.h",
"include/kernel/vsi_nn_gpu.h",

@@ -139,6 +140,7 @@ cc_library(
"include/vsi_nn_error.h",

# libnnext
"include/libnnext/vsi_nn_vxkernel.h",
"include/libnnext/vx_lib_nnext.h",
"include/libnnext/vsi_nn_libnnext_resource.h",

@@ -167,7 +169,6 @@ cc_library(
"src/vsi_nn_daemon.c",
"src/vsi_nn_graph_optimization.c",
"src/vsi_nn_pre_post_process.c",
"src/client/vsi_nn_vxkernel.c",
"src/utils/vsi_nn_link_list.c",
"src/utils/vsi_nn_util.c",
"src/utils/vsi_nn_math.c",

@@ -200,12 +201,10 @@ cc_library(
"src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_topk.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_generate_proposals.c",
"src/libnnext/vsi_nn_libnnext_resource.c",
"src/libnnext/vsi_nn_vxkernel.c",
] + [":kernel_srcs"]
+ [":operation_srcs"]
+ [":custom_srcs"],

@@ -12,7 +12,6 @@ aux_source_directory(src/kernel/cpu INTERNAL_KERNEL_CPU)
aux_source_directory(src/kernel/evis INTERNAL_KERNEL_EVIS)
aux_source_directory(src/kernel/vx INTERNAL_KERNEL_VX)
aux_source_directory(src/ops INTERNAL_OPS)
aux_source_directory(src/client INTERNAL_CLIENT)
aux_source_directory(src/libnnext INTERNAL_LIBNNEXT)
aux_source_directory(src/libnnext/ops/kernel INTERNAL_LIBNNEXT_OPS_KERNEL)
aux_source_directory(src/quantization INTERNAL_QUANTIZATION)

@@ -29,7 +28,6 @@ list(APPEND SRC
${INTERNAL_KERNEL_EVIS}
${INTERNAL_KERNEL_VX}
${INTERNAL_OPS}
${INTERNAL_CLIENT}
${INTERNAL_LIBNNEXT}
${INTERNAL_LIBNNEXT_OPS_KERNEL}
${INTERNAL_QUANTIZATION}

@@ -147,3 +147,12 @@ DEF_OP(DECONVOLUTION1D)
DEF_OP(INTERP)
DEF_OP(RESIZE_1D)
DEF_OP(UPSAMPLESCALE)
DEF_OP(GROUP_NORM)
DEF_OP(ROUND)
DEF_OP(CEIL)
DEF_OP(SEQUENCE_MASK)
DEF_OP(REPEAT)
DEF_OP(ERF)
DEF_OP(ONE_HOT)
DEF_OP(NMS)
DEF_OP(GROUPED_CONV1D)

@@ -244,6 +244,12 @@ vsi_bool vsi_nn_kernel_param_add_buffer
void * vsi_nn_kernel_param_get_buffer
( const vsi_nn_kernel_param_t * params, const char * key, size_t * size);

vsi_bool vsi_nn_kernel_param_add_const_buffer
( vsi_nn_kernel_param_t * params, const char * key, const void * buf, size_t size);

const void * vsi_nn_kernel_param_get_const_buffer
( const vsi_nn_kernel_param_t * params, const char * key, size_t * size);

/** Kernel register */
#define REGISTER_KERNEL_BACKEND(kernel_name, kernel_type, func) \
_INITIALIZER(_register_kernel_##kernel_name##_##kernel_type) \
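
Note: REGISTER_KERNEL_BACKEND hangs kernel registration off _INITIALIZER, so each backend registers itself before main() runs. A minimal sketch of how such a macro is commonly implemented — an assumption here, since the actual _INITIALIZER definition lives elsewhere in ovxlib:

    /* Hypothetical expansion, NOT the ovxlib definition: the declared
     * function runs automatically at load time via a GCC/Clang constructor
     * attribute, and the macro leaves the definition open for a body. */
    #define _INITIALIZER(name) \
        static void name(void) __attribute__((constructor)); \
        static void name(void)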

@@ -30,17 +30,19 @@
extern "C" {
#endif

typedef struct _vsi_nn_conv1d_lcl_data_t
{
vx_tensor input_tensor;
vx_tensor weight_tensor;
vx_tensor output_tensor;
} vsi_nn_conv1d_lcl_data_t;

typedef struct _vsi_nn_conv1d_param
{
/* local data must be the first. */
vsi_nn_conv1d_lcl_data_t local;
union
{
struct _conv1d_local_data_t *local;

struct {
vx_tensor input_tensor;
vx_tensor weight_tensor;
vx_tensor output_tensor;
} reserved;
};

uint32_t ksize;
uint32_t stride;

@@ -53,6 +55,8 @@ typedef struct _vsi_nn_conv1d_param
uint32_t dilation;
int32_t multiplier;
} vsi_nn_conv1d_param;
_compiler_assert(offsetof(vsi_nn_conv1d_param, local) == 0, \
vsi_nn_vsi_nn_conv1d_h );
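
Note: the param struct keeps `local` as its first member, and `_compiler_assert` freezes that layout at compile time. For context, a common way to implement such a check (an assumption — the actual `_compiler_assert` macro is defined elsewhere in ovxlib):

    #include <stddef.h>
    /* A negative array size fails compilation when the condition is false. */
    #define MY_COMPILER_ASSERT(cond, tag) typedef char tag[(cond) ? 1 : -1]
    MY_COMPILER_ASSERT(offsetof(vsi_nn_conv1d_param, local) == 0, conv1d_local_is_first);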

#ifdef __cplusplus
}

@@ -0,0 +1,55 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#ifndef _VSI_NN_OP_GROUPED_CONV1D_H
#define _VSI_NN_OP_GROUPED_CONV1D_H

#include "vsi_nn_types.h"

typedef struct _grouped_conv1d_local_data_t {
vsi_nn_tensor_t* input;
vsi_nn_tensor_t* weight;
vsi_nn_tensor_t* output;

} grouped_conv1d_local_data_t;

typedef struct _vsi_nn_grouped_conv1d_param
{
grouped_conv1d_local_data_t *local;

uint32_t ksize;
uint32_t stride;
/* Pad left, right, top, bottom */
uint32_t pad[2];
/* Pad type default value shall be AUTO */
vsi_nn_pad_e pad_type;
uint32_t weights;
uint32_t group;
uint32_t dilation;
int32_t multiplier;
} vsi_nn_grouped_conv1d_param;


#endif

@@ -0,0 +1,53 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CLIENT_GROUPNORMALIZE_H
#define _VSI_NN_OP_CLIENT_GROUPNORMALIZE_H

#include "vsi_nn_types.h"
#include "vsi_nn_platform.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct _vsi_nn_groupnorm_lcl_data
{
/* handle 3D group norm */
vsi_nn_tensor_t *reshaped_input;
vsi_nn_tensor_t *reshaped_output;
} vsi_nn_groupnorm_lcl_data;

typedef struct _vsi_nn_groupnormalize_param
{
/* local data must be the first. */
vsi_nn_groupnorm_lcl_data* lcl_data;
float eps;
int32_t group_num;
} vsi_nn_groupnormalize_param;

#ifdef __cplusplus
}
#endif

#endif

@@ -32,9 +32,9 @@ extern "C" {

typedef struct _vsi_nn_moments_param
{
int32_t* axis;
int32_t axis_num;
vsi_bool keep_dim;
const int32_t* axis;
int32_t axis_num;
vsi_bool keep_dim;
} vsi_nn_moments_param;

#ifdef __cplusplus

@@ -21,10 +21,18 @@
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_POST_H
#define _VSI_NN_POST_H

#include "post/vsi_nn_post_fasterrcnn.h"
#include "post/vsi_nn_post_cmupose.h"
#ifndef _VSI_NN_OP_NMS_H
#define _VSI_NN_OP_NMS_H

#endif
#include "vsi_nn_types.h"

typedef struct _vsi_nn_nms_param
{
int32_t max_output_size;
float iou_threshold;
float score_threshold;
float soft_nms_sigma;
} vsi_nn_nms_param;

#endif

@@ -0,0 +1,42 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#ifndef _VSI_NN_OP_ONE_HOT_H
#define _VSI_NN_OP_ONE_HOT_H

#include "vsi_nn_types.h"

typedef struct _vsi_nn_one_hot_param
{
struct _one_hot_local_data_t* local;

int32_t depth;
float on_value;
float off_value;
int32_t axis;
} vsi_nn_one_hot_param;
_compiler_assert(offsetof(vsi_nn_one_hot_param, local) == 0, \
vsi_nn_one_hot_h );

#endif

@@ -30,12 +30,12 @@
extern "C" {
#endif

#define _VSI_NN_POOLWITHARGMAX_LOCAL_TENSOR_NUM 3

typedef struct _vsi_nn_poolwithargmax_lcl_data
typedef struct _vsi_nn_pool_lcl_data
{
vx_tensor local_tensor[_VSI_NN_POOLWITHARGMAX_LOCAL_TENSOR_NUM];
} vsi_nn_poolwithargmax_lcl_data;
/* handle pool1d */
vsi_nn_tensor_t *reshaped_input;
vsi_nn_tensor_t *reshaped_output;
} vsi_nn_pool_lcl_data;

typedef struct _vsi_nn_pool_param
{

@@ -49,7 +49,7 @@ typedef struct _vsi_nn_pool_param
/* Pad type default value shall be AUTO */
vsi_nn_pad_e pad_type;
/* poolwithargmax layer local data structure */
vsi_nn_poolwithargmax_lcl_data local;
vsi_nn_pool_lcl_data *local;
} vsi_nn_pool_param;

#ifdef __cplusplus

@@ -0,0 +1,54 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_REPEAT_H
#define _VSI_NN_OP_REPEAT_H

#include "vsi_nn_types.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct _vsi_nn_repeat_lcl_data
{
vsi_nn_tensor_t *repeat_tensor;
vsi_nn_tensor_t *reshaped_input;
vsi_nn_tensor_t *reshaped_output;
} vsi_nn_repeat_lcl_data;

typedef struct _vsi_nn_repeat__param
{
vsi_nn_repeat_lcl_data* local;
int32_t axis;
int32_t maxlen; // default max repeat number
int32_t* repeat_host; // host repeat array
int32_t repeat_len; // length of host repeat array
} vsi_nn_repeat_param;

#ifdef __cplusplus
}
#endif

#endif

@@ -0,0 +1,43 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_SEQUENCE_MASK_H
#define _VSI_NN_OP_SEQUENCE_MASK_H

#include "vsi_nn_types.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct _vsi_nn_sequence_mask__param
{
int32_t maxlen;
} vsi_nn_sequence_mask_param;

#ifdef __cplusplus
}
#endif

#endif

@@ -32,6 +32,22 @@
extern "C" {
#endif

typedef struct _strided_slice_param
{
int32_t *begin_dims;
int32_t begin_dims_num;
int32_t *end_dims;
int32_t end_dims_num;
int32_t *stride_dims;
int32_t stride_dims_num;
int32_t begin_mask;
int32_t end_mask;
int32_t shrink_axis_mask;
int32_t new_axis_mask;

int32_t num_add_axis;
} strided_slice_param;

typedef struct _vsi_nn_strided_slice_lcl_data2
{
vsi_nn_link_list_t link_list;

@@ -55,6 +71,8 @@ typedef struct _vsi_nn_strided_slice_lcl_data2

vsi_bool is_dataconvert_op;
vsi_bool is_optimized;

strided_slice_param params;
} vsi_nn_strided_slice_lcl_data2;

typedef struct _vsi_nn_strided_slice_lcl_data_t

@@ -78,6 +96,7 @@ typedef struct _vsi_nn_strided_slice_param
vx_int32 begin_mask;
vx_int32 end_mask;
vx_int32 shrink_axis_mask;
int32_t new_axis_mask;

vsi_nn_strided_slice_lcl_data2 * lcl2_data;
} vsi_nn_strided_slice_param;

@@ -34,7 +34,7 @@ extern "C" {

typedef struct _vsi_nn_upsample_lcl_data
{
vx_tensor local_tensor[_VSI_NN_POOLWITHARGMAX_LOCAL_TENSOR_NUM];
vx_tensor local_tensor[_VSI_NN_UPSAMPLE_LOCAL_TENSOR_NUM];
} vsi_nn_upsample_lcl_data;

typedef struct _vsi_nn_upsample_param

@@ -119,7 +119,7 @@ vsi_bool is_item_in_array
enum { NAME##_INPUT_COUNT = INPUT_COUNT, \
NAME##_OUTPUT_COUNT = OUTPUT_COUNT, \
NAME##_IO_COUNT = NAME##_INPUT_COUNT + NAME##_OUTPUT_COUNT}; \
static const struct {vsi_nn_type_e types[NAME##_IO_COUNT];} \
static const struct {int types[NAME##_IO_COUNT];} \
NAME##_supported_io_types[] = {

#define DECL_OP_CONSTRAINT_REG(NAME) \

@@ -438,6 +438,7 @@ static inline vsi_status float32_to_dtype
case VSI_NN_TYPE_UINT8:
case VSI_NN_TYPE_INT16:
case VSI_NN_TYPE_INT32:
case VSI_NN_TYPE_UINT32:
{
int32_t dst_value = 0;
switch( dst_dtype->qnt_type )

@@ -165,6 +165,8 @@ struct _vsi_nn_graph
* so please keep it NULL.*/
vsi_nn_tensor_t* tensor;
} complete_signal;

vsi_bool isAllowFastMode;
};

/**

@@ -716,6 +718,16 @@ OVXLIB_API vsi_status vsi_nn_SetGraphPriority
uint32_t priority
);

OVXLIB_API vsi_status vsi_nn_SetGraphFastMode
(
vsi_nn_graph_t* graph,
vsi_bool fastmode
);

OVXLIB_API vsi_bool vsi_nn_IsGraphFastMode
(
const vsi_nn_graph_t* graph
);
#ifdef __cplusplus
}
#endif

@@ -164,6 +164,12 @@
#include "ops/vsi_nn_op_resize_1d_bilinear_internal.h"
#include "ops/vsi_nn_op_resize_1d_nearest_internal.h"
#include "ops/vsi_nn_op_upsamplescale.h"
#include "ops/vsi_nn_op_groupnormalize.h"
#include "ops/vsi_nn_op_sequence_mask.h"
#include "ops/vsi_nn_op_repeat.h"
#include "ops/vsi_nn_op_one_hot.h"
#include "ops/vsi_nn_op_nms.h"
#include "ops/vsi_nn_op_grouped_conv1d.h"
/* custom node head define define */
#include "custom/vsi_nn_custom_node_type.h"

@@ -314,6 +320,12 @@ typedef union _vsi_nn_nn_param
vsi_nn_resize_1d_bilinear_internal_param resize_1d_bilinear_internal;
vsi_nn_resize_1d_nearest_internal_param resize_1d_nearest_internal;
vsi_nn_upsamplescale_param upsamplescale;
vsi_nn_groupnormalize_param groupnorm;
vsi_nn_sequence_mask_param sequence_mask;
vsi_nn_repeat_param repeat;
vsi_nn_one_hot_param one_hot;
vsi_nn_nms_param nms;
vsi_nn_grouped_conv1d_param grouped_conv1d;
uint8_t client_param[128];

/* custom node data struct define */

@@ -33,7 +33,7 @@ extern "C"{

#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 1
#define VSI_NN_VERSION_PATCH 30
#define VSI_NN_VERSION_PATCH 32
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
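
Note: with the patch level bumped from 30 to 32, the packed macro evaluates to 1 * 10000 + 1 * 100 + 32 = 10132. An illustrative compile-time check of that value (not part of the commit):

    #if VSI_NN_VERSION != 10132
    #error "expected ovxlib 1.1.32 (packed as 10132)"
    #endif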

@@ -1,6 +1,4 @@
#include "cl_viv_vx_ext.h"

__kernel void vxcTopk(
__kernel void testop(
__read_only image2d_array_t input,
__write_only image2d_array_t output)
{

@@ -0,0 +1,194 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdlib.h>
#include <math.h>
#include "vsi_nn_types.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_test.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
//#include "libnnext/vx_lib_nnext.h"

#define _CPU_ARG_NUM (1)
#define _CPU_INPUT_NUM (1)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME ("com.vivantecorp.extension.CustomSoftmaxVXC")

#define SCALAR_INPUT_AXIS (2)

__BEGIN_DECLS

DEF_KERNEL_EXECUTOR(_softmax_exec)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t* param,
size_t param_size
)
{
vsi_status status = VX_SUCCESS;
float* buffer[_CPU_IO_NUM] = { NULL };
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL };
uint32_t i = 0;
uint32_t out_elements;
int32_t sf_axis;
float fMax = 0.0;
float fProbSum = 0.0f;

tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];

attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );

status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &sf_axis);
CHECK_STATUS_FAIL_GOTO(status, final );

out_elements = (uint32_t)vsi_nn_kernel_tensor_attr_get_size( attr[1] );

/* alloc the float32 data buffer */
buffer[1] = (float *)malloc(out_elements * sizeof(float));
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final );
memset(buffer[1], 0, out_elements * sizeof(float));

buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final );

/* Softmax implement */
for ( i = 0; i < out_elements; i++)
{
fMax = buffer[0][i] > fMax ? buffer[0][i] : fMax;
}

for ( i = 0; i < out_elements; i++)
{
buffer[1][i] = (float)expf(buffer[0][i] - fMax);
fProbSum += buffer[1][i];
}
for ( i = 0; i < out_elements; i++)
{
buffer[1][i] = buffer[1][i] / fProbSum;
}
status = vsi_nn_kernel_tensor_write_from_float(
tensors[1], attr[1], buffer[1], out_elements );

final:
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
if( buffer[i] )
{
free( buffer[i] );
}
vsi_nn_kernel_tensor_attr_release( &attr[i] );
}
return status;
}

static vx_param_description_t kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};

static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_softmax_exec,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};

static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
return VSI_SUCCESS;
}

static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_SUCCESS;
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t axis = 0;

axis = vsi_nn_kernel_param_get_int32(params, "axis");

status = _query_kernel( inputs, outputs, kernel );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(
graph, I32, &axis );

/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_AXIS] );
}
else
{
status = VSI_FAILURE;
}
}
return node;
} /* _setup() */


__END_DECLS

REGISTER_BACKEND_CPU( custom_softmax, _setup )
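
Note: _softmax_exec computes the numerically stable form of softmax — the running maximum fMax is subtracted before expf, i.e. softmax(x_i) = exp(x_i - max_j x_j) / sum_k exp(x_k - max_j x_j), so the exponentials cannot overflow. One caveat visible in the code: fMax starts at 0.0 rather than at the first element, so all-negative inputs are effectively maxed against zero.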

@@ -34,6 +34,7 @@ __kernel void Softmax2VXC
}

float fProbSum = 0.0f;
vxc_short8 dst;
for (int i = 0; i < sf_size; i++)
{
vxc_char8 val;

@@ -47,7 +48,8 @@ __kernel void Softmax2VXC
fProbSum += fOut;
half hVal;
_viv_asm(CONV,hVal,fOut);
VXC_WriteImage2DArray(output, coord_in, hVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY,dst,hVal, 4);
VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}

for (int i = 0; i < sf_size; i++)

@@ -63,7 +65,8 @@ __kernel void Softmax2VXC
float fOut =fval/fProbSum;
half hVal;
_viv_asm(CONV,hVal,fOut);
VXC_WriteImage2DArray(output, coord_in, hVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY,dst,hVal, 4);
VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}

}

@@ -0,0 +1,202 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdlib.h>
#include <math.h>
#include "vsi_nn_types.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_test.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
//#include "libnnext/vx_lib_nnext.h"

#define _CPU_ARG_NUM (1)
#define _CPU_INPUT_NUM (1)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME ("com.vivantecorp.extension.Softmax2VXC")

#define SCALAR_INPUT_AXIS (2)

__BEGIN_DECLS

static vx_param_description_t kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};
#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def)

DEF_KERNEL_INITIALIZER(_softmax_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t* param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
int sf_size = 0;
vsi_nn_kernel_tensor_attr_t* attr = NULL;
// Alignment with a power of two value.
gpu_param_t gpu_param = {
2, // workdim
{0, 0, 0}, // global_offset: control the start location be processed in the image
{0, 0, 0}, // global_scale: how many pixels could be processed by a single thread
{0, 0, 0}, // local_size: local group size in thread
{0, 0, 0}}; // global_size: image size in thread

attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
if (!attr)
{
VSILOGE("Query failure! at line");
return status;
}

sf_size = attr->shape->data[0];

gpu_param.global_offset[0] = 0;
gpu_param.global_offset[1] = 0;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.local_size[0] = 1;
gpu_param.local_size[1] = 1;
gpu_param.global_size[0] =
gpu_align_p2((1 + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0],
gpu_param.local_size[0]);
gpu_param.global_size[1] =
gpu_align_p2((1 + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1],
gpu_param.local_size[1]);
{
gpu_dp_inst_t Uni4x4_Fp16ToFp32 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};

status = vsi_nn_kernel_gpu_add_param( node,
"Uni4x4_Fp16ToFp32", &Uni4x4_Fp16ToFp32 );
vsi_nn_kernel_gpu_add_param(node,
"sf_size", &sf_size);
}

status = vsi_nn_kernel_gpu_config( node, &gpu_param );

if(status != VSI_SUCCESS)
{
VSILOGE("Initializer failure!");
}
if (attr) vsi_nn_kernel_tensor_attr_release( &attr );

return status;
}

static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
NULL,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
_softmax_initializer,
vsi_nn_KernelDeinitializer
};

static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );

vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
"custom_softmax" );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
"custom_softmax" );
return VSI_SUCCESS;
}

static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_SUCCESS;
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t axis = 0;

axis = vsi_nn_kernel_param_get_int32(params, "axis");

status = _query_kernel( inputs, outputs, kernel );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(
graph, I32, &axis );

/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_AXIS] );
}
else
{
status = VSI_FAILURE;
}
}
return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_EVIS( custom_softmax, _setup )
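
Note: gpu_align_p2(n, align) presumably rounds n up to a multiple of a power-of-two align — the same idiom as the gcmALIGN macro in the removed kernel file below:

    /* Round n up to a multiple of a power-of-two alignment (cf. gcmALIGN). */
    #define ALIGN_P2(n, align) (((n) + ((align) - 1)) & ~((align) - 1))
    /* e.g. ALIGN_P2(5, 4) == (5 + 3) & ~3 == 8 */

With global_scale and local_size both 1 in the initializer above, the aligned global size works out to a single thread per dimension.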
|
||||
|
|
@ -1,231 +0,0 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_platform.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_node.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_test.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
#include "client/vsi_nn_vxkernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
#define _VX_KERNEL_ID VX_KERNEL_ID(CUSTOM_SOFTMAX)
|
||||
#define _VX_KERNEL_VAR_CPU (vx_client_kernel_CUSTOM_SOFTMAX_CPU)
|
||||
#define _VX_KERNEL_VAR_VX (vx_client_kernel_CUSTOM_SOFTMAX_VX)
|
||||
#define _VX_KERNEL_NAME ("com.vivantecorp.extension.CustomSoftmaxVXC")
|
||||
#define _VX_KERNEL_FUNC_KERNEL (vxCustomSoftmaxKernel)
|
||||
|
||||
static vsi_status VX_CALLBACK vxCustomSoftmaxKernel
|
||||
(
|
||||
vx_node node,
|
||||
const vx_reference* paramObj,
|
||||
uint32_t paramNum
|
||||
)
|
||||
{
|
||||
vsi_status status = VX_SUCCESS;
|
||||
vx_tensor input = NULL,output = NULL;
|
||||
float *f32_in_buffer = NULL,*f32_out_buffer=NULL;
|
||||
vx_context context = NULL;
|
||||
vsi_nn_tensor_attr_t in_attr,out_attr;
|
||||
uint32_t i,in_elements,out_elements;
|
||||
int32_t sf_axis;
|
||||
float fMax = 0.0;
|
||||
float fProbSum = 0.0f;
|
||||
|
||||
context = vxGetContext((vx_reference)node);
|
||||
input = (vx_tensor)paramObj[0];
|
||||
output = (vx_tensor)paramObj[1];
|
||||
vxCopyScalar((vx_scalar)paramObj[2], &(sf_axis),VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
|
||||
/* Fill input & output attribute data struct */
|
||||
status = vsi_nn_vxGetTensorAttr(input, &in_attr);
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
status = vsi_nn_vxGetTensorAttr(output, &out_attr);
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
|
||||
in_elements = vsi_nn_vxGetTensorElementNum(&in_attr);
|
||||
out_elements = vsi_nn_vxGetTensorElementNum(&out_attr);
|
||||
|
||||
/* alloc the float32 data buffer */
|
||||
f32_in_buffer = (float *)malloc(in_elements * sizeof(float));
|
||||
f32_out_buffer= (float *)malloc(out_elements * sizeof(float));
|
||||
memset(f32_in_buffer, 0, in_elements * sizeof(float));
|
||||
memset(f32_out_buffer, 0, out_elements * sizeof(float));
|
||||
|
||||
/* Copy tensor to buffer, and convert bufer to float32 format */
|
||||
status = vsi_nn_vxConvertTensorToFloat32Data(
|
||||
context, input, &in_attr, f32_in_buffer, in_elements * sizeof(float));
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
|
||||
/* Softmax implement */
|
||||
for ( i = 0; i < out_elements; i++)
|
||||
{
|
||||
fMax = f32_in_buffer[i] > fMax ? f32_in_buffer[i] : fMax;
|
||||
}
|
||||
|
||||
for ( i = 0; i < out_elements; i++)
|
||||
{
|
||||
f32_out_buffer[i] = (float)expf(f32_in_buffer[i] - fMax);
|
||||
fProbSum += f32_out_buffer[i];
|
||||
}
|
||||
for ( i = 0; i < out_elements; i++)
|
||||
{
|
||||
f32_out_buffer[i] = f32_out_buffer[i]/ fProbSum;
|
||||
}
|
||||
status = vsi_nn_vxConvertFloat32DataToTensor(
|
||||
context, output, &out_attr, f32_out_buffer, out_elements * sizeof(float));
|
||||
|
||||
final:
|
||||
if(f32_in_buffer)free(f32_in_buffer);
|
||||
if(f32_out_buffer)free(f32_out_buffer);
|
||||
return status;
|
||||
}
|
||||
|
||||
static vx_status VX_CALLBACK vxCustomSoftmaxInitializer
|
||||
(
|
||||
vx_node nodObj,
|
||||
const vx_reference *paramObj,
|
||||
vx_uint32 paraNum
|
||||
)
|
||||
{
|
||||
vx_status status = VX_SUCCESS;
|
||||
/*TODO: Add initial code for VX program*/
|
||||
// Alignment with a power of two value.
|
||||
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
|
||||
vx_kernel_execution_parameters_t shaderParam = {
|
||||
2, // workdim
|
||||
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
|
||||
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
|
||||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
int input_size[6] = {1, 1, 1, 1, 1, 1};
|
||||
int sf_size;
|
||||
uint32_t input_dims;
|
||||
uint32_t i;
|
||||
vsi_nn_tensor_attr_t input_attr;
|
||||
|
||||
memset(&input_attr, 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
|
||||
status = vsi_nn_vxGetTensorAttr((vx_tensor)paramObj[0], &input_attr);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
|
||||
return status;
|
||||
}
|
||||
|
||||
input_dims = input_attr.dim_num;
|
||||
for (i = 0; i < input_dims; i++)
|
||||
{
|
||||
input_size[i] = input_attr.size[i];
|
||||
}
|
||||
|
||||
sf_size = input_size[0];
|
||||
|
||||
shaderParam.globalWorkOffset[0] = 0;
|
||||
shaderParam.globalWorkOffset[1] = 0;
|
||||
shaderParam.globalWorkScale[0] = 1;
|
||||
shaderParam.globalWorkScale[1] = 1;
|
||||
shaderParam.localWorkSize[0] = 1;
|
||||
shaderParam.localWorkSize[1] = 1;
|
||||
shaderParam.globalWorkSize[0] =
|
||||
gcmALIGN((1 + shaderParam.globalWorkScale[0] - 1) / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]);
|
||||
shaderParam.globalWorkSize[1] =
|
||||
gcmALIGN((1 + shaderParam.globalWorkScale[1] - 1) / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]);
|
||||
{
|
||||
vx_uint32 Uni4x4_Fp16ToFp32[16] = {
|
||||
0x01010101, // TCfg
|
||||
0x00000000, // ASelt
|
||||
0x00010000, 0x00030002, // ABin
|
||||
0x02020202, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000400, // AccumType, ConstantType, and PostShift
|
||||
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
|
||||
};
|
||||
|
||||
vxSetNodeUniform(nodObj, "Uni4x4_Fp16ToFp32", 1, Uni4x4_Fp16ToFp32);
|
||||
vxSetNodeUniform(nodObj, "sf_size", 1, &sf_size);
|
||||
}
|
||||
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
|
||||
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
|
||||
|
||||
if(status < 0)
|
||||
{
|
||||
VSILOGE("Initializer failure!");
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
static vx_param_description_t s_params[] =
|
||||
{
|
||||
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
|
||||
{ VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
|
||||
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
|
||||
};
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
vx_kernel_description_t _VX_KERNEL_VAR_CPU =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
_VX_KERNEL_NAME,
|
||||
_VX_KERNEL_FUNC_KERNEL,
|
||||
s_params,
|
||||
_cnt_of_array( s_params ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vsi_nn_KernelInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t _VX_KERNEL_VAR_VX =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
_VX_KERNEL_NAME,
|
||||
NULL,
|
||||
s_params,
|
||||
_cnt_of_array( s_params ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxCustomSoftmaxInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t * vx_kernel_CUSTOM_SOFTMAX_list[] =
|
||||
{
|
||||
&_VX_KERNEL_VAR_CPU,
|
||||
&_VX_KERNEL_VAR_VX,
|
||||
NULL
|
||||
};
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@ -0,0 +1,102 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
#include <stdlib.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_platform.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_node.h"
|
||||
#include "vsi_nn_ops.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
|
||||
static vsi_status op_compute
|
||||
(
|
||||
vsi_nn_node_t * self,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
vsi_nn_tensor_t ** outputs
|
||||
)
|
||||
{
|
||||
vsi_nn_kernel_param_t * param = NULL;
|
||||
vsi_nn_custom_softmax_param * p;
|
||||
p = &(self->nn_param.custom_softmax);
|
||||
|
||||
param = vsi_nn_kernel_param_create();
|
||||
vsi_nn_kernel_param_add_int32( param, "axis", p->axis );
|
||||
|
||||
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
|
||||
"custom_softmax",
|
||||
inputs, 1,
|
||||
outputs, 1, param );
|
||||
|
||||
vsi_nn_kernel_param_release( ¶m );
|
||||
|
||||
return VSI_SUCCESS;
|
||||
} /* op_compute() */
|
||||
|
||||
static vsi_bool op_check
|
||||
(
|
||||
vsi_nn_node_t * self,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
vsi_nn_tensor_t ** outputs
|
||||
)
|
||||
{
|
||||
/*TODO: Check params. */
|
||||
return TRUE;
|
||||
} /* op_check() */
|
||||
|
||||
static vsi_bool op_setup
|
||||
(
|
||||
vsi_nn_node_t * node,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
vsi_nn_tensor_t ** outputs
|
||||
)
|
||||
{
|
||||
if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
|
||||
{
|
||||
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
|
||||
memmove(outputs[0]->attr.size, inputs[0]->attr.size,
|
||||
inputs[0]->attr.dim_num * sizeof(uint32_t));
|
||||
}
|
||||
return TRUE;
|
||||
} /* op_setup() */
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
/* Registrar */
|
||||
DEF_OP_REG
|
||||
(
|
||||
/* op_name */ CUSTOM_SOFTMAX,
|
||||
/* init */ NULL,
|
||||
/* compute */ op_compute,
|
||||
/* deinit */ vsi_nn_op_common_deinit,
|
||||
/* check */ op_check,
|
||||
/* setup */ op_setup,
|
||||
/* optimize */ NULL,
|
||||
/* input_num */ 1,
|
||||
/* output_num */ 1
|
||||
);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@ -1,299 +0,0 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
#include <stdlib.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_platform.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_node.h"
|
||||
#include "vsi_nn_ops.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "client/vsi_nn_vxkernel.h"
|
||||
|
||||
#define _ARG_NUM (1)
|
||||
#define _INPUT_NUM (1)
|
||||
#define _OUTPUT_NUM (1)
|
||||
#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
|
||||
#define _PARAM_NUM (_ARG_NUM + _IO_NUM)
|
||||
|
||||
extern vx_kernel_description_t * vx_kernel_CUSTOM_SOFTMAX_list[];
|
||||
|
||||
static void _set_inputs_outputs
|
||||
(
|
||||
vx_reference * params,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
vsi_nn_tensor_t ** outputs
|
||||
)
|
||||
{
|
||||
uint32_t i;
|
||||
uint32_t cnt;
|
||||
|
||||
/* Set inputs */
|
||||
cnt = 0;
|
||||
for( i = 0; i < _INPUT_NUM; i ++, cnt ++ )
|
||||
{
|
||||
params[cnt] = (vx_reference)inputs[i]->t;
|
||||
}
|
||||
|
||||
/* Set outputs */
|
||||
for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ )
|
||||
{
|
||||
params[cnt] = (vx_reference)outputs[i]->t;
|
||||
}
|
||||
} /* _set_inputs_outputs() */
|
||||
|
||||
static vsi_status _create_params
|
||||
(
|
||||
vsi_nn_node_t * node,
|
||||
vx_reference * params,
|
||||
uint32_t num
|
||||
)
|
||||
{
|
||||
vsi_status status;
|
||||
vx_context ctx;
|
||||
vsi_nn_custom_softmax_param * p;
|
||||
if( 0 == num )
|
||||
{
|
||||
return VSI_SUCCESS;
|
||||
}
|
||||
memset( params, 0, sizeof( vx_reference * ) * num );
|
||||
p = &(node->nn_param.custom_softmax);
|
||||
ctx = vxGetContext( (vx_reference)node->graph->g );
|
||||
/* Init parameters */
|
||||
#define _SET_PARAM( i, type, arg ) do{ \
|
||||
params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \
|
||||
status = vxGetStatus( params[i] ); \
|
||||
if( VSI_SUCCESS != status ) { \
|
||||
goto set_param_error; \
|
||||
} \
|
||||
} while(0)
|
||||
_SET_PARAM( 0, VX_TYPE_INT32, axis );
|
||||
#undef _SET_PARAM
|
||||
set_param_error:
|
||||
|
||||
return status;
|
||||
} /* _create_params */
|
||||
|
||||
static void _release_params
|
||||
(
|
||||
vx_reference * params,
|
||||
uint32_t num
|
||||
)
|
||||
{
|
||||
uint32_t i;
|
||||
vx_scalar scalar;
|
||||
for( i = 0; i < num; i ++ )
|
||||
{
|
||||
scalar = (vx_scalar)params[i];
|
||||
vxReleaseScalar( &scalar );
|
||||
}
|
||||
} /* _release_params() */
|
||||
|
||||
static vsi_status cpu_op_compute
|
||||
(
|
||||
vsi_nn_node_t * self,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
vsi_nn_tensor_t ** outputs
|
||||
)
{
    vsi_status status = VSI_SUCCESS;
    vx_reference params[_PARAM_NUM];
    vx_reference * args;

    args = &params[_IO_NUM];

    if( NULL == self->n )
    {
        return VSI_FAILURE;
    }

    /* Set inputs and outputs */
    _set_inputs_outputs( params, inputs, outputs );

    /* Init parameters. */
    _create_params( self, args, _ARG_NUM );

    /* Pass parameters to node. */
    status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM );

    _release_params( args, _ARG_NUM );

    return status;
}

static vsi_status vx_op_compute
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
{
    vsi_status status = VSI_SUCCESS;
    vx_reference params[_PARAM_NUM];
    vx_reference * args;
    //vsi_nn_tensor_attr_t attr;

    args = &params[_IO_NUM];

    if( NULL == self->n )
    {
        return VSI_FAILURE;
    }

    /* Set inputs and outputs */
    _set_inputs_outputs( params, inputs, outputs );

    /*TODO: Add code if need to change your parameter*/
    /* Init parameters. */
    _create_params( self, args, _ARG_NUM );
#if 0
    memcpy(&attr, &(inputs[0]->attr), sizeof(vsi_nn_tensor_attr_t));
    attr.size[0] = attr.size[0];
    attr.size[1] = 1;
    attr.dim_num = 2;
    params[0] = (vx_reference)vxReshapeTensor(inputs[0]->t, (int32_t*)(attr.size), attr.dim_num);
    params[1] = (vx_reference)vxReshapeTensor(outputs[0]->t, (int32_t*)(attr.size), attr.dim_num);
#endif

    /* Pass parameters to node. */
    status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM );

    _release_params( args, _ARG_NUM );
#if 0
    vxReleaseTensor((vx_tensor*)&params[0]);
    vxReleaseTensor((vx_tensor*)&params[1]);
#endif
    return status;
}

static vsi_nn_op_compute_t op_compute_list[] =
{
    cpu_op_compute,
    vx_op_compute,
    NULL
};

static vsi_status op_compute
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
{
    vsi_status status;
    vsi_nn_kernel_info_t kernel_info;
    char *path = NULL;

    memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t));
    status = VSI_FAILURE;
    kernel_info.type = VX_KERNEL_TYPE_CPU;
    kernel_info.kernel = vx_kernel_CUSTOM_SOFTMAX_list;
    kernel_info.resource_num = 1;
    kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *));
    kernel_info.resource_name[0] = "vsi_nn_kernel_custom_softmax";
    path = getenv("USER_VX_SOURCE_PATH");
    if(path)
    {
        vsi_nn_VxResourceSetPath(path);
    }

    if( kernel_info.type == VX_KERNEL_TYPE_VX)
    {
        kernel_info.kernel_index = 1;
        kernel_info.init_index = 1;
    }
    else /*kernel_info.type == VX_KERNEL_TYPE_CPU*/
    {
        kernel_info.kernel_index = 0;
        kernel_info.init_index = 0;
    }

    self->n = vsi_nn_RegisterClientKernelAndNewNode(
        self->graph, &kernel_info);
    if (kernel_info.resource_name)
    {
        free(kernel_info.resource_name);
    }
    if( NULL == self->n )
    {
        return VSI_FAILURE;
    }
    if (NULL != op_compute_list[kernel_info.init_index])
    {
        status = op_compute_list[kernel_info.init_index](self, inputs, outputs);
    }

    return status;
} /* op_compute() */

static vsi_bool op_check
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
{
    /*TODO: Check input tensor shapes. */
    return TRUE;
} /* op_check() */

static vsi_bool op_setup
    (
    vsi_nn_node_t * node,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
{
    /* TODO: Compute output tensor shape. */
    if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
    {
        outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
        outputs[0]->attr.size[0] = inputs[0]->attr.size[0];
        outputs[0]->attr.size[1] = inputs[0]->attr.size[1];
        outputs[0]->attr.size[2] = inputs[0]->attr.size[2];
        outputs[0]->attr.size[3] = inputs[0]->attr.size[3];
    }
    return TRUE;
} /* op_setup() */

#ifdef __cplusplus
extern "C" {
#endif
/* Registrar */
DEF_OP_REG
    (
    /* op_name    */ CUSTOM_SOFTMAX,
    /* init       */ NULL,
    /* compute    */ op_compute,
    /* deinit     */ vsi_nn_op_common_deinit,
    /* check      */ op_check,
    /* setup      */ op_setup,
    /* optimize   */ NULL,
    /* input_num  */ _INPUT_NUM,
    /* output_num */ _OUTPUT_NUM
    );
#ifdef __cplusplus
}
#endif
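For orientation (this note and the sketch below are editorial, not part of the commit): the op registered above is driven through ovxlib's generic node API, and kernel_info.init_index selects the matching entry in op_compute_list, so the hard-coded VX_KERNEL_TYPE_CPU always lands on cpu_op_compute. A minimal, hypothetical client-side sketch, assuming the op enum follows the usual DEF_OP naming and that input/output tensor ids already exist:

/* Editorial sketch only: wiring a CUSTOM_SOFTMAX node into a graph.
 * input_id/output_id are hypothetical tensor ids from vsi_nn_AddTensor();
 * error handling elided. */
vsi_nn_node_t * node = vsi_nn_AddNode( graph, VSI_NN_OP_CUSTOM_SOFTMAX,
        _INPUT_NUM, _OUTPUT_NUM, NULL );
node->input.tensors[0]  = input_id;
node->output.tensors[0] = output_id;
/* At compute time, op_compute() above registers the client kernel and
 * dispatches op_compute_list[0], the CPU reference implementation. */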
@ -183,26 +183,31 @@ static vsi_status _query_kernel
    vsi_nn_kernel_dtype_e output_dtype;
    vsi_status status = VSI_FAILURE;
    uint32_t key;
    int i;
    int32_t i;

    input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    if(input_dtype == I8)
    if (input_dtype == I8)
    {
        input_dtype = I32;
    }

    if (output_dtype == I16)
    {
        output_dtype = I32;
    }

    key = HASH_ARGMAX_KEY( axis, input_dtype, output_dtype, image_2d );

    for( i = 0; i < _cnt_of_array(kernel_map); i ++ )
    for ( i = 0; i < _cnt_of_array(kernel_map); i ++ )
    {
        if( kernel_map[i].key == key )
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }
    if( i < _cnt_of_array(kernel_map) )
    if ( i < _cnt_of_array(kernel_map) )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = kernel_param_def;

@ -237,7 +242,7 @@ static vsi_nn_kernel_node_t _setup

    axis = vsi_nn_kernel_param_get_int32(params, "axis");

    if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size,
    if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size,
        inputs[0]->attr.dim_num )
        || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
        outputs[0]->attr.dim_num )

@ -250,11 +255,11 @@ static vsi_nn_kernel_node_t _setup

    image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
    status = _query_kernel( inputs, outputs, axis, image_2d, kernel );
    if( VSI_SUCCESS == status)
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );

        if( node )
        if ( node )
        {
            vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
                inputs, 1, outputs, 1 );
@ -183,20 +183,26 @@ static vsi_status _query_kernel
    vsi_nn_kernel_dtype_e output_dtype;
    vsi_status status = VSI_FAILURE;
    uint32_t key;
    int i;
    int32_t i;

    input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    if (output_dtype == I16)
    {
        output_dtype = I32;
    }

    key = HASH_ARGMIN_KEY( axis, input_dtype, output_dtype, image_2d );

    for( i = 0; i < _cnt_of_array(kernel_map); i ++ )
    for ( i = 0; i < _cnt_of_array(kernel_map); i ++ )
    {
        if( kernel_map[i].key == key )
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }
    if( i < _cnt_of_array(kernel_map) )
    if ( i < _cnt_of_array(kernel_map) )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = kernel_param_def;

@ -231,7 +237,7 @@ static vsi_nn_kernel_node_t _setup

    axis = vsi_nn_kernel_param_get_int32(params, "axis");

    if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size,
    if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size,
        inputs[0]->attr.dim_num )
        || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
        outputs[0]->attr.dim_num )

@ -244,11 +250,11 @@ static vsi_nn_kernel_node_t _setup

    image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
    status = _query_kernel( inputs, outputs, axis, image_2d, kernel );
    if( VSI_SUCCESS == status)
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );

        if( node )
        if ( node )
        {
            vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
                inputs, 1, outputs, 1 );
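Both the argmax and argmin hunks apply the same two fixes: the loop index is widened from int to int32_t for a fixed-width match with the rest of the backend, and narrow dtypes (I8 input, I16 output) are remapped to I32 because CL kernel variants are only compiled for the wider types. The lookup they feed is the standard kernel-map scan; a sketch with a hypothetical key packing (the real HASH_ARGMAX_KEY/HASH_ARGMIN_KEY macros are defined earlier in these files and are not shown in these hunks):

/* Editorial sketch of the kernel-map scan both backends use.
 * The bit layout below is hypothetical; see the real HASH_*_KEY macros. */
uint32_t key = (axis << 20) | (input_dtype << 12) | (output_dtype << 4) | image_2d;
int32_t i;
for ( i = 0; i < _cnt_of_array(kernel_map); i ++ )
{
    if ( kernel_map[i].key == key )
    {
        break;   /* a compiled CL variant exists for this combination */
    }
}
/* i == _cnt_of_array(kernel_map) means no variant: _query_kernel fails. */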
@ -186,7 +186,7 @@ static vsi_status _query_kernel
    {
        in_dtype = F32;
    }
    else if ((I8 == in_dtype) || (I16 == in_dtype))
    else if ((I8 == in_dtype) || (BOOL8 == in_dtype) || (I16 == in_dtype))
    {
        in_dtype = I32;
    }
@ -289,6 +289,12 @@ static vsi_status _query_kernel
    input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && output_dtype == I8)
    {
        output_dtype = BOOL8;
    }

    key = HASH_COMPARISONS_KEY( operation, input0_dtype, input1_dtype, output_dtype, image_2d );

    for( i = 0; i < _cnt_of_array(_comparisons_cl_kernel_map); i ++ )
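The new guard reroutes un-quantized INT8 outputs to the BOOL8 kernels: a comparison only ever produces 0 or 1, so an I8 output tensor with no quantization parameters is semantically a boolean tensor. A sketch of the per-element contract the selected kernels share (illustrative C, not the OpenCL source):

/* Editorial sketch: the element-wise contract behind the comparison
 * kernels. BOOL8 outputs are exactly 0 or 1, with no quantization. */
uint8_t compare_greater(float a, float b)
{
    return (a > b) ? 1 : 0;
}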
@ -48,6 +48,7 @@ typedef enum
    UNARY_NEG,
    UNARY_HSIGMOID,
    UNARY_MISH,
    UNARY_ROUND,
} unary_type_e;

/*

@ -91,7 +92,8 @@ typedef enum
#define ELU_OPERATION       elu
#define NEG_OPERATION       neg
#define HSIGMOID_OPERATION  hard_sigmoid
#define MISH_OPERATION      mish
#define ROUND_OPERATION     round

static const struct {
    uint32_t key;

@ -113,6 +115,8 @@ static const struct {
    TENSOR_UNARY_KERNELS_FLOAT(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16)
    TENSOR_UNARY_KERNELS_FLOAT(MISH_OPERATION, UNARY_MISH, F32, F32)
    TENSOR_UNARY_KERNELS_FLOAT(MISH_OPERATION, UNARY_MISH, F16, F16)
    TENSOR_UNARY_KERNELS_FLOAT(ROUND_OPERATION, UNARY_ROUND, F32, F32)
    TENSOR_UNARY_KERNELS_FLOAT(ROUND_OPERATION, UNARY_ROUND, F16, F16)

    TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F32, F32)
    TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F16, F16)

@ -128,6 +132,8 @@ static const struct {
    TENSOR_UNARY_KERNELS_FLOAT_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16)
    TENSOR_UNARY_KERNELS_FLOAT_2D(MISH_OPERATION, UNARY_MISH, F32, F32)
    TENSOR_UNARY_KERNELS_FLOAT_2D(MISH_OPERATION, UNARY_MISH, F16, F16)
    TENSOR_UNARY_KERNELS_FLOAT_2D(ROUND_OPERATION, UNARY_ROUND, F32, F32)
    TENSOR_UNARY_KERNELS_FLOAT_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16)

    TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, U8, U8)
    TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, U8, U8)

@ -136,6 +142,7 @@ static const struct {
    TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, U8, U8)
    TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
    TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, U8, U8)
    TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, U8, U8)

    TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8)
    TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8)

@ -144,6 +151,7 @@ static const struct {
    TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8)
    TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
    TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8)

    TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, I32, I32)

@ -157,6 +165,7 @@ static const struct {
#undef NEG_OPERATION
#undef HSIGMOID_OPERATION
#undef MISH_OPERATION
#undef ROUND_OPERATION
/*
 * Kernel params
 */

@ -407,5 +416,5 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( elu, UNARY_ELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( neg, UNARY_NEG )
REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_sigmoid, UNARY_HSIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_CL( mish, UNARY_MISH )

REGISTER_ELTWISE_UNARY_BACKEND_CL( round, UNARY_ROUND )
__END_DECLS
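These entries register ROUND through the same macro family as the other unary ops: float variants run the op directly, and the U8 variants fold the quantization conversion around it. A reference sketch, assuming round-half-away-from-zero semantics (the usual behavior of these kernels; the OpenCL source itself is not shown in this commit):

#include <math.h>

/* Editorial sketch of the assumed UNARY_ROUND reference semantics. */
float unary_round_f32(float x)
{
    return roundf(x);   /* halves round away from zero */
}

/* U8 variant: dequantize, apply the op, requantize. The scale/tail/zp
 * parameters here are hypothetical, mirroring the other unary kernels. */
uint8_t unary_round_u8(uint8_t q, float in_scale, float in_tail,
                       float out_scale, float out_zp)
{
    float x = (float)q * in_scale - in_tail;
    return (uint8_t)(roundf(x) * out_scale + out_zp);
}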
@ -0,0 +1,328 @@
/****************************************************************************
*
*    Copyright (c) 2020 Vivante Corporation
*
*    Permission is hereby granted, free of charge, to any person obtaining a
*    copy of this software and associated documentation files (the "Software"),
*    to deal in the Software without restriction, including without limitation
*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
*    and/or sell copies of the Software, and to permit persons to whom the
*    Software is furnished to do so, subject to the following conditions:
*
*    The above copyright notice and this permission notice shall be included in
*    all copies or substantial portions of the Software.
*
*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
*    DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
#define HASH_UNARY_KEY(_input_type, _output_type, _image_2d) \
    ( (_input_type << 12) | (_output_type << 4) | (_image_2d))

#define VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() \
    "erf"

#define HASH_UNARY_SH_KERNEL_NAME( SRC_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("cl.erf_"#SRC_TYPE"to"#DST_TYPE)

#define TENSOR_UNARY_KERNELS(SRC_TYPE, OUT_TYPE) \
    { HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 0), \
        HASH_UNARY_SH_KERNEL_NAME(SRC_TYPE, OUT_TYPE), \
        VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() },

#define HASH_UNARY_SH_KERNEL_2D_NAME(SRC_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("cl.erf_"#SRC_TYPE"to"#DST_TYPE"_2D")

#define TENSOR_UNARY_KERNELS_2D(SRC_TYPE, OUT_TYPE) \
    { HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 1), \
        HASH_UNARY_SH_KERNEL_2D_NAME(SRC_TYPE, OUT_TYPE), \
        VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() },

#define TENSOR_UNARY_KERNELS_FLOAT(SRC_TYPE, OUT_TYPE) \
    { HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 0), \
        HASH_UNARY_SH_KERNEL_NAME(F32, F32), \
        VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() },

#define TENSOR_UNARY_KERNELS_FLOAT_2D(SRC_TYPE, OUT_TYPE) \
    { HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 1), \
        HASH_UNARY_SH_KERNEL_2D_NAME(F32, F32), \
        VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() },

typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _erf_kernel_map[] =
{
    // Register kernel here
    TENSOR_UNARY_KERNELS_FLOAT(F32, F32)
    TENSOR_UNARY_KERNELS_FLOAT(F16, F16)

    TENSOR_UNARY_KERNELS_FLOAT_2D(F32, F32)
    TENSOR_UNARY_KERNELS_FLOAT_2D(F16, F16)

    TENSOR_UNARY_KERNELS(U8, U8)

    TENSOR_UNARY_KERNELS_2D(U8, U8)
};

/*
 * Kernel params
 */
static vx_param_description_t _erf_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define SCALAR_INPUT_SCALE   (2)
#define SCALAR_INPUT_TAIL    (3)
#define SCALAR_OUTPUT_SCALE  (4)
#define SCALAR_OUTPUT_ZP     (5)
#define _ERF_PARAM_NUM  _cnt_of_array( _erf_kernel_param_def )

/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_erf_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    gpu_param_t gpu_param = {
        3,          // workdim
        {0, 0, 0},  // globalWorkOffset: control the start location be processed in the image
        {0, 0, 0},  // globalWorkScale: how many pixels could be processed by a single thread
        {0, 0, 0},  // localWorkSize: local group size in thread
        {0, 0, 0}   // globalWorkSize: image size in thread
        };

    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
    vsi_int_array_t * out_shape = NULL;

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

    out_shape = attr[1]->shape;

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;
    gpu_param.global_size[0] = gpu_align_p2(
            (out_shape->data[0] + gpu_param.global_scale[0] - 1)
            / gpu_param.global_scale[0], 4);
    gpu_param.global_size[1] = (
            (out_shape->data[1] + gpu_param.global_scale[1] - 1)
            / gpu_param.global_scale[1]);
    gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );

final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
    SAFE_FREE_TENSOR_ATTR(attr[0]);
    SAFE_FREE_TENSOR_ATTR(attr[1]);
#undef SAFE_FREE_TENSOR_ATTR

    return status;
} /* _erf_initializer() */


/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs,
    vsi_bool image_2d
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e in_dtype;
    vsi_nn_kernel_dtype_e out_dtype;
    const _kernel_map_type * kernel_map = _erf_kernel_map;
    size_t kernel_map_size = _cnt_of_array( _erf_kernel_map );
    vx_param_description_t * param_def = _erf_kernel_param_def;
    vx_kernel_initialize_f initializer = _erf_initializer;

    uint32_t key;
    uint32_t i;

    in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    key = HASH_UNARY_KEY( in_dtype, out_dtype, image_2d );

    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
    {
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }

    if ( i < (uint32_t)kernel_map_size )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = param_def;
        kernel->info.numParams = _cnt_of_array( _erf_kernel_param_def );
        kernel->info.initialize = initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
                kernel_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
                kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }
    return status;
} /* _query_kernel() */


static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_ERF_PARAM_NUM] = {NULL};
    vsi_nn_kernel_node_t node = NULL;
    vsi_nn_tensor_t* rs_tensors[2] = { NULL };
    int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
    int32_t new_rank = 0;
    vsi_bool ret = FALSE;
    vsi_bool image_2d = FALSE;

    float inputScale = inputs[0]->attr.dtype.scale;
    float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale;
    float outputScale = outputs[0]->attr.dtype.scale;
    float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f;

    ret = vsi_nn_kernel_optimize_element_shape(
            (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num,
            shape, &new_rank );
    if ( ret )
    {
        rs_tensors[0] = vsi_nn_reshape_tensor( graph,
                inputs[0], (uint32_t*)shape, new_rank );
        rs_tensors[1] = vsi_nn_reshape_tensor( graph,
                outputs[0], (uint32_t*)shape, new_rank );
    }

    if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[0]->attr.size,
                rs_tensors[0]->attr.dim_num ) )
    {
        return NULL;
    }

    outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale;

    image_2d = (rs_tensors[0]->attr.dim_num == 2 || rs_tensors[0]->attr.size[2] == 1);

    status = _query_kernel( kernel, inputs, outputs, image_2d );
    if ( VSI_SUCCESS == status )
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _ERF_PARAM_NUM,
                    rs_tensors, 1, &rs_tensors[1], 1 );

            node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create(
                    graph, F32, &inputScale );
            node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(
                    graph, F32, &inputTail );
            node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create(
                    graph, F32, &outputScale );
            node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create(
                    graph, F32, &outputZP );

            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _ERF_PARAM_NUM );
            CHECK_STATUS_FAIL_GOTO( status, OnError );
        }
    }

OnError:
    if (rs_tensors[0])
    {
        vsi_nn_ReleaseTensor( &rs_tensors[0] );
    }

    if (rs_tensors[1])
    {
        vsi_nn_ReleaseTensor( &rs_tensors[1] );
    }

    if (node_params[SCALAR_INPUT_SCALE])
    {
        vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
    }

    if (node_params[SCALAR_INPUT_TAIL])
    {
        vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
    }

    if (node_params[SCALAR_OUTPUT_SCALE])
    {
        vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
    }

    if (node_params[SCALAR_OUTPUT_ZP])
    {
        vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
    }

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CL( erf, _setup )
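A note on the four scalars _setup passes (editorial, derived from the code above): inputTail pre-multiplies the zero point by the scale, and outputZP carries a +0.5f bias, so the kernel needs only one fused multiply-add on each side of erf plus a truncating conversion to get round-to-nearest. In effect:

#include <math.h>

/* Editorial sketch of the math the erf CL kernel computes with the
 * scalars from _setup() above; values and rounding follow that code. */
uint8_t erf_u8(uint8_t in_q, float inputScale, float inputTail,
               float outputScale, float outputZP)
{
    float x = (float)in_q * inputScale - inputTail;   /* dequantize       */
    float y = erff(x);                                /* the op           */
    return (uint8_t)(y * outputScale + outputZP);     /* +0.5 bias rounds */
}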
@ -68,11 +68,15 @@ static const _kernel_map_type _floordiv_kernel_map[] =
    // Register kernel here
    FLOORDIV_KERNELS( F32, F32, F32 )
    FLOORDIV_KERNELS( I32, I32, I32 )
    FLOORDIV_KERNELS( I32, I32, U8 )
    FLOORDIV_KERNELS( U8, U8, U8 )
    FLOORDIV_KERNELS( U8, I32, U8 )

    FLOORDIV_KERNELS_2D( F32, F32, F32 )
    FLOORDIV_KERNELS_2D( I32, I32, I32 )
    FLOORDIV_KERNELS_2D( I32, I32, U8 )
    FLOORDIV_KERNELS_2D( U8, U8, U8 )
    FLOORDIV_KERNELS_2D( U8, I32, U8 )
};

@ -311,4 +315,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( floordiv, _setup )
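For reference, the op behind these kernel-map entries is elementwise floor division; the new mixed-type rows (I32,I32 to U8 and U8,I32 to U8) reuse the same core with a requantization step on the output. A sketch of the core (editorial; the OpenCL source is not part of this hunk):

#include <math.h>

/* Editorial sketch: the elementwise contract behind FLOORDIV. */
float floordiv_f32(float x, float y)
{
    return floorf(x / y);
}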
@ -0,0 +1,760 @@
/****************************************************************************
*
*    Copyright (c) 2019 Vivante Corporation
*
*    Permission is hereby granted, free of charge, to any person obtaining a
*    copy of this software and associated documentation files (the "Software"),
*    to deal in the Software without restriction, including without limitation
*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
*    and/or sell copies of the Software, and to permit persons to whom the
*    Software is furnished to do so, subject to the following conditions:
*
*    The above copyright notice and this permission notice shall be included in
*    all copies or substantial portions of the Software.
*
*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
*    DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
typedef enum
{
    INTERNAL_KERNEL_SUM_SQR,
    INTERNAL_KERNEL_MEAN_VARI,
    INTERNAL_KERNEL_NORM,
} _internal_kernel_e;

#define KERNEL_SOURCE_1    "group_normalization_u8"
#define KERNEL_SOURCE_2    "group_normalization_f32"
#define KERNEL_SOURCE_3    "group_normalization_i32"

// Add kernel hashtable here
#define HASH_GROUPNORM_SUM_SQR_KERNEL_NAME(SRC0_TYPE) \
    CVIVANTE_NAMESPACE("cl.group_norm_sumsqr_"#SRC0_TYPE)

#define HASH_GROUPNORM_SUM_SQR_KERNEL_2D_NAME(SRC0_TYPE) \
    CVIVANTE_NAMESPACE("cl.group_norm_sumsqr_"#SRC0_TYPE"_2D")

#define HASH_GROUPNORM_MEAN_VARI_KERNEL_NAME \
    CVIVANTE_NAMESPACE("cl.group_norm_meanvari")

#define HASH_GROUPNORM_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("cl.group_norm_"#SRC0_TYPE"to"#DST_TYPE)

#define HASH_GROUPNORM_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("cl.group_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D")

// Add kernel hashtable here
// sum sqr
#define HASH_GROUPNORM_SUM_SQR_KEY(_input0_type, _output_type, _reshape_flag) \
    ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8))

#define TENSOR_GROUPNORM_SUM_SQR_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_GROUPNORM_SUM_SQR_KEY(IN0_TYPE, OUT_TYPE, 0), \
        HASH_GROUPNORM_SUM_SQR_KERNEL_NAME(IN0_TYPE), \
        SOURCE },

#define TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_GROUPNORM_SUM_SQR_KEY(IN0_TYPE, OUT_TYPE, 1), \
        HASH_GROUPNORM_SUM_SQR_KERNEL_2D_NAME(IN0_TYPE), \
        SOURCE },

#define HASH_GROUPNORM_MEAN_VARI_KEY(_input0_type, _output_type) \
    ((_input0_type << 24) | (_output_type << 16))

#define TENSOR_GROUPNORM_MEAN_VARI_KERNELS(SOURCE) \
    { HASH_GROUPNORM_MEAN_VARI_KEY(F32, F32), \
        HASH_GROUPNORM_MEAN_VARI_KERNEL_NAME, \
        SOURCE },

// normalization
#define HASH_GROUPNORM_KEY(_input0_type, _output_type, _reshape_flag) \
    ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8))

#define TENSOR_GROUPNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_GROUPNORM_KEY(IN0_TYPE, OUT_TYPE, 0), \
        HASH_GROUPNORM_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
        SOURCE },

#define TENSOR_GROUPNORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_GROUPNORM_KEY(IN0_TYPE, OUT_TYPE, 1), \
        HASH_GROUPNORM_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \
        SOURCE },

typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _groupnorm_sum_sqr_kernel_map[] =
{
    // Register kernel here
    TENSOR_GROUPNORM_SUM_SQR_KERNELS( U8, F32, KERNEL_SOURCE_1 )
    TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( U8, F32, KERNEL_SOURCE_1 )
    TENSOR_GROUPNORM_SUM_SQR_KERNELS( F32, F32, KERNEL_SOURCE_2 )
    TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( F32, F32, KERNEL_SOURCE_2 )
    TENSOR_GROUPNORM_SUM_SQR_KERNELS( I32, F32, KERNEL_SOURCE_3 )
    TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 )
};

static const _kernel_map_type _groupnorm_mean_vari_kernel_map[] =
{
    // Register kernel here
    TENSOR_GROUPNORM_MEAN_VARI_KERNELS( KERNEL_SOURCE_2 )
};

static const _kernel_map_type _groupnorm_kernel_map[] =
{
    // Register kernel here
    TENSOR_GROUPNORM_KERNELS( U8, U8, KERNEL_SOURCE_1 )
    TENSOR_GROUPNORM_KERNELS_2D( U8, U8, KERNEL_SOURCE_1 )
    TENSOR_GROUPNORM_KERNELS( U8, F32, KERNEL_SOURCE_1 )
    TENSOR_GROUPNORM_KERNELS_2D( U8, F32, KERNEL_SOURCE_1 )

    TENSOR_GROUPNORM_KERNELS( F32, F32, KERNEL_SOURCE_2 )
    TENSOR_GROUPNORM_KERNELS_2D( F32, F32, KERNEL_SOURCE_2 )

    TENSOR_GROUPNORM_KERNELS( I32, I32, KERNEL_SOURCE_3 )
    TENSOR_GROUPNORM_KERNELS_2D( I32, I32, KERNEL_SOURCE_3 )
    TENSOR_GROUPNORM_KERNELS( I32, F32, KERNEL_SOURCE_3 )
    TENSOR_GROUPNORM_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 )
};

/*
 * Kernel params
 */
static vx_param_description_t _groupnorm_sum_sqr_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _GROUPNORM_SUM_SQR_PARAM_NUM  _cnt_of_array( _groupnorm_sum_sqr_kernel_param_def )

static vx_param_description_t _groupnorm_mean_vari_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _GROUPNORM_MEAN_VARI_PARAM_NUM  _cnt_of_array( _groupnorm_mean_vari_kernel_param_def )

static vx_param_description_t _groupnorm_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _GROUPNORM_PARAM_NUM  _cnt_of_array( _groupnorm_kernel_param_def )

/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };

    vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
    vsi_int_array_t * input_shape = NULL;
    int32_t width = 0;
    int32_t chn = 0;

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

    input_shape = attr[0]->shape;
    width = input_shape->data[0];
    chn = attr[1]->shape->data[1];

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;
    gpu_param.local_size[0] = 16;
    gpu_param.local_size[1] = 1;
    gpu_param.local_size[2] = 1;
    gpu_param.global_size[0] = (width + 15) / 16 * 16;
    gpu_param.global_size[1] = chn;
    gpu_param.global_size[2] = 1;

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );
    CHECK_STATUS_FAIL_GOTO(status, final);

final:
    if (attr[0])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
        attr[0] = NULL;
    }
    if (attr[1])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[1] );
        attr[1] = NULL;
    }
    return status;
} /* _groupnorm_sum_sqr_initializer() */

DEF_KERNEL_INITIALIZER(_groupnorm_mean_vari_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };

    vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
    int32_t chn = 0;

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );

    chn = attr[0]->shape->data[1];

    gpu_param.global_scale[0] = 4;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;
    gpu_param.local_size[0] = 16;
    gpu_param.local_size[1] = 1;
    gpu_param.local_size[2] = 1;
    gpu_param.global_size[0] = 16;
    gpu_param.global_size[1] = chn;
    gpu_param.global_size[2] = 1;

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );
    CHECK_STATUS_FAIL_GOTO(status, final);

final:
    if (attr[0])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
        attr[0] = NULL;
    }
    return status;
} /* _groupnorm_mean_vari_initializer() */

DEF_KERNEL_INITIALIZER(_groupnorm_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };

    vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
    vsi_int_array_t * input_shape = NULL;
    int32_t width = 0;
    int32_t height = 0;
    int32_t chn = 0;
    int32_t is2D = 0;

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &is2D);
    CHECK_STATUS_FAIL_GOTO(status, final );

    input_shape = attr[0]->shape;
    width = input_shape->data[0];
    height = input_shape->data[1];
    chn = attr[1]->shape->data[1];

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;
    gpu_param.local_size[0] = 16;
    gpu_param.local_size[1] = 1;
    gpu_param.local_size[2] = 1;
    gpu_param.global_size[0] = (width + 15) / 16 * 16;
    gpu_param.global_size[1] = height;
    gpu_param.global_size[2] = chn;
    if (is2D)
    {
        gpu_param.global_size[0] = (width + 15) / 16 * 16;
        gpu_param.global_size[1] = chn;
        gpu_param.global_size[2] = 1;
    }

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );
    CHECK_STATUS_FAIL_GOTO(status, final);

final:
    if (attr[0])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
        attr[0] = NULL;
    }
    if (attr[1])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[1] );
        attr[1] = NULL;
    }
    return status;
} /* _groupnorm_initializer() */

/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    const uint32_t hashkey,
    _internal_kernel_e kernel_id
    /* Add extra params */
    )
{
    vsi_status status = VSI_FAILURE;
    vx_kernel_initialize_f initializer = NULL;
    vx_param_description_t * param_def = NULL;
    const _kernel_map_type* kernel_map;
    size_t kernel_map_size = 0;
    size_t param_size = 0;
    uint32_t i = 0;

    switch( kernel_id )
    {
        case INTERNAL_KERNEL_SUM_SQR:
            initializer = _groupnorm_sum_sqr_initializer;
            kernel_map = _groupnorm_sum_sqr_kernel_map;
            kernel_map_size = _cnt_of_array( _groupnorm_sum_sqr_kernel_map );
            param_def = _groupnorm_sum_sqr_kernel_param_def;
            param_size = _GROUPNORM_SUM_SQR_PARAM_NUM;
            break;
        case INTERNAL_KERNEL_MEAN_VARI:
            initializer = _groupnorm_mean_vari_initializer;
            kernel_map = _groupnorm_mean_vari_kernel_map;
            kernel_map_size = _cnt_of_array( _groupnorm_mean_vari_kernel_map );
            param_def = _groupnorm_mean_vari_kernel_param_def;
            param_size = _GROUPNORM_MEAN_VARI_PARAM_NUM;
            break;
        case INTERNAL_KERNEL_NORM:
            initializer = _groupnorm_initializer;
            kernel_map = _groupnorm_kernel_map;
            kernel_map_size = _cnt_of_array( _groupnorm_kernel_map );
            param_def = _groupnorm_kernel_param_def;
            param_size = _GROUPNORM_PARAM_NUM;
            break;
        default:
            VSI_ASSERT( FALSE );
            return VSI_FAILURE;
    }

    for( i = 0; i < kernel_map_size; i ++ )
    {
        if ( kernel_map[i].key == hashkey )
        {
            break;
        }
    }
    if ( i < kernel_map_size )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = param_def;
        kernel->info.numParams = (uint32_t)param_size;
        kernel->info.initialize = initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
                "eltwise_ops_helper",
                kernel_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
                kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }

    return status;
} /* _query_kernel() */

static int32_t _optimize_gn_shape_cl
    (
    vsi_nn_tensor_t ** inputs,
    int32_t group_size,
    int32_t group_num,
    int32_t* opt_shape,
    int32_t* is2D_flg
    )
{
    vsi_status status = VSI_SUCCESS;
    int32_t group_shape[VSI_NN_MAX_DIM_NUM] = {0};
    int32_t new_rank = 0;
    group_shape[0] = inputs[0]->attr.size[0];
    group_shape[1] = inputs[0]->attr.size[1];
    group_shape[2] = group_size;

    vsi_nn_kernel_optimize_element_shape(group_shape, 3, opt_shape, &new_rank );

    if (opt_shape[1] == 1)
    {
        opt_shape[1] = group_num;
        opt_shape[2] = 1;
        opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
        is2D_flg[0] = 1;
    }
    else if (new_rank == 2)
    {
        opt_shape[2] = group_num;
        opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
    }
    else
    {
        status = VSI_FAILURE;
    }

    return status;
}


static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
#define INTERNAL_KERNEL_SIZE    (2)
#define SUM_SQR_INDEX           (0)
#define MEAN_VARI_INDEX         (1)
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t sum_sqr_node_params[_GROUPNORM_SUM_SQR_PARAM_NUM] = { NULL };
    vsi_nn_kernel_node_param_t mean_vari_node_params[_GROUPNORM_MEAN_VARI_PARAM_NUM] = { NULL };
    vsi_nn_kernel_node_param_t node_params[_GROUPNORM_PARAM_NUM] = { NULL };
    vsi_nn_kernel_node_t tmp_node = NULL, tmp_node1 = NULL;
    vsi_nn_kernel_node_t node = NULL;
    vsi_nn_kernel_dtype_e in0_dtype = U8;
    vsi_nn_kernel_dtype_e out_dtype = U8;
    vsi_nn_tensor_attr_t attr;
    vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL };
    vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL };
    vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
    int32_t new_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 };
    int32_t is2D_flg = 0;
    uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 };
    uint32_t hashkey = 0;
    int32_t i = 0;
    float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
    int32_t group_num = vsi_nn_kernel_param_get_int32( params, "group_num" );
    int32_t group_size = inputs[0]->attr.size[2] / group_num;

    int32_t width = inputs[0]->attr.size[0];
    int32_t height = inputs[0]->attr.size[1];
    int32_t group_stride = 1;
    float input_zp = 0;
    float input_scale = 1.0f;
    int32_t input_fl = 0;
    float output_zp = 0;
    float output_scale = 1.0f;
    int32_t output_fl = 0;
    float rSpaceOrg = 1.0f / (width * height);
    float group_ratio = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] * group_size);

    if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
                outputs[0]->attr.dim_num ) )
    {
        return NULL;
    }

    status = _optimize_gn_shape_cl(inputs, group_size, group_num, new_shape, &is2D_flg);
    if ( VSI_SUCCESS != status )
    {
        goto final;
    }
    rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4);
    rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4);

    width = new_shape[0];
    height = is2D_flg > 0 ? 1 : new_shape[1];
    group_stride = ((width + 15) / 16) * 4;

    if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
    {
        input_zp = (float)inputs[0]->attr.dtype.zero_point;
        input_scale = inputs[0]->attr.dtype.scale;
    }
    else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
    {
        input_fl = inputs[0]->attr.dtype.fl;
        if (input_fl > 0)
        {
            input_scale = (1.0f / ((float) ((int64_t)1 << input_fl)));
        }
        else
        {
            input_scale = ((float) ((int64_t)1 << -input_fl));
        }
        input_zp = 0.0f;
    }

    if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
    {
        output_zp = (float)outputs[0]->attr.dtype.zero_point;
        output_scale = 1.0f / outputs[0]->attr.dtype.scale;
    }
    else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
    {
        output_fl = outputs[0]->attr.dtype.fl;
        if (output_fl > 0)
        {
            output_scale = (float)((int64_t)1 << output_fl);
        }
        else
        {
            output_scale = (1.0f / (float)((int64_t)1 << -output_fl));
        }
        output_zp = 0.0f;
    }

    for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )
    {
        ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_CL );
        // Assign unique_id
        ikernels[i]->unique_id = kernel->unique_id;
    }

    memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) );
    attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
    attr.is_const = FALSE;
    attr.vtl = TRUE;
    attr.size[0] = ((new_shape[0] + 15) / 16) * 4;
    attr.size[1] = group_num;
    attr.size[2] = 1;
    attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
    attr.dim_num = 4;
    tensors[SUM_SQR_INDEX] = vsi_nn_CreateTensor( graph, &attr );

    attr.size[0] = 4;
    tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr );

    in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
    if (in0_dtype == F16)
    {
        in0_dtype = F32;
    }
    if (out_dtype == F16)
    {
        out_dtype = F32;
    }

    hashkeys[SUM_SQR_INDEX]= HASH_GROUPNORM_SUM_SQR_KEY( in0_dtype, F32, is2D_flg );
    hashkeys[MEAN_VARI_INDEX]= HASH_GROUPNORM_MEAN_VARI_KEY( F32, F32 );
    hashkey = HASH_GROUPNORM_KEY( in0_dtype, out_dtype, is2D_flg );

    status = _query_kernel( ikernels[SUM_SQR_INDEX], hashkeys[SUM_SQR_INDEX], INTERNAL_KERNEL_SUM_SQR );
    if ( VSI_SUCCESS != status )
    {
        goto final;
    }
    status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI );
    if ( VSI_SUCCESS != status )
    {
        goto final;
    }
    status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_NORM );
    if ( VSI_SUCCESS != status )
    {
        goto final;
    }

    // Sum Sqr
    tmp_node = vsi_nn_kernel_create_node( graph, ikernels[SUM_SQR_INDEX] );
    if (tmp_node)
    {
        uint32_t index = 0;
        sum_sqr_node_params[index++] = rs_input;
        sum_sqr_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t;
        sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
        sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is2D_flg );
        sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp );
        sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
        sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
        sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );

        status = vsi_nn_kernel_node_pass_param( tmp_node, sum_sqr_node_params,
                _GROUPNORM_SUM_SQR_PARAM_NUM );
        CHECK_STATUS(status);
        vsi_nn_kernel_scalar_release( &sum_sqr_node_params[2] );
        vsi_nn_kernel_scalar_release( &sum_sqr_node_params[3] );
        vsi_nn_kernel_scalar_release( &sum_sqr_node_params[4] );
        vsi_nn_kernel_scalar_release( &sum_sqr_node_params[5] );
        vsi_nn_kernel_scalar_release( &sum_sqr_node_params[6] );
        vsi_nn_kernel_scalar_release( &sum_sqr_node_params[7] );
        vsi_nn_kernel_node_release( &tmp_node );
    }

    // mean vari
    tmp_node1 = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] );
    if (tmp_node1)
    {
        uint32_t index = 0;
        mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t;
        mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t;
        mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
        mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &group_ratio );
        mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &group_stride );

        status = vsi_nn_kernel_node_pass_param( tmp_node1, mean_vari_node_params,
                _GROUPNORM_MEAN_VARI_PARAM_NUM );
        CHECK_STATUS(status);
        vsi_nn_kernel_scalar_release( &mean_vari_node_params[2] );
        vsi_nn_kernel_scalar_release( &mean_vari_node_params[3] );
        vsi_nn_kernel_scalar_release( &mean_vari_node_params[4] );
        vsi_nn_kernel_node_release( &tmp_node1 );
    }

    // Normalization
    node = vsi_nn_kernel_create_node( graph, kernel );
    if (node)
    {
        uint32_t index = 0;
        int32_t pStride = 0;
        if (!is2D_flg)
        {
            pStride = inputs[1]->attr.size[0] / new_shape[1];
            rSpaceOrg = 1.0f / (new_shape[0] / pStride);
        }
        node_params[index++] = rs_input;
        node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t;
        node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t;
        node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t;
        node_params[index++] = rs_output;
        node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
        node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is2D_flg );
        node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp );
        node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
        node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
        node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
        node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rSpaceOrg );
        node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
        node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
        node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pStride );

        status = vsi_nn_kernel_node_pass_param( node, node_params,
                _GROUPNORM_PARAM_NUM );
        CHECK_STATUS(status);
        vsi_nn_kernel_scalar_release( &node_params[5] );
        vsi_nn_kernel_scalar_release( &node_params[6] );
        vsi_nn_kernel_scalar_release( &node_params[7] );
        vsi_nn_kernel_scalar_release( &node_params[8] );
        vsi_nn_kernel_scalar_release( &node_params[9] );
        vsi_nn_kernel_scalar_release( &node_params[10] );
        vsi_nn_kernel_scalar_release( &node_params[11] );
        vsi_nn_kernel_scalar_release( &node_params[12] );
        vsi_nn_kernel_scalar_release( &node_params[13] );
        vsi_nn_kernel_scalar_release( &node_params[14] );
    }

    /* Pass parameters to node. */
final:
    if (rs_input)
    {
        vsi_nn_kernel_tensor_release( &rs_input );
    }
    if (rs_output)
    {
        vsi_nn_kernel_tensor_release( &rs_output );
    }
    for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )
    {
        if ( ikernels[i] )
        {
            vsi_nn_kernel_release( &ikernels[i] );
        }
        if ( tensors[i] )
        {
            vsi_nn_ReleaseTensor( &tensors[i] );
        }
    }
#undef INTERNAL_KERNEL_SIZE
#undef SUM_SQR_INDEX
#undef MEAN_VARI_INDEX
    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CL( group_norm, _setup )
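The new backend splits group normalization into three kernels, visible in the _internal_kernel_e enum above: a per-group sum and sum-of-squares reduction, a small pass that folds those into mean and variance (group_ratio is the 1/N factor), and the normalization itself with the gamma/beta tensors inputs[1]/inputs[2]. The math the pipeline implements, as an editorial sketch:

#include <math.h>

/* Editorial sketch of the three-stage group-norm math. n is the element
 * count of one group (width * height * group_size in _setup above).
 *   stage 1 (sum_sqr):   s1 = sum(x), s2 = sum(x * x)  per group
 *   stage 2 (mean_vari): mean = s1 / n, vari = s2 / n - mean * mean
 *   stage 3 (norm):      per element, below */
float group_norm_one(float x, float s1, float s2, float n,
                     float eps, float gamma, float beta)
{
    float mean = s1 / n;
    float vari = s2 / n - mean * mean;
    return (x - mean) / sqrtf(vari + eps) * gamma + beta;
}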
@ -176,19 +176,19 @@ static int32_t get_moments_output_reshape_size
|
|||
}
|
||||
sizes[3] = out_dims_num > 3 ? output_size[3] : 1;
|
||||
|
||||
if(axis_num == 1 && axis[0] == 0)
|
||||
if (axis_num == 1 && axis[0] == 0)
|
||||
{
|
||||
sizes[0] = output_size[1];
|
||||
sizes[1] = out_dims_num > 2 ? output_size[2] : 1;
|
||||
out_rs_flg = 1;
|
||||
}
|
||||
else if(axis_num == 1 && axis[0] == 1)
|
||||
else if (axis_num == 1 && axis[0] == 1)
|
||||
{
|
||||
sizes[0] = output_size[0];
|
||||
sizes[1] = out_dims_num > 2 ? output_size[2] : 1;
|
||||
out_rs_flg = 1;
|
||||
}
|
||||
else if(axis_num == 2 && axis[0] == 0 && axis[1] == 1)
|
||||
else if (axis_num == 2 && axis[0] == 0 && axis[1] == 1)
|
||||
{
|
||||
sizes[0] = out_dims_num > 2 ? output_size[2] : 1;
|
||||
out_rs_flg = 1;
|
||||
|
|
@ -240,25 +240,25 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
gpu_param.global_scale[0] = 1;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
if(axis_num == 1 && axis == 0)
|
||||
if (axis_num == 1 && axis == 0)
|
||||
{
|
||||
gpu_param.global_size[0] = gpu_align_p2((height + gpu_param.global_scale[0] - 1)
|
||||
/ gpu_param.global_scale[0], 4);
|
||||
gpu_param.global_size[1] = chn;
|
||||
}
|
||||
else if(axis_num == 1 && axis == 1)
|
||||
else if (axis_num == 1 && axis == 1)
|
||||
{
|
||||
gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1)
|
||||
/ gpu_param.global_scale[0], 4);
|
||||
gpu_param.global_size[1] = chn;
|
||||
}
|
||||
else if(axis_num == 1 && axis == 2)
|
||||
else if (axis_num == 1 && axis == 2)
|
||||
{
|
||||
gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1)
|
||||
/ gpu_param.global_scale[0], 4);
|
||||
gpu_param.global_size[1] = height;
|
||||
}
|
||||
else if(axis_num == 2)
|
||||
else if (axis_num == 2)
|
||||
{
|
||||
gpu_param.local_size[0] = 16;
|
||||
gpu_param.local_size[1] = 1;
|
||||
|
|
@ -266,7 +266,7 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
gpu_param.global_size[0] = 16;
|
||||
gpu_param.global_size[1] = chn;
|
||||
}
|
||||
else if(axis_num == 3)
|
||||
else if (axis_num == 3)
|
||||
{
|
||||
gpu_param.local_size[0] = 16;
|
||||
gpu_param.local_size[1] = 1;
|
||||
|
|
@ -315,13 +315,13 @@ static vsi_status _query_kernel
|
|||
|
||||
    for( i = 0; i < _cnt_of_array(moments_map); i ++ )
    {
        if( moments_map[i].key == key )
        if ( moments_map[i].key == key )
        {
            break;
        }
    }

    if( i < _cnt_of_array(moments_map) )
    if ( i < _cnt_of_array(moments_map) )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", moments_map[i].function_name );
        kernel->info.parameters = _moments_kernel_param_def;

@ -354,6 +354,7 @@ static vsi_nn_kernel_node_t _setup
    vsi_nn_kernel_node_param_t node_params[_MOMENTS_PARAM_NUM] = { NULL };
    vsi_nn_kernel_node_t node = NULL;
    int32_t out_shape[VSI_NN_MAX_DIM_NUM] = {0};
    int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
    int32_t out_rs_flg = 0;
    int32_t axis_num = 0;
    size_t axis_num_temp = 0;

@ -362,6 +363,7 @@ static vsi_nn_kernel_node_t _setup
    int32_t first_axis = axis[0];
    int32_t i = 0;
    vsi_nn_kernel_scalar_t scalar_list[INTERNAL_MOMENTS_SCALAR_NUM] = {NULL};
    vsi_nn_kernel_tensor_t reshape_tensors[3] = { NULL };

    int32_t width = inputs[0]->attr.size[0];
    int32_t height = inputs[0]->attr.size[1];

@ -372,7 +374,7 @@ static vsi_nn_kernel_node_t _setup
    axis_num = (int32_t)axis_num_temp;

    if(inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
    if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
    {
        if (inputs[0]->attr.dtype.fl > 0)
        {

@ -385,38 +387,52 @@ static vsi_nn_kernel_node_t _setup
        input_zp = 0;
    }

    if(axis_num == 1 && axis[0] == 0)
    if (axis_num == 1 && axis[0] == 0)
    {
        dim_ratio = (float)1.0 / (float)(width);
    }
    else if(axis_num == 1 && axis[0] == 1)
    else if (axis_num == 1 && axis[0] == 1)
    {
        dim_ratio = (float)1.0 / (float)(height);
    }
    else if(axis_num == 1 && axis[0] == 2)
    else if (axis_num == 1 && axis[0] == 2)
    {
        dim_ratio = (float)1.0 / (float)(chn);
    }
    else if(axis_num == 2 && axis[0] == 0 && axis[1] == 1)
    else if (axis_num == 2 && axis[0] == 0 && axis[1] == 1)
    {
        dim_ratio = (float)1.0 / (float)(width * height);
    }
    else if(axis_num == 3)
    else if (axis_num == 3)
    {
        dim_ratio = (float)1.0 / (float)(width * height * chn);
    }

    if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
    if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
        outputs[0]->attr.dim_num ) )
    {
        return NULL;
    }

    if(keep_dim)
    if (keep_dim)
    {
        out_rs_flg = get_moments_output_reshape_size(&outputs[0], out_shape, axis, axis_num);
    }

    if (inputs[0]->attr.dim_num < 2)
    {
        shape[0] = inputs[0]->attr.size[0];
        shape[1] = 1;
        reshape_tensors[0] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 2 );
    }
    if (outputs[0]->attr.dim_num < 2)
    {
        shape[0] = outputs[0]->attr.size[0];
        shape[1] = 1;
        reshape_tensors[1] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 2 );
        reshape_tensors[2] = vsi_nn_kernel_tensor_reshape( outputs[1]->t, shape, 2 );
    }

    scalar_list[AXIS] = vsi_nn_kernel_scalar_create( graph, I32, &first_axis );
    scalar_list[AXIS_NUM] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num );
    scalar_list[ZP] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp );

@ -427,19 +443,31 @@ static vsi_nn_kernel_node_t _setup
    scalar_list[DIMRATIO] = vsi_nn_kernel_scalar_create( graph, F32, &dim_ratio );

    status = _query_kernel( inputs, outputs, kernel, params, axis, axis_num, 0 );
    if( VSI_SUCCESS == status)
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if( node )
        if ( node )
        {
            uint32_t index = 0;
            /* Pass parameters to node. */
            node_params[index++] = (vsi_nn_kernel_node_param_t)(inputs[0]->t);
            if(out_rs_flg)
            if (reshape_tensors[0])
            {
                node_params[index++] = reshape_tensors[0];
            }
            else
            {
                node_params[index++] = (vsi_nn_kernel_node_param_t)(inputs[0]->t);
            }
            if (out_rs_flg)
            {
                node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, out_shape, 4 );
                node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[1]->t, out_shape, 4 );
            }
            else if (reshape_tensors[1])
            {
                node_params[index++] = reshape_tensors[1];
                node_params[index++] = reshape_tensors[2];
            }
            else
            {
                node_params[index++] = (vsi_nn_kernel_node_param_t)(outputs[0]->t);

@ -455,7 +483,7 @@ static vsi_nn_kernel_node_t _setup
            node_params[index++] = scalar_list[DIMRATIO];
            status = vsi_nn_kernel_node_pass_param( node, node_params, _MOMENTS_PARAM_NUM );
            CHECK_STATUS(status);
            if(out_rs_flg)
            if (out_rs_flg)
            {
                vsi_nn_kernel_tensor_release( &node_params[1] );
                vsi_nn_kernel_tensor_release( &node_params[2] );

@ -465,10 +493,22 @@ static vsi_nn_kernel_node_t _setup
        }
    }

    if (reshape_tensors[0])
    {
        vsi_nn_kernel_tensor_release( &reshape_tensors[0] );
    }
    if (reshape_tensors[1])
    {
        vsi_nn_kernel_tensor_release( &reshape_tensors[1] );
    }
    if (reshape_tensors[2])
    {
        vsi_nn_kernel_tensor_release( &reshape_tensors[2] );
    }
    /* Pass parameters to node. */
    for( i = 0; i < INTERNAL_MOMENTS_SCALAR_NUM; i ++ )
    {
        if(scalar_list[i])
        if (scalar_list[i])
        {
            vsi_nn_kernel_scalar_release( &scalar_list[i] );
        }
    }
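Every dim_ratio branch in the moments hunk above is the reciprocal of the element count folded into each mean and variance, with one hard-coded branch per supported axis combination. A minimal sketch of that rule, assuming a hypothetical helper name and the innermost-first sizes used elsewhere in ovxlib:

#include <stdint.h>
#include <stdio.h>

static float moments_dim_ratio( const int32_t * sizes,
                                const int32_t * axis, int32_t axis_num )
{
    int64_t count = 1;
    int32_t i;
    for( i = 0; i < axis_num; i++ )
    {
        count *= sizes[axis[i]];    /* elements folded into each mean */
    }
    return 1.0f / (float)count;
}

int main( void )
{
    int32_t sizes[3] = { 8, 4, 2 };     /* width, height, channels */
    int32_t axis_wh[2] = { 0, 1 };
    printf( "%g\n", moments_dim_ratio( sizes, axis_wh, 2 ) );  /* 1/32 */
    return 0;
}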
@ -0,0 +1,332 @@
/****************************************************************************
 *
 * Copyright (c) 2020 Vivante Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
typedef enum
{
    INTERNAL_KERNEL_ONE_HOT,
} _internal_kernel_e;

#define _ONE_HOT_KERNEL_SOURCE "one_hot"

// Add kernel hashtable here
#define HASH_ONE_HOT_SH_KERNEL_NAME(SRC_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("cl.one_hot_"#SRC_TYPE"to"#DST_TYPE)

#define ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
    (( IN_DTYPE << 8 ) | ( OUT_DTYPE ))

#define PACK_ONE_HOT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
    { ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
      HASH_ONE_HOT_SH_KERNEL_NAME( IN_DTYPE, OUT_DTYPE ), \
      _ONE_HOT_KERNEL_SOURCE }

typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _one_hot_kernel_map[] =
{
    // Register kernel here
    PACK_ONE_HOT_KERNEL_MAP( F32, F32 ),
    PACK_ONE_HOT_KERNEL_MAP( I32, I32 ),
    PACK_ONE_HOT_KERNEL_MAP( I32, F32 ),
    PACK_ONE_HOT_KERNEL_MAP( I32, U8 ),
    PACK_ONE_HOT_KERNEL_MAP( U8, U8 ),
};

/*
 * Kernel params
 */
static vx_param_description_t _one_hot_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define SCALAR_INPUT_DEPTH       (2)
#define SCALAR_INPUT_ON_VALUE    (3)
#define SCALAR_INPUT_OFF_VALUE   (4)
#define SCALAR_INPUT_SCALE       (5)
#define SCALAR_INPUT_TAIL        (6)
#define _ONE_HOT_PARAM_NUM  _cnt_of_array( _one_hot_kernel_param_def )

/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_one_hot_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    gpu_param_t gpu_param = {
        2,          // workdim
        {0, 0, 0},  // globalWorkOffset: controls the start location to be processed in the image
        {0, 0, 0},  // globalWorkScale: how many pixels can be processed by a single thread
        {0, 0, 0},  // localWorkSize: local group size in thread
        {0, 0, 0}   // globalWorkSize: image size in thread
        };

    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
    vsi_int_array_t * in_shape = NULL;

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

    in_shape = attr[0]->shape;

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_size[0] = gpu_align_p2(
            (in_shape->data[0] + gpu_param.global_scale[0] - 1)
            / gpu_param.global_scale[0], 4);
    gpu_param.global_size[1] = in_shape->data[1];

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );

final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
    SAFE_FREE_TENSOR_ATTR(attr[0]);
    SAFE_FREE_TENSOR_ATTR(attr[1]);
#undef SAFE_FREE_TENSOR_ATTR

    return status;
} /* _one_hot_initializer() */

/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs
    /* Add extra params */
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e in_dtype;
    vsi_nn_kernel_dtype_e out_dtype;
    const _kernel_map_type * kernel_map = _one_hot_kernel_map;
    size_t kernel_map_size = _cnt_of_array( _one_hot_kernel_map );
    vx_param_description_t * param_def = _one_hot_kernel_param_def;
    vx_kernel_initialize_f initializer = _one_hot_initializer;

    uint32_t key;
    uint32_t i;

    in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    if (in_dtype == F16)
    {
        in_dtype = F32;
    }

    if (out_dtype == F16)
    {
        out_dtype = F32;
    }
    else if (out_dtype == I16 || out_dtype == I8)
    {
        out_dtype = I32;
    }

    key = ONE_HOT_HASH_KEY( in_dtype, out_dtype );

    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
    {
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }
    if ( i < (uint32_t)kernel_map_size )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = param_def;
        kernel->info.numParams = _cnt_of_array( _one_hot_kernel_param_def );
        kernel->info.initialize = initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
                kernel_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
                kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }
    return status;
} /* _query_kernel() */

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_ONE_HOT_PARAM_NUM] = {NULL};
    vsi_nn_kernel_node_t node = NULL;
    vsi_nn_tensor_t* rs_tensors[2] = { NULL };
    int32_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }};
    int32_t i = 0;
    int32_t num_elements = vsi_nn_vxGetTensorElementNum(&inputs[0]->attr);
    int32_t prefix_dim_size = 1;
    int32_t suffix_dim_size = 0;
    int32_t depth = vsi_nn_kernel_param_get_int32( params, "depth" );
    vsi_nn_kernel_dtype_e out_dtype;
    uint32_t data[2] = {0};
    float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" );
    float off_value = vsi_nn_kernel_param_get_float32( params, "off_value" );
    int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
    float inputScale = inputs[0]->attr.dtype.scale;
    float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale;

    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    if (out_dtype != F32 && out_dtype != F16)
    {
        vsi_nn_Float32ToDtype(on_value, (uint8_t*)&data[0], &outputs[0]->attr.dtype);
        vsi_nn_Float32ToDtype(off_value, (uint8_t*)&data[1], &outputs[0]->attr.dtype);
    }
    else
    {
        data[0] = *(uint32_t*)&on_value;
        data[1] = *(uint32_t*)&off_value;
    }

    axis = axis == -1 ? (int32_t)inputs[0]->attr.dim_num : (int32_t)inputs[0]->attr.dim_num - axis;
    for (i = 0; i < axis; i++)
    {
        prefix_dim_size *= inputs[0]->attr.size[i];
    }

    suffix_dim_size = num_elements / prefix_dim_size;

    shape[0][0] = suffix_dim_size;
    shape[0][1] = prefix_dim_size;
    shape[1][0] = suffix_dim_size;
    shape[1][1] = depth;
    shape[1][2] = prefix_dim_size;

    rs_tensors[0] = vsi_nn_reshape_tensor( graph,
            inputs[0], (uint32_t*)shape[0], 2 );
    rs_tensors[1] = vsi_nn_reshape_tensor( graph,
            outputs[0], (uint32_t*)shape[1], 3 );

    if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[1]->attr.size,
        rs_tensors[1]->attr.dim_num ) )
    {
        return NULL;
    }

    status = _query_kernel( kernel, inputs, outputs );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _ONE_HOT_PARAM_NUM,
                    &rs_tensors[0], input_num, &rs_tensors[1], output_num );
            node_params[SCALAR_INPUT_DEPTH] = vsi_nn_kernel_scalar_create(
                    graph, I32, &depth );
            node_params[SCALAR_INPUT_ON_VALUE] = vsi_nn_kernel_scalar_create(
                    graph, U32, &data[0] );
            node_params[SCALAR_INPUT_OFF_VALUE] = vsi_nn_kernel_scalar_create(
                    graph, U32, &data[1] );
            node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create(
                    graph, F32, &inputScale );
            node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(
                    graph, F32, &inputTail );
            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _ONE_HOT_PARAM_NUM );
            CHECK_STATUS_FAIL_GOTO( status, final );
        }
    }

final:
    if (rs_tensors[0])
    {
        vsi_nn_ReleaseTensor( &rs_tensors[0] );
    }

    if (rs_tensors[1])
    {
        vsi_nn_ReleaseTensor( &rs_tensors[1] );
    }

    for (i = SCALAR_INPUT_DEPTH; i < _ONE_HOT_PARAM_NUM; i++)
    {
        if (node_params[i])
        {
            vsi_nn_kernel_scalar_release( &node_params[i] );
        }
    }

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CL( one_hot, _setup )
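The one_hot _setup above collapses an arbitrary-rank input into a 2D view [suffix, prefix] and a 3D output [suffix, depth, prefix], where prefix is the product of the sizes below the (flipped) axis. A small self-contained sketch of that arithmetic, with made-up example numbers; the innermost-first size order is as elsewhere in ovxlib:

#include <stdio.h>
#include <stdint.h>

int main( void )
{
    /* hypothetical example: rank-3 input, 24 elements, default axis */
    int32_t size[3] = { 4, 3, 2 };
    int32_t dim_num = 3, axis = -1, depth = 5;
    int32_t num_elements = 4 * 3 * 2;
    int32_t prefix = 1, suffix, i;

    axis = (axis == -1) ? dim_num : dim_num - axis;
    for( i = 0; i < axis; i++ ) prefix *= size[i];
    suffix = num_elements / prefix;

    /* axis == -1 folds everything into prefix: in [1, 24], out [1, 5, 24] */
    printf( "in [%d, %d], out [%d, %d, %d]\n",
            suffix, prefix, suffix, depth, prefix );
    return 0;
}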
@ -178,11 +178,19 @@ static vsi_status _query_kernel
    {
        in_dtype = F32;
    }
    else if (I16 == in_dtype && inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE)
    {
        in_dtype = I32;
    }

    if (F16 == out_dtype)
    {
        out_dtype = F32;
    }
    else if (I16 == out_dtype && outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE)
    {
        out_dtype = I32;
    }

    key = HASH_REDUCEMAX_HASH_KEY( axis, in_dtype, out_dtype, image_2d );
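This reducemax hunk widens the lookup dtypes so fewer CL kernel variants need to exist: half floats reuse the F32 kernels, and unquantized I16 reuses the I32 ones. A compact sketch of the folding rule, using stand-in enum values rather than the real vsi_nn_kernel_dtype_e ids:

#include <stdio.h>

/* stand-ins for vsi_nn_kernel_dtype_e / quantization-type values */
enum { DT_F16, DT_F32, DT_I16, DT_I32 };
enum { QNT_NONE, QNT_DFP, QNT_AFFINE };

static int fold_cl_dtype( int dtype, int qnt_type )
{
    if( dtype == DT_F16 )                          return DT_F32;  /* half runs on F32 kernels  */
    if( dtype == DT_I16 && qnt_type == QNT_NONE )  return DT_I32;  /* unquantized I16 runs on I32 */
    return dtype;
}

int main( void )
{
    printf( "%d %d\n", fold_cl_dtype( DT_F16, QNT_NONE ),
                       fold_cl_dtype( DT_I16, QNT_DFP ) );  /* prints: 1 2 */
    return 0;
}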
@ -0,0 +1,407 @@
/****************************************************************************
 *
 * Copyright (c) 2019 Vivante Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
#define KERNEL_SOURCE_1    "repeat"

// Add kernel hashtable here

#define HASH_REPEAT_KERNEL_NAME(SRC0_TYPE, AXIS) \
    CVIVANTE_NAMESPACE("cl.repeat_"#SRC0_TYPE"_axis"#AXIS)

#define HASH_REPEAT_KERNEL_1D_NAME(SRC0_TYPE) \
    CVIVANTE_NAMESPACE("cl.repeat_"#SRC0_TYPE"_1D")

// Add kernel hashtable here
#define HASH_REPEAT_KEY(_input0_type, _output_type, _is1d, _axis) \
    ((_input0_type << 24) | (_output_type << 16) | (_is1d << 8) | _axis)

#define TENSOR_REPEAT_KERNELS(IN0_TYPE, OUT_TYPE, AXIS, SOURCE) \
    { HASH_REPEAT_KEY(IN0_TYPE, OUT_TYPE, 0, AXIS), \
      HASH_REPEAT_KERNEL_NAME(IN0_TYPE, AXIS), \
      SOURCE },

#define TENSOR_REPEAT_1D_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_REPEAT_KEY(IN0_TYPE, OUT_TYPE, 1, 0), \
      HASH_REPEAT_KERNEL_1D_NAME(IN0_TYPE), \
      SOURCE },

typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _repeat_kernel_map[] =
{
    // Register kernel here
    TENSOR_REPEAT_KERNELS( I32, I32, 0, KERNEL_SOURCE_1 )
    TENSOR_REPEAT_KERNELS( I32, I32, 1, KERNEL_SOURCE_1 )
    TENSOR_REPEAT_KERNELS( I32, I32, 2, KERNEL_SOURCE_1 )
    TENSOR_REPEAT_KERNELS( F32, F32, 0, KERNEL_SOURCE_1 )
    TENSOR_REPEAT_KERNELS( F32, F32, 1, KERNEL_SOURCE_1 )
    TENSOR_REPEAT_KERNELS( F32, F32, 2, KERNEL_SOURCE_1 )

    TENSOR_REPEAT_1D_KERNELS( I32, I32, KERNEL_SOURCE_1 )
    TENSOR_REPEAT_1D_KERNELS( F32, F32, KERNEL_SOURCE_1 )
};

/*
 * Kernel params
 */
static vx_param_description_t _repeat_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _REPEAT_PARAM_NUM  _cnt_of_array( _repeat_kernel_param_def )

/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_repeat_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };

    vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
    vsi_int_array_t * input_shape = NULL;
    int32_t height = 0, width = 0, chn = 0;
    int32_t is1d = 0;
    int32_t axis = 0;

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &axis);
    CHECK_STATUS_FAIL_GOTO(status, final );

    input_shape = attr[0]->shape;
    width = input_shape->data[0];
    height = input_shape->data[1];
    if (height == 1 && input_shape->size == 2)
    {
        is1d = 1;
    }
    chn = input_shape->size > 2 ? input_shape->data[2] : 1;

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;
    gpu_param.global_size[0] = width;
    gpu_param.global_size[1] = height;
    gpu_param.global_size[2] = chn;
    if (is1d || axis == 1)
    {
        gpu_param.global_size[0] = 1;
    }
    else if (axis == 0)
    {
        gpu_param.global_size[1] = 1;
    }
    else if (axis == 2)
    {
        gpu_param.global_size[2] = 1;
    }

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );
    CHECK_STATUS_FAIL_GOTO(status, final);

final:
    if (attr[0])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
        attr[0] = NULL;
    }
    return status;
} /* _repeat_initializer() */

/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs,
    int32_t axis
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e input0_dtype = U8;
    vsi_nn_kernel_dtype_e output_dtype = U8;
    int32_t is1d = inputs[0]->attr.dim_num == 1 ? 1 : 0;
    uint32_t key = 0;
    int i = 0;

    input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    if (input0_dtype == F16)
    {
        input0_dtype = F32;
    }
    if (output_dtype == F16)
    {
        output_dtype = F32;
    }

    key = HASH_REPEAT_KEY( input0_dtype, output_dtype, is1d, axis );

    for( i = 0; i < _cnt_of_array(_repeat_kernel_map); i ++ )
    {
        if ( _repeat_kernel_map[i].key == key )
        {
            break;
        }
    }

    if ( i < _cnt_of_array(_repeat_kernel_map) )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _repeat_kernel_map[i].function_name );
        kernel->info.parameters = _repeat_kernel_param_def;
        kernel->info.numParams = _REPEAT_PARAM_NUM;
        kernel->info.initialize = _repeat_initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
                "eltwise_ops_helper",
                _repeat_kernel_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
                _repeat_kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }
    return status;
} /* _query_kernel() */

static int32_t _optimize_repeat_shape
    (
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs,
    int32_t* axis,
    int32_t* opt_shape_in,
    int32_t* opt_shape_out,
    int32_t* new_rank
    )
{
    vsi_status status = VSI_SUCCESS;

    if (inputs[0]->attr.dim_num == 1)
    {
        opt_shape_in[0] = inputs[0]->attr.size[0];
        opt_shape_in[1] = 1;
        opt_shape_out[0] = outputs[0]->attr.size[0];
        opt_shape_out[1] = 1;
        new_rank[0] = 2;
        new_rank[1] = 2;
    }
    else if (axis[0] == 3)
    {
        vsi_nn_kernel_optimize_element_shape( (int32_t*)inputs[0]->attr.size, 3, opt_shape_in, new_rank );
        if (opt_shape_in[1] == 1)
        {
            opt_shape_in[1] = inputs[0]->attr.size[3];
            opt_shape_out[0] = opt_shape_in[0];
            opt_shape_out[1] = outputs[0]->attr.size[3];
            axis[0] = 0;
            new_rank[0] = 2;
            new_rank[1] = 2;
        }
        else if (new_rank[0] == 2)
        {
            opt_shape_in[2] = inputs[0]->attr.size[3];
            opt_shape_out[0] = opt_shape_in[0];
            opt_shape_out[1] = opt_shape_in[1];
            opt_shape_out[2] = outputs[0]->attr.size[3];
            axis[0] = 2;
            new_rank[0] = 3;
            new_rank[1] = 3;
        }
        else
        {
            status = VSI_FAILURE;
        }
    }

    return status;
}

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_REPEAT_PARAM_NUM] = { NULL };
    vsi_nn_kernel_node_t node = NULL;
    vsi_nn_kernel_tensor_t rs_input = NULL, rs_input1 = NULL, rs_output = NULL;
    int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }};
    int32_t new_rank[2] = {0, 0};
    int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );

    int32_t width = inputs[0]->attr.size[0];
    int32_t height = inputs[0]->attr.dim_num > 1 ? inputs[0]->attr.size[1] : 1;
    int32_t channel = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1;

    if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
        outputs[0]->attr.dim_num ) )
    {
        return NULL;
    }

    if (axis > 2 || outputs[0]->attr.dim_num == 1)
    {
        status = _optimize_repeat_shape(inputs, outputs, &axis, new_shape[0], new_shape[1], new_rank);
        if ( VSI_SUCCESS != status )
        {
            goto final;
        }
        rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], new_rank[0]);
        rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], new_rank[1]);

        width = new_shape[0][0];
        height = new_shape[0][1];
        channel = new_rank[0] > 2 ? new_shape[0][2] : 1;
    }

    if (inputs[1]->attr.dim_num == 1)
    {
        new_shape[0][0] = inputs[1]->attr.size[0];
        new_shape[0][1] = 1;
        rs_input1 = vsi_nn_kernel_tensor_reshape(inputs[1]->t, new_shape[0], 2);
    }

    status = _query_kernel( kernel, inputs, outputs, axis );
    if ( VSI_SUCCESS != status )
    {
        goto final;
    }

    node = vsi_nn_kernel_create_node( graph, kernel );
    if (node)
    {
        uint32_t index = 0;
        if (rs_input)
        {
            node_params[index++] = rs_input;
        }
        else
        {
            node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t;
        }
        if (rs_input1)
        {
            node_params[index++] = rs_input1;
        }
        else
        {
            node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t;
        }
        if (rs_output)
        {
            node_params[index++] = rs_output;
        }
        else
        {
            node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t;
        }
        node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
        node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
        node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &channel );
        node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis );

        status = vsi_nn_kernel_node_pass_param( node, node_params,
                _REPEAT_PARAM_NUM );
        CHECK_STATUS(status);
        vsi_nn_kernel_scalar_release( &node_params[3] );
        vsi_nn_kernel_scalar_release( &node_params[4] );
        vsi_nn_kernel_scalar_release( &node_params[5] );
        vsi_nn_kernel_scalar_release( &node_params[6] );
    }

    /* Pass parameters to node. */
final:
    if (rs_input)
    {
        vsi_nn_kernel_tensor_release( &rs_input );
    }
    if (rs_input1)
    {
        vsi_nn_kernel_tensor_release( &rs_input1 );
    }
    if (rs_output)
    {
        vsi_nn_kernel_tensor_release( &rs_output );
    }
    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CL( repeat, _setup )
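HASH_REPEAT_KEY above packs the whole kernel selector into one 32-bit key: input dtype in bits 24-31, output dtype in bits 16-23, the 1D flag in bits 8-15, and the repeat axis in bits 0-7. A self-contained check of that layout (the dtype ids here are made-up placeholders, not the real enum values):

#include <stdint.h>
#include <assert.h>

#define HASH_REPEAT_KEY(in0, out, is1d, axis) \
    (((in0) << 24) | ((out) << 16) | ((is1d) << 8) | (axis))

int main( void )
{
    uint32_t key = HASH_REPEAT_KEY( 3u, 3u, 0u, 2u );  /* placeholder dtype ids */
    assert( ((key >> 24) & 0xFF) == 3u );  /* input dtype  */
    assert( ((key >> 16) & 0xFF) == 3u );  /* output dtype */
    assert( ((key >>  8) & 0xFF) == 0u );  /* not 1D       */
    assert( ( key        & 0xFF) == 2u );  /* axis 2       */
    return 0;
}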
@ -0,0 +1,354 @@
/****************************************************************************
 *
 * Copyright (c) 2020 Vivante Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "math.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
#define KERNEL_SOURCE_1    "sequence_mask"

#define HASH_SEQUENCE_MASK_KEY(_input0_type, _output_type, _image_2d) \
    ((_input0_type << 24) | (_output_type << 8) | (_image_2d))

#define HASH_SEQUENCE_MASK_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("cl.sequence_mask_"#SRC0_TYPE"to"#DST_TYPE)

#define HASH_SEQUENCE_MASK_SH_2DKERNEL_NAME(SRC0_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("cl.sequence_mask_"#SRC0_TYPE"to"#DST_TYPE"_2D")

#define TENSOR_SEQUENCE_MASK_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_SEQUENCE_MASK_KEY(IN0_TYPE, OUT_TYPE, 0), \
      HASH_SEQUENCE_MASK_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
      SOURCE },

#define TENSOR_SEQUENCE_MASK_2D_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_SEQUENCE_MASK_KEY(IN0_TYPE, OUT_TYPE, 1), \
      HASH_SEQUENCE_MASK_SH_2DKERNEL_NAME(IN0_TYPE, OUT_TYPE), \
      SOURCE },

static const struct {
    uint32_t key;
    char* function_name;
    const char* source_name;
} kernel_map[] =
{
    TENSOR_SEQUENCE_MASK_KERNELS(I32, U8, KERNEL_SOURCE_1)
    TENSOR_SEQUENCE_MASK_KERNELS(I32, I32, KERNEL_SOURCE_1)
    TENSOR_SEQUENCE_MASK_KERNELS(I32, F32, KERNEL_SOURCE_1)
    TENSOR_SEQUENCE_MASK_2D_KERNELS(I32, U8, KERNEL_SOURCE_1)
    TENSOR_SEQUENCE_MASK_2D_KERNELS(I32, I32, KERNEL_SOURCE_1)
    TENSOR_SEQUENCE_MASK_2D_KERNELS(I32, F32, KERNEL_SOURCE_1)
};

/*
 * Kernel params
 */
static vx_param_description_t kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};

#define _CL_PARAM_NUM  _cnt_of_array(kernel_param_def)

/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_sequence_mask_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };

    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
    vsi_int_array_t * out_shape = NULL;

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );

    out_shape = attr[0]->shape;

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;

    gpu_param.global_size[0] = gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1)
            / gpu_param.global_scale[0], 4);
    gpu_param.global_size[1] = (out_shape->data[1] + gpu_param.global_scale[1] - 1)
            / gpu_param.global_scale[1];
    gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;

    status = vsi_nn_kernel_gpu_config( node, &gpu_param );

final:
    if (attr[0])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[0] );
        attr[0] = NULL;
    }

    return status;
} /* _sequence_mask_initializer() */

static vsi_status _query_kernel
    (
    vsi_nn_tensor_t* const* const inputs,
    vsi_nn_tensor_t* const* const outputs,
    vsi_nn_kernel_t* kernel,
    int32_t is2Dflg
    )
{
    vsi_nn_kernel_dtype_e input0_dtype = I32;
    vsi_nn_kernel_dtype_e output_dtype = U8;
    vsi_status status = VSI_FAILURE;
    uint32_t key = 0;
    int i = 0;

    input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
    if (output_dtype == BOOL8)
    {
        output_dtype = U8;
    }

    key = HASH_SEQUENCE_MASK_KEY( input0_dtype, output_dtype, is2Dflg );

    for( i = 0; i < _cnt_of_array(kernel_map); i ++ )
    {
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }
    if ( i < _cnt_of_array(kernel_map) )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = kernel_param_def;
        kernel->info.numParams = _cnt_of_array( kernel_param_def );
        kernel->info.initialize = _sequence_mask_initializer;
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
                "eltwise_ops_helper",
                kernel_map[i].source_name );
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
                kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }
    return status;
} /* _query_kernel() */

static int32_t _optimize_mask_shape
    (
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs,
    int32_t max_len,
    int32_t* opt_shape_in,
    int32_t* opt_shape_out,
    int32_t* is2Dflg
    )
{
    vsi_status status = VSI_SUCCESS;
    int32_t in_shape[VSI_NN_MAX_DIM_NUM] = {0};
    int32_t new_rank = 0;
    uint32_t i = 0;

    for(i = 0; i < inputs[0]->attr.dim_num; i++)
    {
        in_shape[i] = inputs[0]->attr.size[i];
    }

    vsi_nn_kernel_optimize_element_shape( in_shape, inputs[0]->attr.dim_num, opt_shape_in, &new_rank );
    if (new_rank > 2)
    {
        return VSI_FAILURE;
    }

    opt_shape_out[0] = max_len;
    for(i = 0; i < (uint32_t)new_rank; i++)
    {
        opt_shape_out[i + 1] = opt_shape_in[i];
    }
    if (opt_shape_out[2] == 1)
    {
        is2Dflg[0] = 1;
    }

    return status;
}

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL};
    vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
    int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }};
    int32_t max_len = vsi_nn_kernel_param_get_int32( params, "max_len" );
    vsi_nn_kernel_node_t node = NULL;
    int32_t is2Dflg = 0;
    float input_zp = 0;
    float input_scale = 1.0f;
    int32_t output_zp = 0;
    float output_scale = 1.0f;
    float input_zpScale = 0;
    float outputVal1 = 1.0f;
    int32_t input_fl = 0;
    int32_t output_fl = 0;

    if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
        outputs[0]->attr.dim_num ) )
    {
        return NULL;
    }

    status = _optimize_mask_shape(inputs, outputs, max_len, new_shape[0], new_shape[1], &is2Dflg);
    if ( VSI_SUCCESS != status )
    {
        goto final;
    }
    rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], 2);
    rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], 4);

    if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
    {
        input_zp = (float)inputs[0]->attr.dtype.zero_point;
        input_scale = inputs[0]->attr.dtype.scale;
    }
    else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
    {
        input_fl = inputs[0]->attr.dtype.fl;
        if (input_fl > 0)
        {
            input_scale = (1.0f / ((float) ((int64_t)1 << input_fl)));
        }
        else
        {
            input_scale = ((float) ((int64_t)1 << -input_fl));
        }
        input_zp = 0.0f;
    }

    if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
    {
        output_zp = outputs[0]->attr.dtype.zero_point;
        output_scale = 1.0f / outputs[0]->attr.dtype.scale;
    }
    else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
    {
        output_fl = outputs[0]->attr.dtype.fl;
        if (output_fl > 0)
        {
            output_scale = (float)((int64_t)1 << output_fl);
        }
        else
        {
            output_scale = (1.0f / (float)((int64_t)1 << -output_fl));
        }
        output_zp = 0;
    }
    input_zpScale = input_scale * input_zp;
    outputVal1 = output_scale + (float)output_zp;

    status = _query_kernel( inputs, outputs, kernel, is2Dflg );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );

        if ( node )
        {
            uint32_t index = 0;
            node_params[index++] = rs_input;
            node_params[index++] = rs_output;
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &max_len );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zpScale );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputVal1 );
            node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &output_zp );

            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );
            VSI_ASSERT( status == VSI_SUCCESS );
            vsi_nn_kernel_scalar_release( &node_params[2] );
            vsi_nn_kernel_scalar_release( &node_params[3] );
            vsi_nn_kernel_scalar_release( &node_params[4] );
            vsi_nn_kernel_scalar_release( &node_params[5] );
            vsi_nn_kernel_scalar_release( &node_params[6] );
        }
    }

final:
    if (rs_input)
    {
        vsi_nn_kernel_tensor_release( &rs_input );
    }
    if (rs_output)
    {
        vsi_nn_kernel_tensor_release( &rs_output );
    }
    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CL( sequence_mask, _setup )
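The sequence_mask _setup above derives the DFP (dynamic fixed point) scale from the fractional length fl: a stored value dequantizes as stored * 2^-fl, so the scale is 1/(1 << fl) when fl > 0 and (1 << -fl) otherwise. The same rule in isolation, as a minimal sketch:

#include <stdint.h>
#include <stdio.h>

static float dfp_scale( int32_t fl )
{
    if( fl > 0 )
    {
        return 1.0f / (float)( (int64_t)1 << fl );   /* e.g. fl=7  -> 1/128 */
    }
    return (float)( (int64_t)1 << -fl );             /* e.g. fl=-2 -> 4     */
}

int main( void )
{
    printf( "fl=7  -> %g\n", dfp_scale( 7 ) );   /* 0.0078125 */
    printf( "fl=-2 -> %g\n", dfp_scale( -2 ) );  /* 4 */
    return 0;
}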
@ -0,0 +1,308 @@
/****************************************************************************
 *
 * Copyright (c) 2020 Vivante Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 *****************************************************************************/

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
typedef enum
{
    INTERNAL_KERNEL_SLICE,
} _internal_kernel_e;

#define _SLICE_KERNEL_SOURCE "slice"
#define SLICE_SH_KERNEL_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
    CVIVANTE_NAMESPACE("cl.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE)

// Add kernel hashtable here
#define SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, _IMAGE_2D ) \
    (( IN1_DTYPE << 18 ) | ( IN0_DTYPE << 10 ) | ( OUT_DTYPE << 2 ) | (_IMAGE_2D))

#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \
    { SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 ), \
      SLICE_SH_KERNEL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE }

#define SLICE_SH_KERNEL_2D_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
    CVIVANTE_NAMESPACE("cl.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_2D")

#define PACK_KERNEL_MAP_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \
    { SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 ), \
      SLICE_SH_KERNEL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE }

typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _slice_kernel_map[] =
{
    // Register kernel here
    PACK_KERNEL_MAP( F32, I32, F32, _SLICE_KERNEL_SOURCE ),
    PACK_KERNEL_MAP( I32, I32, I32, _SLICE_KERNEL_SOURCE ),
    PACK_KERNEL_MAP( U8, I32, U8, _SLICE_KERNEL_SOURCE ),

    PACK_KERNEL_MAP_2D( F32, I32, F32, _SLICE_KERNEL_SOURCE ),
    PACK_KERNEL_MAP_2D( I32, I32, I32, _SLICE_KERNEL_SOURCE ),
    PACK_KERNEL_MAP_2D( U8, I32, U8, _SLICE_KERNEL_SOURCE ),
};

#define _INPUT_NUM          (2)
#define _OUTPUT_NUM         (1)
#define _IO_NUM             (_INPUT_NUM + _OUTPUT_NUM)

/*
 * Kernel params
 */
static vx_param_description_t _slice_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _SLICE_PARAM_NUM  _cnt_of_array( _slice_kernel_param_def )
#define SCALAR_INPUT_SCALE          (3)
#define SCALAR_INPUT_TAIL           (4)
#define SCALAR_OUTPUT_SCALE         (5)
#define SCALAR_OUTPUT_ZP            (6)
/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_slice_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };
    vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
    vsi_int_array_t * out_shape = NULL;

    output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
    CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );

    out_shape = output_attr->shape;

    gpu_param.global_scale[0] = 1;
    gpu_param.global_scale[1] = 1;
    gpu_param.global_scale[2] = 1;

    gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3;
    gpu_param.global_size[0] = gpu_align_p2(
            (out_shape->data[0] + gpu_param.global_scale[0] - 1)
            / gpu_param.global_scale[0], 4);
    gpu_param.global_size[1] = (
            (out_shape->data[1] + gpu_param.global_scale[1] - 1)
            / gpu_param.global_scale[1]);
    gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
    status = vsi_nn_kernel_gpu_config( node, &gpu_param );

final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
    SAFE_FREE_TENSOR_ATTR(output_attr);
    return status;
} /* _slice_initializer() */

/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs,
    vsi_bool image_2d
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e in0_dtype;
    vsi_nn_kernel_dtype_e in1_dtype;
    vsi_nn_kernel_dtype_e out_dtype;
    const _kernel_map_type * kernel_map = _slice_kernel_map;
    size_t kernel_map_size = _cnt_of_array( _slice_kernel_map );
    vx_param_description_t * param_def = _slice_kernel_param_def;
    size_t param_def_size = _cnt_of_array( _slice_kernel_param_def );
    vx_kernel_initialize_f initializer = _slice_initializer;

    uint32_t key;
    uint32_t i;

    in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    if (F16 == in0_dtype)
    {
        in0_dtype = F32;
    }

    if (F16 == out_dtype)
    {
        out_dtype = F32;
    }

    key = SLICE_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d );

    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
    {
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }
    if ( i < kernel_map_size )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = param_def;
        kernel->info.numParams = (uint32_t)param_def_size;
        kernel->info.initialize = initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
                "eltwise_ops_helper",
                kernel_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
                kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }

    return status;
} /* _query_kernel() */

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_SLICE_PARAM_NUM];
    vsi_nn_kernel_node_t node = NULL;
    vsi_bool image_2d = FALSE;
    uint32_t rank[_IO_NUM] = {0};
    int32_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }};
    vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL };
    int32_t i = 0;
    int32_t input_batch = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
    int32_t output_batch = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1;
    float inputScale = inputs[0]->attr.dtype.scale;
    float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale;
    float outputScale = outputs[0]->attr.dtype.scale;
    float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f;

    outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale;

    vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num,
            shapes[0], &rank[0]);
    vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num,
            shapes[1], &rank[1]);
    vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num,
            shapes[2], &rank[2]);

    for (i = 0; i < _INPUT_NUM; i++)
    {
        reshape_tensors[i] = vsi_nn_reshape_tensor( graph,
                inputs[i], (uint32_t*)shapes[i], rank[i] );
    }
    reshape_tensors[_INPUT_NUM] = vsi_nn_reshape_tensor( graph,
            outputs[0], (uint32_t*)shapes[_INPUT_NUM], rank[_INPUT_NUM] );

    if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size,
        inputs[0]->attr.dim_num ) || input_batch != output_batch )
    {
        return NULL;
    }

    image_2d = (rank[0] < 3 || shapes[0][2] == 1);

    status = _query_kernel( kernel, inputs, outputs, image_2d );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _SLICE_PARAM_NUM,
                    reshape_tensors, input_num, &reshape_tensors[_INPUT_NUM], output_num );
            node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create(
                    graph, F32, &inputScale );
            node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(
                    graph, F32, &inputTail );
            node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create(
                    graph, F32, &outputScale );
            node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create(
                    graph, F32, &outputZP );

            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _SLICE_PARAM_NUM );
            VSI_ASSERT( status == VSI_SUCCESS );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
        }
    }
    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CL( slice, _setup )
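The scalars the slice _setup passes down suggest how the CL kernel is expected to requantize: inputTail pre-folds the input zero point into one multiply-subtract, outputScale is the reciprocal output scale (zeroed when the scale is degenerate), and the 0.5f baked into outputZP turns a truncating float-to-int conversion into round-to-nearest. A hedged sketch of that arithmetic, not taken from the kernel source itself:

#include <math.h>
#include <stdio.h>
#include <stdint.h>

static uint8_t requantize_u8( uint8_t src,
                              float inputScale, float inputTail,
                              float outputScale, float outputZP )
{
    float real = (float)src * inputScale - inputTail;   /* dequantize          */
    float q    = real * outputScale + outputZP;         /* requantize; the     */
    return (uint8_t)floorf( q );                        /* +0.5 in ZP rounds   */
}

int main( void )
{
    /* made-up params: input zp 3, scale 0.5; output zp 10, scale 0.25 */
    float inScale = 0.5f, inTail = 3 * 0.5f;
    float outScale = 1.0f / 0.25f, outZP = 10 + 0.5f;
    printf( "%u\n", requantize_u8( 7, inScale, inTail, outScale, outZP ) );
    /* real = 7*0.5 - 1.5 = 2.0; q = 2*4 + 10.5 = 18.5; floor -> 18 */
    return 0;
}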
@ -36,7 +36,7 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

#define _CPU_ARG_NUM (1)

@ -36,7 +36,7 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS
@ -0,0 +1,279 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _INPUT_NUM (4)
|
||||
#define _OUTPUT_NUM (1)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.axis_aligned_bbox_transform")
|
||||
|
||||
typedef struct vsi_nn_box_encoding_corner_t
|
||||
{
|
||||
float x1, y1, x2, y2;
|
||||
}vsi_nn_box_encoding_corner;
|
||||
|
||||
typedef struct vsi_nn_box_encoding_center_t
|
||||
{
|
||||
float w, h, x, y;
|
||||
}vsi_nn_box_encoding_center;

/*
 * Kernel params
 */
static vx_param_description_t _axis_aligned_bbox_transform_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
};
#define _AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM _cnt_of_array( _axis_aligned_bbox_transform_kernel_param_def )

static void _to_box_encoding_corner
    (
    vsi_nn_box_encoding_center* ctr,
    vsi_nn_box_encoding_corner* cnr
    )
{
    cnr->x1 = ctr->x - ctr->w / 2;
    cnr->y1 = ctr->y - ctr->h / 2;
    cnr->x2 = ctr->x + ctr->w / 2;
    cnr->y2 = ctr->y + ctr->h / 2;
}

static void _to_box_encoding_center
    (
    vsi_nn_box_encoding_corner* cnr,
    vsi_nn_box_encoding_center* ctr
    )
{
    ctr->w = cnr->x2 - cnr->x1;
    ctr->h = cnr->y2 - cnr->y1;
    ctr->x = (cnr->x1 + cnr->x2) / 2;
    ctr->y = (cnr->y1 + cnr->y2) / 2;
}

/*
 * Kernel function
 */
DEF_KERNEL_EXECUTOR(_compute)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
    vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
    float *f32_in_buffer[_INPUT_NUM] = {NULL};
    float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
    vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
    vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
    size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
    size_t out_elements[_OUTPUT_NUM] = {0};
    size_t out_bytes[_OUTPUT_NUM] = {0};
    uint32_t i;
    const uint32_t roiLength = 4;
    const uint32_t imageLength = 2;
    uint32_t numClasses = 0;
    uint32_t numRois = 0;
    uint32_t j;
    uint32_t roiIndex;

    /* prepare data */
    for (i = 0; i < _INPUT_NUM; i ++)
    {
        input[i] = (vsi_nn_kernel_tensor_t)param[i];
        in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
        f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
        CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
    }
    for (i = 0; i < _OUTPUT_NUM; i ++)
    {
        output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
        out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
        vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
        out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
        out_bytes[i] = out_elements[i] * sizeof(float);
        f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
        CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
        memset( f32_out_buffer[i], 0, out_bytes[i] );
    }

    numClasses = in_attr[1]->shape->data[0] / roiLength;
    numRois = in_attr[0]->shape->data[1];
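
    /* Decode loop: for each ROI, look up its batch's image size, convert the
     * ROI to center form, then for every class apply the predicted deltas
     * (dx, dy, dw, dh) in the usual axis-aligned bbox regression way:
     *   w' = w * exp(dw), h' = h * exp(dh), x' = x + dx * w, y' = y + dy * h,
     * convert back to corners and clip to [0, imageWidth] x [0, imageHeight]. */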

    for (roiIndex = 0; roiIndex < numRois; roiIndex++)
    {
        uint32_t batchIndex = (uint32_t)f32_in_buffer[2][roiIndex];
        float imageHeight = f32_in_buffer[3][batchIndex * imageLength];
        float imageWidth = f32_in_buffer[3][batchIndex * imageLength + 1];
        vsi_nn_box_encoding_corner roi_cnr;
        vsi_nn_box_encoding_center roiBefore;
        roi_cnr.x1 = f32_in_buffer[0][roiIndex * roiLength];
        roi_cnr.y1 = f32_in_buffer[0][roiIndex * roiLength + 1];
        roi_cnr.x2 = f32_in_buffer[0][roiIndex * roiLength + 2];
        roi_cnr.y2 = f32_in_buffer[0][roiIndex * roiLength + 3];
        _to_box_encoding_center(&roi_cnr, &roiBefore);

        for (j = 0; j < numClasses; j++)
        {
            vsi_nn_box_encoding_center roi_ctr;
            vsi_nn_box_encoding_corner roiAfter;
            vsi_nn_box_encoding_corner cliped;
            uint32_t index = (roiIndex * numClasses + j) * roiLength;

            roi_ctr.w = (float)(exp(f32_in_buffer[1][index + 2]) * roiBefore.w);
            roi_ctr.h = (float)(exp(f32_in_buffer[1][index + 3]) * roiBefore.h);
            roi_ctr.x = roiBefore.x + f32_in_buffer[1][index] * roiBefore.w;
            roi_ctr.y = roiBefore.y + f32_in_buffer[1][index + 1] * roiBefore.h;
            _to_box_encoding_corner(&roi_ctr, &roiAfter);

            cliped.x1 = vsi_nn_min(vsi_nn_max(roiAfter.x1, 0.0f), imageWidth);
            cliped.y1 = vsi_nn_min(vsi_nn_max(roiAfter.y1, 0.0f), imageHeight);
            cliped.x2 = vsi_nn_min(vsi_nn_max(roiAfter.x2, 0.0f), imageWidth);
            cliped.y2 = vsi_nn_min(vsi_nn_max(roiAfter.y2, 0.0f), imageHeight);
            f32_out_buffer[0][index] = cliped.x1;
            f32_out_buffer[0][index + 1] = cliped.y1;
            f32_out_buffer[0][index + 2] = cliped.x2;
            f32_out_buffer[0][index + 3] = cliped.y2;
        }
    }

    /* save data */
    for(i = 0; i < _OUTPUT_NUM; i++)
    {
        status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
                f32_out_buffer[i], out_elements[i] );
        CHECK_STATUS_FAIL_GOTO( status, final );
    }

final:
    for (i = 0; i < _INPUT_NUM; i++)
    {
        if (f32_in_buffer[i])
        {
            free(f32_in_buffer[i]);
            f32_in_buffer[i] = NULL;
        }
        if (in_attr[i])
        {
            vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
        }
    }
    for (i = 0; i < _OUTPUT_NUM; i++)
    {
        if (f32_out_buffer[i])
        {
            free(f32_out_buffer[i]);
            f32_out_buffer[i] = NULL;
        }
        if (out_attr[i])
        {
            vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
        }
    }

    return status;
} /* _compute() */

/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs
    /* Add extra params */
    )
{
    vsi_status status = VSI_FAILURE;
    snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
    kernel->info.function = _compute;
    kernel->info.parameters = _axis_aligned_bbox_transform_kernel_param_def;
    kernel->info.numParams = _cnt_of_array( _axis_aligned_bbox_transform_kernel_param_def );
    status = VSI_SUCCESS;

    return status;
} /* _query_kernel() */

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM];
    vsi_nn_kernel_node_t node = NULL;

    status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
    if( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if( node )
        {
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM,
                    inputs, input_num, outputs, output_num );
            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM );
        }
    }
    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CPU( axis_aligned_bbox_transform, _setup )

@ -34,7 +34,7 @@
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"

__BEGIN_DECLS

@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -164,8 +164,8 @@ DEF_KERNEL_EXECUTOR(_comparisons_exec)
        buffer[2][i] = (float)data;
    }

    status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
            buffer[1], out_elements );
    status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2],
            buffer[2], out_elements );
    CHECK_STATUS_FAIL_GOTO( status, final );

final:

@ -0,0 +1,264 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.conv1d_ovxlib")

/*
 * Kernel params
 */
static vx_param_description_t _conv1d_ovxlib_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CONV1D_OVXLIB_PARAM_NUM _cnt_of_array( _conv1d_ovxlib_kernel_param_def )
#define _IO_COUNT (4)

/*
 * Kernel function
 */
DEF_KERNEL_EXECUTOR(_compute)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    int i = 0;
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_tensor_t tensors[_IO_COUNT] = { NULL };
    vsi_nn_kernel_tensor_attr_t* attr[_IO_COUNT] = { NULL };
    float* buffer[_IO_COUNT] = { NULL };
    int32_t stride = 0;
    int32_t pad_front = 0;
    int32_t pad_end = 0;
    int32_t dilation = 0;
    int32_t overflow_policy = 0;
    int32_t rounding_policy = 0;
    int32_t down_scale_size_rounding = 0;

    tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
    tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
    tensors[2] = (vsi_nn_kernel_tensor_t)param[2];
    tensors[3] = (vsi_nn_kernel_tensor_t)param[3];

    attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
    attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
    attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
    attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] );

    buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
    CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final );
    buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE );
    CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final );
    buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE );
    buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], TRUE );
    CHECK_PTR_FAIL_GOTO( buffer[3], "Create input buffer fail.", final );

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &stride);
    CHECK_STATUS_FAIL_GOTO(status, final);
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &pad_front);
    CHECK_STATUS_FAIL_GOTO(status, final);
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &pad_end);
    CHECK_STATUS_FAIL_GOTO(status, final);
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &dilation);
    CHECK_STATUS_FAIL_GOTO(status, final);
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &overflow_policy);
    CHECK_STATUS_FAIL_GOTO(status, final);
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &rounding_policy);
    CHECK_STATUS_FAIL_GOTO(status, final);
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &down_scale_size_rounding);
    CHECK_STATUS_FAIL_GOTO(status, final);

    {
        int32_t batch = attr[0]->shape->data[2];
        int32_t input_channel = attr[0]->shape->data[1];
        int32_t input_height = attr[0]->shape->data[0];
        int32_t kernel_size = attr[1]->shape->data[0];
        int32_t output_channel = attr[1]->shape->data[2];
        int32_t output_height = attr[3]->shape->data[0];
        int32_t batch_index = 0;
        int32_t input_channel_index = 0;
        int32_t output_channel_index = 0;
        int32_t output_h_index = 0;
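
        /* Naive direct convolution: for every (batch, output channel, output
         * position), slide a window starting at output_h_index * stride and
         * accumulate filter[k] * input[k * dilation] over all input channels
         * and taps, then add the optional per-output-channel bias (buffer[2]).
         * Note pad_front/pad_end are not applied inside this loop, so the
         * output length read from attr[3] is assumed to already account for
         * padding. */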

        for(batch_index = 0; batch_index < batch; batch_index++)
        {
            float* per_batch_input = buffer[0] + batch_index * input_channel * input_height;
            float* per_batch_output = buffer[3] + batch_index * output_channel * output_height;
            for(output_channel_index = 0; output_channel_index < output_channel; output_channel_index++)
            {
                float* filter = buffer[1] + output_channel_index * input_channel * kernel_size;
                for(output_h_index = 0; output_h_index < output_height; output_h_index++)
                {
                    float output_value = 0.;
                    float* current_value_ptr = per_batch_input + output_h_index * stride;

                    for(input_channel_index = 0; input_channel_index < input_channel; input_channel_index++)
                    {
                        int k = 0;
                        int32_t index = 0;
                        for(k = 0; k < kernel_size; k++)
                        {
                            float w = *(filter + input_channel_index * kernel_size + k);
                            float v = *(current_value_ptr + input_channel_index * input_height + index);

                            output_value += w * v;
                            index += dilation;
                        }
                    }

                    if(buffer[2])
                    {
                        output_value += buffer[2][output_channel_index];
                    }

                    *(per_batch_output + output_channel_index * output_height + output_h_index) = output_value;
                }
            }
        }
        status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3],
                buffer[3], batch * output_channel * output_height );
        CHECK_STATUS_FAIL_GOTO( status, final );
    }

final:
    for( i = 0; i < _IO_COUNT; i ++ )
    {
        if( buffer[i] )
        {
            free( buffer[i] );
        }
        vsi_nn_kernel_tensor_attr_release( &attr[i] );
    }

    return status;

} /* _compute() */

/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs
    /* Add extra params */
    )
{
    snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
    kernel->info.function = _compute;
    kernel->info.parameters = _conv1d_ovxlib_kernel_param_def;
    kernel->info.numParams = _cnt_of_array( _conv1d_ovxlib_kernel_param_def );

    return VSI_SUCCESS;
} /* _query_kernel() */

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_CONV1D_OVXLIB_PARAM_NUM];
    vsi_nn_kernel_node_t node = NULL;
    int j = 0;

    int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" );
    int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" );
    int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" );
    int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" );
    int32_t overflow_policy = vsi_nn_kernel_param_get_int32( params, "overflow_policy" );
    int32_t rounding_policy = vsi_nn_kernel_param_get_int32( params, "rounding_policy" );
    int32_t down_scale_size_rounding = vsi_nn_kernel_param_get_int32( params, "down_scale_size_rounding" );

    status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
    if( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if( node )
        {
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _CONV1D_OVXLIB_PARAM_NUM,
                    inputs, input_num, outputs, output_num );
            j = (int)(input_num + output_num);
            node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &stride );
            node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &pad_front );
            node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &pad_end );
            node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &dilation );
            node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &overflow_policy );
            node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &rounding_policy );
            node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &down_scale_size_rounding );
            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _CONV1D_OVXLIB_PARAM_NUM );
            vsi_nn_kernel_scalar_release( &node_params[--j] );
            vsi_nn_kernel_scalar_release( &node_params[--j] );
            vsi_nn_kernel_scalar_release( &node_params[--j] );
            vsi_nn_kernel_scalar_release( &node_params[--j] );
            vsi_nn_kernel_scalar_release( &node_params[--j] );
            vsi_nn_kernel_scalar_release( &node_params[--j] );
            vsi_nn_kernel_scalar_release( &node_params[--j] );
        }
    }
    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CPU( conv1d_ovxlib, _setup )

@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -46,6 +46,7 @@ typedef enum
    UNARY_NEG,
    UNARY_HSIGMOID,
    UNARY_MISH,
    UNARY_ROUND,
} unary_type_e;

@ -101,6 +102,13 @@ static float mish_eval(float data)
    return data;
}
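
/* round_eval: vsi_rtne appears to round half-to-even (IEEE 754
 * roundTiesToEven), unlike C's round(), which rounds halves away from zero. */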

static float round_eval(float data)
{
    data = (float)(vsi_rtne(data));

    return data;
}

DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
    (
    vsi_nn_kernel_node_t node,

@ -165,6 +173,9 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
            case UNARY_MISH:
                data = mish_eval(data);
                break;
            case UNARY_ROUND:
                data = round_eval(data);
                break;
            default:
                break;
            }

@ -298,3 +309,4 @@ REGISTER_ELTWISE_UNARY_BACKEND_CPU( elu, UNARY_ELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( neg, UNARY_NEG )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_sigmoid, UNARY_HSIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( mish, UNARY_MISH )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( round, UNARY_ROUND )

@ -0,0 +1,229 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _INPUT_NUM (1)
|
||||
#define _OUTPUT_NUM (1)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.erf")
|
||||
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _erf_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
    // Add kernel parameters here
};
#define _ERF_PARAM_NUM _cnt_of_array( _erf_kernel_param_def )

/*
 * Kernel function
 */
DEF_KERNEL_EXECUTOR(_compute)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
    vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
    float *f32_in_buffer[_INPUT_NUM] = {NULL};
    float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
    vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
    vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
    size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
    size_t out_elements[_OUTPUT_NUM] = {0};
    size_t out_bytes[_OUTPUT_NUM] = {0};
    size_t i = 0;

    /* prepare data */
    for (i = 0; i < _INPUT_NUM; i ++)
    {
        input[i] = (vsi_nn_kernel_tensor_t)param[i];
        in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
        f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
        CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
    }

    for (i = 0; i < _OUTPUT_NUM; i ++)
    {
        output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
        out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
        vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
        out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
        out_bytes[i] = out_elements[i] * sizeof(float);
        f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
        CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
        memset( f32_out_buffer[i], 0, out_bytes[i] );
    }
#define ERF_PI 3.141592653589793
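
    /* The loop below evaluates the Maclaurin series
     *   erf(x) = 2 / sqrt(pi) * sum_{n>=0} (-1)^n * x^(2n+1) / (n! * (2n+1)),
     * accumulating terms until the next term drops below 1e-5. `tmp` holds the
     * current term, rebuilt each iteration from the running sign (`one`),
     * factorial and power of x. */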
    for (i = 0; i < out_elements[0]; i ++)
    {
        /* erf(x) = 2 / sqrt(pi) * sum[(-1)^n * x^(2n + 1) / (n! * (2n + 1))] */
        float x = f32_in_buffer[0][i];
        float res = 0;
        float tmp = x;
        float factorial = 1; /*n!*/
        float x_pow = x;
        int32_t one = 1;
        int32_t n = 1;

        while (vsi_abs(tmp) > 1e-5)
        {
            res += tmp;

            factorial *= n;
            one *= -1;
            x_pow *= x * x;
            tmp = one / factorial * x_pow / ( 2 * n + 1);

            n ++;
        }

        res *= 2.0f / (float)sqrt(ERF_PI);

        f32_out_buffer[0][i] = res;
    }

    /* save data */
    for(i = 0; i < _OUTPUT_NUM; i++)
    {
        status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
                f32_out_buffer[i], out_elements[i] );
        CHECK_STATUS_FAIL_GOTO( status, final );
    }

final:
    for (i = 0; i < _INPUT_NUM; i++)
    {
        if (f32_in_buffer[i])
        {
            free(f32_in_buffer[i]);
            f32_in_buffer[i] = NULL;
        }
        if (in_attr[i])
        {
            vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
        }
    }
    for (i = 0; i < _OUTPUT_NUM; i++)
    {
        if (f32_out_buffer[i])
        {
            free(f32_out_buffer[i]);
            f32_out_buffer[i] = NULL;
        }
        if (out_attr[i])
        {
            vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
        }
    }

    return status;
} /* _compute() */

/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs
    /* Add extra params */
    )
{
    vsi_status status = VSI_FAILURE;
    snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
    kernel->info.function = _compute;
    kernel->info.parameters = _erf_kernel_param_def;
    kernel->info.numParams = _cnt_of_array( _erf_kernel_param_def );
    status = VSI_SUCCESS;

    return status;
} /* _query_kernel() */

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_ERF_PARAM_NUM];
    vsi_nn_kernel_node_t node = NULL;

    status = _query_kernel( kernel, inputs, outputs);
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _ERF_PARAM_NUM,
                    inputs, input_num, outputs, output_num );
            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _ERF_PARAM_NUM );
        }
    }

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CPU( erf, _setup )

@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -0,0 +1,315 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
#define _CPU_ARG_NUM (2)
#define _CPU_INPUT_NUM (3)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.group_norm")

DEF_KERNEL_EXECUTOR(_group_norm_exec)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
    float * buffer[_CPU_IO_NUM] = { NULL };
    size_t out_elements = 0;
    vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
    uint32_t i = 0;
    int32_t spaceOrg = 0;
    float eps = .0f;

    tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
    tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
    tensors[2] = (vsi_nn_kernel_tensor_t)param[2];
    tensors[3] = (vsi_nn_kernel_tensor_t)param[3];

    attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
    attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
    attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
    CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
    attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] );
    CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final );

    out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] );

    status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &eps);
    CHECK_STATUS_FAIL_GOTO(status, final );

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &spaceOrg);
    CHECK_STATUS_FAIL_GOTO(status, final );

    buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
    CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );

    buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE );
    CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final );

    buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE );
    CHECK_PTR_FAIL_GOTO( buffer[2], "Create input2 buffer fail.", final );

    buffer[3] = (float *)malloc( out_elements * sizeof(float) );
    CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final );
    memset( buffer[3], 0, out_elements * sizeof(float) );

    {
        uint32_t b = 0, c = 0;
        uint32_t height = attr[0]->shape->data[1];
        uint32_t width = attr[0]->shape->data[0];
        uint32_t ch = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1;
        uint32_t bh = attr[0]->shape->size > 3 ? attr[0]->shape->data[3] : 1;
        uint32_t spatial = height * width;
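        /* Each (b, c) page is one normalization group (the reshape in _setup
         * folds group_size channels into the "spatial" extent). Per group:
         * mean = sum(x) / spatial, var = sum((x - mean)^2) / spatial, then
         * y = (x - mean) * rsqrt(var + eps) * scale + bias, with scale/bias
         * recovered per original channel through i / spaceOrg. */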

        for (b = 0; b < bh; b++)
        {
            for (c = 0; c < ch; c++)
            {
                uint32_t page = c * spatial + b * (spatial * ch);
                uint32_t paraIdx = c * attr[1]->shape->data[0];
                float sum = .0f;
                float sumsq = .0f;
                float mean = .0f;
                float vari = .0f;
                float data = 0;

                for (i = 0; i < spatial; i++)
                {
                    uint32_t index = page + i;
                    sum += buffer[0][index];
                }

                mean = sum / spatial;
                for (i = 0; i < spatial; i++)
                {
                    uint32_t index = page + i;
                    data = buffer[0][index] - mean;
                    sumsq += data * data;
                }

                vari = sumsq / spatial;
                vari = (float)(1.0 / sqrtf(vari + eps));

                for (i = 0; i < spatial; i++)
                {
                    float normVal = 0;
                    uint32_t index = page + i;
                    uint32_t tmpIdx = paraIdx + i / spaceOrg;
                    float scaleVal = buffer[2][tmpIdx];
                    float biasVal = buffer[1][tmpIdx];

                    data = buffer[0][index] - mean;
                    normVal = data * vari * scaleVal + biasVal;
                    buffer[3][index] = normVal;
                }
            }
        }
    }

    status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3],
            buffer[3], out_elements );
    CHECK_STATUS_FAIL_GOTO( status, final );

final:
    for( i = 0; i < _CPU_IO_NUM; i ++ )
    {
        if ( buffer[i] )
        {
            free( buffer[i] );
        }
    }
    for( i = 0; i < _CPU_IO_NUM; i ++ )
    {
        if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
    }
    return status;
} /* _group_norm_exec() */
/*
 * Kernel params
 */
static vx_param_description_t _group_normalization_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _GROUP_NORMALIZATION_PARAM_NUM _cnt_of_array( _group_normalization_kernel_param_def )

static const vx_kernel_description_t _kernel_info =
{
    KERNEL_ID_PLACEHOLDER,
    _KERNEL_NAME,
    _group_norm_exec,
    _group_normalization_kernel_param_def,
    _cnt_of_array( _group_normalization_kernel_param_def ),
    vsi_nn_KernelValidator,
    NULL,
    NULL,
    vsi_nn_KernelInitializer,
    vsi_nn_KernelDeinitializer
};

static vsi_status _query_kernel
    (
    vsi_nn_tensor_t* const* const inputs,
    vsi_nn_tensor_t* const* const outputs,
    vsi_nn_kernel_t* kernel
    )
{
    memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
    return VSI_SUCCESS;
} /* _query_kernel() */

static int32_t _optimize_gn_shape_cpu
    (
    vsi_nn_tensor_t ** inputs,
    int32_t group_size,
    int32_t group_num,
    int32_t* opt_shape
    )
{
    vsi_status status = VSI_SUCCESS;
    int32_t group_shape[VSI_NN_MAX_DIM_NUM] = {0};
    int32_t new_rank = 0;
    group_shape[0] = inputs[0]->attr.size[0];
    group_shape[1] = inputs[0]->attr.size[1];
    group_shape[2] = group_size;

    vsi_nn_kernel_optimize_element_shape(group_shape, 3, opt_shape, &new_rank );

    if (new_rank == 2)
    {
        opt_shape[2] = group_num;
        opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
    }
    else
    {
        status = VSI_FAILURE;
    }

    return status;
}

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
    vsi_nn_kernel_node_t node = NULL;
    vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
    int32_t new_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 };
    int32_t group_num = vsi_nn_kernel_param_get_int32( params, "group_num" );
    int32_t group_size = inputs[0]->attr.size[2] / group_num;
    int32_t spaceOrg = inputs[0]->attr.size[0] * inputs[0]->attr.size[1];
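    /* The C channels are split into group_num groups of group_size channels;
     * spaceOrg keeps the original per-channel spatial size (W * H) so the
     * kernel can map flattened group offsets back to channel indices for
     * scale/bias. */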

    status = _optimize_gn_shape_cpu(inputs, group_size, group_num, new_shape);
    if ( VSI_SUCCESS != status )
    {
        goto final;
    }
    rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4);
    rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4);

    status = _query_kernel( inputs, outputs, kernel );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
            uint32_t index = 0;
            /* Set inputs and outputs */
            backend_params[index++] = rs_input;
            backend_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t;
            backend_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t;
            backend_params[index++] = rs_output;
            backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
            backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &spaceOrg );

            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
            CHECK_STATUS( status );
            vsi_nn_kernel_scalar_release( &backend_params[4] );
            vsi_nn_kernel_scalar_release( &backend_params[5] );
        }
        else
        {
            status = VSI_FAILURE;
        }
    }
final:
    if (rs_input)
    {
        vsi_nn_kernel_tensor_release( &rs_input );
    }
    if (rs_output)
    {
        vsi_nn_kernel_tensor_release( &rs_output );
    }
    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CPU( group_norm, _setup )

@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -143,8 +143,8 @@ DEF_KERNEL_EXECUTOR(_layer_norm_exec)
            {
                int idx = (outer * axisSize + i) * innerSize + inner;
                float data = buffer[0][idx] - mean;
                float scaleVal = buffer[2][idx];
                float biasVal = buffer[1][idx];
                float scaleVal = buffer[2][i];
                float biasVal = buffer[1][i];
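                /* scale/bias carry one value per element along the normalized
                   axis, so the fix indexes them with i instead of the
                   flattened idx. */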
                float normVal = data * vari * scaleVal + biasVal;
                buffer[3][idx] = normVal;
            }

@ -36,7 +36,7 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

#define _CPU_ARG_NUM (2)

@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -36,7 +36,7 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -0,0 +1,441 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
#define _INPUT_NUM (2)
#define _OUTPUT_NUM (3)
#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.nms")

/*
 * Kernel params
 */
static vx_param_description_t _nms_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define SCALAR_INPUT_MAX_SIZE (5)
#define SCALAR_INPUT_IOU_THRES (6)
#define SCALAR_INPUT_SCORE_THRES (7)
#define SCALAR_INPUT_SOFT_NMS_SIGMA (8)
#define _NMS_PARAM_NUM _cnt_of_array( _nms_kernel_param_def )

typedef struct Candidate_s
{
    int index;
    float score;
    int suppress_begin_index;
}Candidate;
static void _swap_element
    (
    Candidate* list,
    uint32_t first,
    uint32_t second
    )
{
    Candidate temp;
    memcpy(&temp, &list[first], sizeof(Candidate));
    memcpy(&list[first], &list[second], sizeof(Candidate));
    memcpy(&list[second], &temp, sizeof(Candidate));
}

static uint32_t _max_element
    (
    Candidate* list,
    uint32_t len
    )
{
    uint32_t i;
    uint32_t max_index = 0;
    float max_val = list[0].score;
    for ( i = 1; i < len; i++ )
    {
        float val = list[i].score;
        if ( max_val < val )
        {
            max_val = val;
            max_index = i;
        }
    }

    return max_index;
}

typedef struct box_corner_encoding_s
{
    float y1;
    float x1;
    float y2;
    float x2;
}box_corner_encoding;
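
/* IoU of boxes i and j in corner encoding:
 * intersection / (area_i + area_j - intersection), with min/max taken first
 * so degenerate (flipped) corners are tolerated; non-positive areas give 0. */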

static float _computeIntersectionOverUnion
    (
    const float* boxes,
    const int32_t i,
    const int32_t j
    )
{
    box_corner_encoding box_i = ((box_corner_encoding *)boxes)[i];
    box_corner_encoding box_j = ((box_corner_encoding *)boxes)[j];
    const float box_i_y_min = vsi_nn_min(box_i.y1, box_i.y2);
    const float box_i_y_max = vsi_nn_max(box_i.y1, box_i.y2);
    const float box_i_x_min = vsi_nn_min(box_i.x1, box_i.x2);
    const float box_i_x_max = vsi_nn_max(box_i.x1, box_i.x2);
    const float box_j_y_min = vsi_nn_min(box_j.y1, box_j.y2);
    const float box_j_y_max = vsi_nn_max(box_j.y1, box_j.y2);
    const float box_j_x_min = vsi_nn_min(box_j.x1, box_j.x2);
    const float box_j_x_max = vsi_nn_max(box_j.x1, box_j.x2);

    const float area_i =
        (box_i_y_max - box_i_y_min) * (box_i_x_max - box_i_x_min);
    const float area_j =
        (box_j_y_max - box_j_y_min) * (box_j_x_max - box_j_x_min);
    const float intersection_ymax = vsi_nn_min(box_i_y_max, box_j_y_max);
    const float intersection_xmax = vsi_nn_min(box_i_x_max, box_j_x_max);
    const float intersection_ymin = vsi_nn_max(box_i_y_min, box_j_y_min);
    const float intersection_xmin = vsi_nn_max(box_i_x_min, box_j_x_min);
    const float intersection_area =
        vsi_nn_max(intersection_ymax - intersection_ymin, 0.0f) *
        vsi_nn_max(intersection_xmax - intersection_xmin, 0.0f);

    if (area_i <= 0 || area_j <= 0)
    {
        return 0.0f;
    }

    return intersection_area / (area_i + area_j - intersection_area);
}

/*
 * Kernel function
 */
DEF_KERNEL_EXECUTOR(_compute)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VX_SUCCESS;
    vsi_nn_kernel_tensor_t tensors[_INPUT_NUM] = { NULL };
    vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
    float * buffer[_INPUT_NUM] = { NULL };
    float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
    size_t stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}};
    size_t out_elements[_OUTPUT_NUM] = {0};
    vsi_nn_kernel_tensor_attr_t * attr[_INPUT_NUM] = { NULL };
    vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
    int32_t i = 0;
    int32_t num_boxes = 0;
    float* boxes = NULL;
    float* scores = NULL;
    float* selected_indices = NULL;
    float* selected_scores = NULL;
    float* num_selected_indices = NULL;
    Candidate * candidate = NULL;
    int32_t select_size = 0;
    int32_t max_output_size = 0;
    int32_t select_start = 0;
    int32_t select_len = 0;
    float iou_threshold = 0.f;
    float score_threshold = 0.f;
    float soft_nms_sigma = 0.f;
    float scale = 0;
    int32_t num_outputs = 0;

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_MAX_SIZE],
            &max_output_size);
    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_IOU_THRES],
            &iou_threshold);
    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_SCORE_THRES],
            &score_threshold);
    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_SOFT_NMS_SIGMA],
            &soft_nms_sigma);
    CHECK_STATUS_FAIL_GOTO(status, final );

    for ( i = 0; i < _INPUT_NUM; i++)
    {
        tensors[i] = (vsi_nn_kernel_tensor_t)param[i];
        attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] );

        vsi_nn_kernel_tensor_attr_get_stride( attr[i], stride_size[i] );
        buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE );
        CHECK_PTR_FAIL_GOTO( buffer[i], "Create input buffer fail.", final );
    }

    for ( i = 0; i < _OUTPUT_NUM; i++)
    {
        output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
        out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );

        out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
        f32_out_buffer[i] = (float *)malloc( out_elements[i] * sizeof(float) );
        CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
        memset( f32_out_buffer[i], 0, out_elements[i] * sizeof(float) );
    }

    num_boxes = attr[0]->shape->data[1];
    boxes = buffer[0];
    scores = buffer[1];
    selected_indices = f32_out_buffer[0];
    selected_scores = f32_out_buffer[1];
    num_selected_indices = f32_out_buffer[2];

    candidate = (Candidate*)malloc(num_boxes * sizeof(Candidate));
    CHECK_PTR_FAIL_GOTO( candidate, "Create select buffer fail.", final );
    memset(candidate, 0, num_boxes * sizeof(Candidate));

    for (i = 0; i < num_boxes; ++i)
    {
        if (scores[i] > score_threshold)
        {
            candidate[select_size].index = i;
            candidate[select_size].score = scores[i];
            candidate[select_size].suppress_begin_index = 0;
            select_size++;
        }
    }

    num_outputs = vsi_nn_min(select_size, max_output_size);

    if (num_outputs == 0)
    {
        num_selected_indices[0] = 0;
    }

    if (soft_nms_sigma > 0.0f)
    {
        scale = -0.5f / soft_nms_sigma;
    }

    select_len = 0;
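    /* Selection loop in the style of TF's soft-NMS: repeatedly pull the
     * highest-scoring remaining candidate to the front, re-score it against
     * every box selected since its suppress_begin_index (hard suppression when
     * IoU >= iou_threshold, otherwise Gaussian decay exp(-iou^2 / (2 * sigma))
     * via the precomputed scale), and emit it only if its score survived
     * unchanged; decayed-but-alive candidates stay in the pool for another
     * round. */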
    while (select_len < num_outputs && select_start < select_size)
    {
        int32_t j = 0;
        float original_score = 0;
        vsi_bool should_hard_suppress = FALSE;

        // find max score and swap to the front.
        int32_t max_index = _max_element( &candidate[select_start], select_size - select_start);

        if (max_index != select_size - select_start - 1)
        {
            _swap_element(&(candidate[select_start]), max_index, 0);
        }

        original_score = candidate[select_start].score;
        // Calculate IoU of the rest, swap to the end (discard) if needed.
        for ( j = select_len - 1; j >= candidate[select_start].suppress_begin_index; j-- )
        {
            int32_t idx = (int32_t)selected_indices[j];
            float iou = _computeIntersectionOverUnion(boxes, candidate[select_start].index, idx);

            // First decide whether to perform hard suppression.
            if (iou >= iou_threshold)
            {
                should_hard_suppress = TRUE;
                break;
            }

            // Suppress score if NMS sigma > 0.
            if (soft_nms_sigma > 0.0)
            {
                candidate[select_start].score =
                    candidate[select_start].score * (float)exp(scale * iou * iou);
            }

            if (candidate[select_start].score <= score_threshold)
                break;
        }

        candidate[select_start].suppress_begin_index = select_len;
        if (!should_hard_suppress)
        {
            if (candidate[select_start].score == original_score)
            {
                // Suppression has not occurred, so select next_candidate.
                selected_indices[select_len] = (float)candidate[select_start].index;
                selected_scores[select_len] = candidate[select_start].score;
                ++ select_len;
            }
            if ( candidate[select_start].score > score_threshold)
            {
                // Soft suppression might have occurred and current score is still
                // greater than score_threshold; add next_candidate back onto priority
                // queue.
                candidate[select_start].suppress_begin_index = select_len;
            }
        }

        select_start ++;
    }

    num_selected_indices[0] = (float)select_len;

    for ( i = select_len; i < max_output_size; i++)
    {
        selected_indices[i] = 0;
        selected_scores[i] = 0;
    }

    /* save data */
    for ( i = 0; i < _OUTPUT_NUM; i++ )
    {
        status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
                f32_out_buffer[i], out_elements[i] );
        CHECK_STATUS_FAIL_GOTO( status, final );
    }

final:
    vsi_nn_safe_free(candidate);
    for( i = 0; i < _INPUT_NUM; i ++ )
    {
        if ( buffer[i] )
        {
            free( buffer[i] );
        }
        vsi_nn_kernel_tensor_attr_release( &attr[i] );
    }

    for ( i = 0; i < _OUTPUT_NUM; i++ )
    {
        if (f32_out_buffer[i])
        {
            free(f32_out_buffer[i]);
            f32_out_buffer[i] = NULL;
        }
        if (out_attr[i])
        {
            vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
        }
    }

    return status;
} /* _compute() */

/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs
    /* Add extra params */
    )
{
    vsi_status status = VSI_FAILURE;
    snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
    kernel->info.function = _compute;
    kernel->info.parameters = _nms_kernel_param_def;
    kernel->info.numParams = _cnt_of_array( _nms_kernel_param_def );
    status = VSI_SUCCESS;

    return status;
} /* _query_kernel() */

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_NMS_PARAM_NUM];
    vsi_nn_kernel_node_t node = NULL;
    int32_t max_output_size = vsi_nn_kernel_param_get_int32(params, "max_output_size");
    float iou_threshold = vsi_nn_kernel_param_get_float32(params, "iou_threshold");
    float score_threshold = vsi_nn_kernel_param_get_float32(params, "score_threshold");
    float soft_nms_sigma = vsi_nn_kernel_param_get_float32(params, "soft_nms_sigma");

    status = _query_kernel( kernel, inputs, outputs );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _NMS_PARAM_NUM,
                    inputs, input_num, outputs, output_num );
            /* Pass parameters to node. */
            node_params[SCALAR_INPUT_MAX_SIZE] = vsi_nn_kernel_scalar_create(
                    graph, I32, &max_output_size );
            node_params[SCALAR_INPUT_IOU_THRES] = vsi_nn_kernel_scalar_create(
                    graph, F32, &iou_threshold );
            node_params[SCALAR_INPUT_SCORE_THRES] = vsi_nn_kernel_scalar_create(
                    graph, F32, &score_threshold );
            node_params[SCALAR_INPUT_SOFT_NMS_SIGMA] = vsi_nn_kernel_scalar_create(
                    graph, F32, &soft_nms_sigma );
            status = vsi_nn_kernel_node_pass_param( node, node_params, _NMS_PARAM_NUM );

            vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MAX_SIZE] );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_IOU_THRES] );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCORE_THRES] );
            vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SOFT_NMS_SIGMA] );
        }
    }

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CPU( nms, _setup )

@ -0,0 +1,252 @@
/****************************************************************************
*
*    Copyright (c) 2020 Vivante Corporation
*
*    Permission is hereby granted, free of charge, to any person obtaining a
*    copy of this software and associated documentation files (the "Software"),
*    to deal in the Software without restriction, including without limitation
*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
*    and/or sell copies of the Software, and to permit persons to whom the
*    Software is furnished to do so, subject to the following conditions:
*
*    The above copyright notice and this permission notice shall be included in
*    all copies or substantial portions of the Software.
*
*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
*    DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
#define _INPUT_NUM          (1)
#define _OUTPUT_NUM         (1)
#define _IO_NUM             (_INPUT_NUM + _OUTPUT_NUM)
#define _KERNEL_NAME        CVIVANTE_NAMESPACE("cpu.one_hot")


/*
 * Kernel params
 */
static vx_param_description_t _one_hot_kernel_param_def[] =
{
    {VX_INPUT,  VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};

#define INPUT_SCALAR_DEPTH      (2)
#define INPUT_SCALAR_ON_VALUE   (3)
#define INPUT_SCALAR_OFF_VALUE  (4)
#define INPUT_SCALAR_AXIS       (5)
#define _ONE_HOT_PARAM_NUM  _cnt_of_array( _one_hot_kernel_param_def )


/*
 * Kernel function
 */
DEF_KERNEL_EXECUTOR(_compute)
    (
    vsi_nn_kernel_node_t                node,
    const vsi_nn_kernel_node_param_t  * param,
    size_t                              param_size
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_tensor_t tensors[_IO_NUM] = { NULL };
    float * buffer[_IO_NUM] = { NULL };
    size_t out_elements = 0;
    vsi_nn_kernel_tensor_attr_t * attr[_IO_NUM] = { NULL };
    int32_t i = 0;
    int32_t j = 0;
    int32_t k = 0;
    int32_t index = 0;
    int32_t depth = 0;
    float on_value = 0;
    float off_value = 0;
    int32_t axis = 0;
    int32_t prefix_dim_size = 1;
    int32_t suffix_dim_size = 0;
    int32_t num_elements = 0;

    tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
    tensors[1] = (vsi_nn_kernel_tensor_t)param[1];

    attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
    attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &depth);
    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[3], &on_value);
    status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &off_value);
    status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis);
    CHECK_STATUS_FAIL_GOTO(status, final );

    num_elements = (int32_t)vsi_nn_kernel_tensor_attr_get_size( attr[0] );
    buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
    CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final );

    out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
    buffer[1] = (float *)malloc( out_elements * sizeof(float) );
    CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final );
    memset( buffer[1], 0, out_elements * sizeof(float) );

    axis = axis == -1 ? (int32_t)attr[0]->shape->size : (int32_t)attr[0]->shape->size - axis;

    for (i = 0; i < axis; i++)
    {
        prefix_dim_size *= attr[0]->shape->data[i];
    }

    suffix_dim_size = num_elements / prefix_dim_size;

    for (i = 0; i < prefix_dim_size; i++)
    {
        for (j = 0; j < depth; j++)
        {
            for (k = 0; k < suffix_dim_size; k++)
            {
                int32_t value = (int32_t)buffer[0][i * suffix_dim_size + k];
                buffer[1][index ++] = value == j ? on_value : off_value;
            }
        }
    }

    status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
            buffer[1], out_elements );
    CHECK_STATUS_FAIL_GOTO( status, final );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
    SAFE_FREE_TENSOR_ATTR(attr[0]);
    SAFE_FREE_TENSOR_ATTR(attr[1]);
#undef SAFE_FREE_TENSOR_ATTR
    for ( i = 0; i < _IO_NUM; i ++ )
    {
        if ( buffer[i] )
        {
            free( buffer[i] );
            buffer[i] = NULL;
        }
    }

    return status;
} /* _compute() */
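
/*
 * Editor's illustrative sketch (not part of the commit): the loop above views
 * the input as a [prefix_dim_size, suffix_dim_size] matrix and emits the
 * output as [prefix, depth, suffix], inserting the one-hot dimension between
 * the two halves. The hypothetical helper below shows the degenerate 1-D case
 * (suffix_dim_size == 1), where the same indexing reduces to out[i*depth + j].
 */
static void one_hot_1d_sketch(const float* in, int n, int depth,
                              float on_value, float off_value, float* out)
{
    int i, j;
    for (i = 0; i < n; i++)          /* prefix_dim_size == n, suffix == 1 */
    {
        for (j = 0; j < depth; j++)
        {
            out[i * depth + j] = ((int)in[i] == j) ? on_value : off_value;
        }
    }
}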


/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs
    /* Add extra params */
    )
{
    vsi_status status = VSI_FAILURE;
    snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
    kernel->info.function    = _compute;
    kernel->info.parameters  = _one_hot_kernel_param_def;
    kernel->info.numParams   = _cnt_of_array( _one_hot_kernel_param_def );
    status = VSI_SUCCESS;

    return status;
} /* _query_kernel() */


static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t              * graph,
    vsi_nn_tensor_t            ** inputs,
    size_t                        input_num,
    vsi_nn_tensor_t            ** outputs,
    size_t                        output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t             * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_ONE_HOT_PARAM_NUM] = {NULL};
    vsi_nn_kernel_node_t node = NULL;
    int32_t depth     = vsi_nn_kernel_param_get_int32( params, "depth" );
    float   on_value  = vsi_nn_kernel_param_get_float32( params, "on_value" );
    float   off_value = vsi_nn_kernel_param_get_float32( params, "off_value" );
    int32_t axis      = vsi_nn_kernel_param_get_int32( params, "axis" );

    status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _ONE_HOT_PARAM_NUM,
                    inputs, input_num, outputs, output_num );
            node_params[INPUT_SCALAR_DEPTH] = vsi_nn_kernel_scalar_create(
                    graph, I32, &depth );
            node_params[INPUT_SCALAR_ON_VALUE] = vsi_nn_kernel_scalar_create(
                    graph, F32, &on_value );
            node_params[INPUT_SCALAR_OFF_VALUE] = vsi_nn_kernel_scalar_create(
                    graph, F32, &off_value );
            node_params[INPUT_SCALAR_AXIS] = vsi_nn_kernel_scalar_create(
                    graph, I32, &axis );
            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _ONE_HOT_PARAM_NUM );
            CHECK_STATUS_FAIL_GOTO( status, OnError );
        }
    }
OnError:
    if (node_params[INPUT_SCALAR_DEPTH])
    {
        vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_DEPTH] );
    }

    if (node_params[INPUT_SCALAR_ON_VALUE])
    {
        vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_ON_VALUE] );
    }

    if (node_params[INPUT_SCALAR_OFF_VALUE])
    {
        vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_OFF_VALUE] );
    }

    if (node_params[INPUT_SCALAR_AXIS])
    {
        vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_AXIS] );
    }

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CPU( one_hot, _setup )
@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -38,7 +38,7 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -0,0 +1,286 @@
/****************************************************************************
*
*    Copyright (c) 2019 Vivante Corporation
*
*    Permission is hereby granted, free of charge, to any person obtaining a
*    copy of this software and associated documentation files (the "Software"),
*    to deal in the Software without restriction, including without limitation
*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
*    and/or sell copies of the Software, and to permit persons to whom the
*    Software is furnished to do so, subject to the following conditions:
*
*    The above copyright notice and this permission notice shall be included in
*    all copies or substantial portions of the Software.
*
*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
*    DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
#define _CPU_ARG_NUM            (1)
#define _CPU_INPUT_NUM          (2)
#define _CPU_OUTPUT_NUM         (1)
#define _CPU_IO_NUM             (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM          (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME            CVIVANTE_NAMESPACE("cpu.repeat")

DEF_KERNEL_EXECUTOR(_repeat_exec)
    (
    vsi_nn_kernel_node_t                node,
    const vsi_nn_kernel_node_param_t  * param,
    size_t                              param_size
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
    float * buffer[_CPU_IO_NUM] = { NULL };
    size_t out_elements = 0;
    vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
    int32_t i = 0, j = 0, b = 0, c = 0;
    int32_t axis = 0;
    int32_t outerSize = 1;
    int32_t outIdx = 0;
    int32_t width = 0, height = 0, channel = 0, batch = 0;
    int32_t spatial = 0, vol = 0;

    tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
    tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
    tensors[2] = (vsi_nn_kernel_tensor_t)param[2];

    attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
    attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
    attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
    CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );

    out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] );

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis);
    CHECK_STATUS_FAIL_GOTO(status, final );

    buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
    CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );

    buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE );
    CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final );

    buffer[2] = (float *)malloc( out_elements * sizeof(float) );
    CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final );
    memset( buffer[2], 0, out_elements * sizeof(float) );

    width   = attr[0]->shape->data[0];
    height  = attr[0]->shape->data[1];
    channel = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1;
    batch   = attr[0]->shape->size > 3 ? attr[0]->shape->data[3] : 1;
    spatial = width * height;
    vol     = spatial * channel;

    for (i = 1; i < (int32_t)attr[0]->shape->size; i++)
    {
        outerSize *= attr[0]->shape->data[i];
    }

    if (axis == 0 && outerSize == 1)
    {
        for (i = 0; i < width; i++)
        {
            float data = buffer[0][i];
            int32_t len = (int32_t)buffer[1][i];
            for (j = 0; j < len; j++)
            {
                buffer[2][outIdx ++] = data;  /* advance the write cursor for each copy */
            }
        }
    }
    else if (axis == 0)
    {
        for (b = 0; b < batch; b++)
        {
            for (c = 0; c < channel; c++)
            {
                for (i = 0; i < height; i++)
                {
                    int32_t len = (int32_t)buffer[1][i];
                    int32_t offset = i * width + c * spatial + b * vol;
                    for (j = 0; j < len; j++)
                    {
                        memcpy(buffer[2] + outIdx, buffer[0] + offset, sizeof(float) * width);
                        outIdx += width;
                    }
                }
            }
        }
    }
    else if (axis == 1)
    {
        for (b = 0; b < batch; b++)
        {
            for (c = 0; c < channel; c++)
            {
                for (i = 0; i < height; i++)
                {
                    int32_t offset = i * width + c * spatial + b * vol;
                    for (j = 0; j < width; j++)
                    {
                        int32_t len = (int32_t)buffer[1][j];
                        float data = buffer[0][offset + j];
                        int32_t k = 0;
                        for (k = 0; k < len; k++)
                        {
                            buffer[2][outIdx++] = data;
                        }
                    }
                }
            }
        }
    }
    else if (axis == 2)
    {
        for (b = 0; b < batch; b++)
        {
            for (c = 0; c < channel; c++)
            {
                int32_t len = (int32_t)buffer[1][c];
                int32_t offset = c * spatial + b * vol;

                for (j = 0; j < len; j++)
                {
                    memcpy(buffer[2] + outIdx, buffer[0] + offset, sizeof(float) * spatial);
                    outIdx += spatial;
                }
            }
        }
    }
    else
    {
        VSILOGE("axis is not supported");
        status = VSI_FAILURE;
        goto final;
    }

    status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2],
            buffer[2], out_elements );
    CHECK_STATUS_FAIL_GOTO( status, final );

final:
    for ( i = 0; i < _CPU_IO_NUM; i ++ )
    {
        if ( buffer[i] )
        {
            free( buffer[i] );
        }
    }
    for ( i = 0; i < _CPU_IO_NUM; i ++ )
    {
        if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
    }
    return status;
} /* _repeat_exec() */
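
/*
 * Editor's illustrative sketch (not part of the commit): for axis == 2 the
 * kernel replicates whole channel planes. Each plane is `spatial` floats
 * starting at offset c * spatial + b * vol and is copied reps[c] times back
 * to back. The hypothetical helper below mirrors that branch; it relies on
 * <string.h>, which this file already includes.
 */
static void repeat_channels_sketch(const float* in, const float* reps,
                                   int width, int height, int channel,
                                   int batch, float* out)
{
    int spatial = width * height;
    int vol = spatial * channel;
    int b, c, j, out_idx = 0;
    for (b = 0; b < batch; b++)
    {
        for (c = 0; c < channel; c++)
        {
            int len = (int)reps[c];
            const float* plane = in + c * spatial + b * vol;
            for (j = 0; j < len; j++)
            {
                memcpy(out + out_idx, plane, sizeof(float) * spatial);
                out_idx += spatial;
            }
        }
    }
}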
/*
 * Kernel params
 */
static vx_param_description_t _repeat_kernel_param_def[] =
{
    {VX_INPUT,  VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _REPEAT_PARAM_NUM  _cnt_of_array( _repeat_kernel_param_def )

static const vx_kernel_description_t _kernel_info =
{
    KERNEL_ID_PLACEHOLDER,
    _KERNEL_NAME,
    _repeat_exec,
    _repeat_kernel_param_def,
    _cnt_of_array( _repeat_kernel_param_def ),
    vsi_nn_KernelValidator,
    NULL,
    NULL,
    vsi_nn_KernelInitializer,
    vsi_nn_KernelDeinitializer
};

static vsi_status _query_kernel
    (
    vsi_nn_tensor_t* const* const inputs,
    vsi_nn_tensor_t* const* const outputs,
    vsi_nn_kernel_t* kernel
    )
{
    memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
    return VSI_SUCCESS;
} /* _query_kernel() */

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t              * graph,
    vsi_nn_tensor_t            ** inputs,
    size_t                        input_num,
    vsi_nn_tensor_t            ** outputs,
    size_t                        output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t             * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
    vsi_nn_kernel_node_t node = NULL;
    int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );

    status = _query_kernel( inputs, outputs, kernel );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
                    inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
            backend_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &axis );

            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
            CHECK_STATUS( status );
            vsi_nn_kernel_scalar_release( &backend_params[3] );
        }
        else
        {
            status = VSI_FAILURE;
        }
    }
    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CPU( repeat, _setup )
@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -0,0 +1,248 @@
/****************************************************************************
*
*    Copyright (c) 2020 Vivante Corporation
*
*    Permission is hereby granted, free of charge, to any person obtaining a
*    copy of this software and associated documentation files (the "Software"),
*    to deal in the Software without restriction, including without limitation
*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
*    and/or sell copies of the Software, and to permit persons to whom the
*    Software is furnished to do so, subject to the following conditions:
*
*    The above copyright notice and this permission notice shall be included in
*    all copies or substantial portions of the Software.
*
*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
*    DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

#define _CPU_ARG_NUM            (1)
#define _CPU_INPUT_NUM          (1)
#define _CPU_OUTPUT_NUM         (1)
#define _CPU_IO_NUM             (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM          (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME            CVIVANTE_NAMESPACE("sequence_mask_sw")

DEF_KERNEL_EXECUTOR(_sequence_mask_exec)
    (
    vsi_nn_kernel_node_t                node,
    const vsi_nn_kernel_node_param_t  * param,
    size_t                              param_size
    )
{
    vsi_status status = VX_SUCCESS;
    vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
    float * buffer_in = NULL;
    float * buffer = NULL;
    size_t out_elements = 0;
    vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
    uint32_t i = 0;

    tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
    tensors[1] = (vsi_nn_kernel_tensor_t)param[1];

    attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
    attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );

    out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );

    buffer_in = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
    CHECK_PTR_FAIL_GOTO( buffer_in, "Create input0 buffer fail.", final );

    buffer = (float *)malloc( out_elements * sizeof(float) );
    CHECK_PTR_FAIL_GOTO( buffer, "Create output buffer fail.", final );
    memset( buffer, 0, out_elements * sizeof(float) );

    {
        uint32_t j = 0;
        uint32_t height = attr[1]->shape->data[1];
        uint32_t width = attr[1]->shape->data[0];

        for (j = 0; j < height; j++)
        {
            uint32_t idx_in = (uint32_t)buffer_in[j];
            uint32_t out_offset = j * width;
            idx_in = idx_in > width ? width : idx_in;
            for (i = 0; i < idx_in; i++)
            {
                buffer[out_offset + i] = 1;
            }
        }
    }

    status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
            buffer, out_elements );
    CHECK_STATUS_FAIL_GOTO( status, final );

final:
    if (buffer_in)
    {
        free( buffer_in );
    }
    if (buffer)
    {
        free( buffer );
    }
    for ( i = 0; i < _CPU_IO_NUM; i ++ )
    {
        vsi_nn_kernel_tensor_attr_release( &attr[i] );
    }
    return status;
} /* _sequence_mask_exec() */
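
/*
 * Editor's illustrative sketch (not part of the commit): each output row j is
 * a prefix mask of length buffer_in[j], clamped to the row width, matching
 * tf.sequence_mask semantics. E.g. lengths {1, 3, 2} with width 3 produce
 * rows {1,0,0}, {1,1,1}, {1,1,0}. The helper name below is hypothetical.
 */
static void sequence_mask_row_sketch(float len, int width, float* row)
{
    int i, n = (int)len;
    if (n > width) n = width;            /* clamp, as the kernel above does */
    for (i = 0; i < n; i++) row[i] = 1.0f;
    for (; i < width; i++) row[i] = 0.0f;
}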

static vx_param_description_t kernel_param_def[] =
{
    {VX_INPUT,  VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};


static const vx_kernel_description_t _kernel_info =
{
    KERNEL_ID_PLACEHOLDER,
    _KERNEL_NAME,
    _sequence_mask_exec,
    kernel_param_def,
    _cnt_of_array( kernel_param_def ),
    vsi_nn_KernelValidator,
    NULL,
    NULL,
    vsi_nn_KernelInitializer,
    vsi_nn_KernelDeinitializer
};

static vsi_status _query_kernel
    (
    vsi_nn_tensor_t* const* const inputs,
    vsi_nn_tensor_t* const* const outputs,
    vsi_nn_kernel_t* kernel
    )
{
    memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
    return VSI_SUCCESS;
} /* _query_kernel() */

static int32_t _optimize_mask_shape
    (
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs,
    int32_t max_len,
    int32_t* opt_shape_in,
    int32_t* opt_shape_out
    )
{
    vsi_status status = VSI_SUCCESS;
    int32_t out_size = 1;
    uint32_t i = 0;
    opt_shape_in[0] = 1;
    opt_shape_in[1] = 1;
    for (i = 0; i < inputs[0]->attr.dim_num; i++)
    {
        opt_shape_in[0] *= inputs[0]->attr.size[i];
    }

    for (i = 0; i < outputs[0]->attr.dim_num; i++)
    {
        out_size *= outputs[0]->attr.size[i];
    }

    opt_shape_out[0] = max_len;
    opt_shape_out[1] = out_size / max_len;

    if (out_size % max_len != 0)
    {
        return VSI_FAILURE;
    }

    return status;
}
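
/*
 * Editor's note (illustrative, not part of the commit): the reshape driven by
 * _optimize_mask_shape flattens the length tensor to [N, 1] and views the
 * output as [max_len, N], so the executor above only ever handles a 2-D mask.
 * For example, a (2, 3) length tensor with max_len = 5 runs with input shape
 * [6, 1] and output shape [5, 6] (30 = 5 * 6 elements, divisibility checked).
 */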

static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t              * graph,
    vsi_nn_tensor_t            ** inputs,
    size_t                        input_num,
    vsi_nn_tensor_t            ** outputs,
    size_t                        output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t             * kernel
    )
{
    vsi_status status = VSI_SUCCESS;
    vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
    vsi_nn_kernel_node_t node = NULL;
    vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
    int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }};
    int32_t max_len = vsi_nn_kernel_param_get_int32( params, "max_len" );

    status = _optimize_mask_shape(inputs, outputs, max_len, new_shape[0], new_shape[1]);
    if ( VSI_SUCCESS != status )
    {
        goto final;
    }
    rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], 2);
    rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], 2);

    status = _query_kernel( inputs, outputs, kernel );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            /* Set inputs and outputs */
            uint32_t index = 0;
            backend_params[index++] = rs_input;
            backend_params[index++] = rs_output;
            backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &max_len );
            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
            CHECK_STATUS(status);
            vsi_nn_kernel_scalar_release( &backend_params[2] );
        }
        else
        {
            status = VSI_FAILURE;
        }
    }
final:
    if (rs_input)
    {
        vsi_nn_kernel_tensor_release( &rs_input );
    }
    if (rs_output)
    {
        vsi_nn_kernel_tensor_release( &rs_output );
    }
    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CPU( sequence_mask, _setup )
@ -0,0 +1,246 @@
/****************************************************************************
*
*    Copyright (c) 2020 Vivante Corporation
*
*    Permission is hereby granted, free of charge, to any person obtaining a
*    copy of this software and associated documentation files (the "Software"),
*    to deal in the Software without restriction, including without limitation
*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
*    and/or sell copies of the Software, and to permit persons to whom the
*    Software is furnished to do so, subject to the following conditions:
*
*    The above copyright notice and this permission notice shall be included in
*    all copies or substantial portions of the Software.
*
*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
*    DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
#define _INPUT_NUM          (2)
#define _OUTPUT_NUM         (1)
#define _KERNEL_NAME        CVIVANTE_NAMESPACE("cpu.slice")


/*
 * Kernel params
 */
static vx_param_description_t _slice_kernel_param_def[] =
{
    {VX_INPUT,  VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _SLICE_PARAM_NUM  _cnt_of_array( _slice_kernel_param_def )


/*
 * Kernel function
 */
DEF_KERNEL_EXECUTOR(_compute)
    (
    vsi_nn_kernel_node_t                node,
    const vsi_nn_kernel_node_param_t  * param,
    size_t                              param_size
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
    vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
    float *f32_in_buffer[_INPUT_NUM] = {NULL};
    float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
    vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
    vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
    size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
    size_t out_elements[_OUTPUT_NUM] = {0};
    size_t out_bytes[_OUTPUT_NUM] = {0};
    int32_t rank = 0;
    int32_t i = 0;
    int32_t in_w = 0;
    int32_t in_h = 0;
    int32_t in_c = 0;
    int32_t in_b = 0;
    int32_t start[4] = {0};
    int32_t stop[4] = {0};
    int32_t in_size[4] = {1, 1, 1, 1};
    int32_t out_size[4] = {1, 1, 1, 1};
    float *input_ptr = NULL;
    float *output_ptr = NULL;
    int32_t dstIdx = 0;

    /* prepare data */
    for (i = 0; i < _INPUT_NUM; i ++)
    {
        input[i] = (vsi_nn_kernel_tensor_t)param[i];
        in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
        f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
        CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
    }

    for (i = 0; i < _OUTPUT_NUM; i ++)
    {
        output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
        out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
        vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
        out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
        out_bytes[i] = out_elements[i] * sizeof(float);
        f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
        CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
        memset( f32_out_buffer[i], 0, out_bytes[i] );
    }

    rank = (int32_t)out_attr[0]->shape->size;

    for (i = 0; i < rank; i++)
    {
        in_size[i] = in_attr[0]->shape->data[i];
        out_size[i] = out_attr[0]->shape->data[i];
    }

    start[0] = (int32_t)f32_in_buffer[1][0];
    stop[0]  = start[0] + out_attr[0]->shape->data[0];
    start[1] = rank < 2 ? 0 : (int32_t)f32_in_buffer[1][1];
    stop[1]  = rank < 2 ? 1 : start[1] + out_size[1];
    start[2] = rank < 3 ? 0 : (int32_t)f32_in_buffer[1][2];
    stop[2]  = rank < 3 ? 1 : start[2] + out_size[2];
    start[3] = rank < 4 ? 0 : (int32_t)f32_in_buffer[1][3];
    stop[3]  = rank < 4 ? 1 : start[3] + out_size[3];
    input_ptr = f32_in_buffer[0];
    output_ptr = f32_out_buffer[0];

    for (in_b = start[3]; in_b < stop[3]; ++in_b)
    {
        for (in_c = start[2]; in_c < stop[2]; ++in_c)
        {
            for (in_h = start[1]; in_h < stop[1]; ++in_h)
            {
                for (in_w = start[0]; in_w < stop[0]; ++in_w)
                {
                    int32_t srcIdx = ((in_b * in_size[2] + in_c) * in_size[1] + in_h) * in_size[0] + in_w;
                    output_ptr[dstIdx ++] = input_ptr[srcIdx];
                }
            }
        }
    }

    /* save data */
    for (i = 0; i < _OUTPUT_NUM; i++)
    {
        status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
                f32_out_buffer[i], out_elements[i] );
        CHECK_STATUS_FAIL_GOTO( status, final );
    }

final:
    for (i = 0; i < _INPUT_NUM; i++)
    {
        if (f32_in_buffer[i])
        {
            free(f32_in_buffer[i]);
            f32_in_buffer[i] = NULL;
        }
        if (in_attr[i])
        {
            vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
        }
    }
    for (i = 0; i < _OUTPUT_NUM; i++)
    {
        if (f32_out_buffer[i])
        {
            free(f32_out_buffer[i]);
            f32_out_buffer[i] = NULL;
        }
        if (out_attr[i])
        {
            vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
        }
    }

    return status;
} /* _compute() */
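
/*
 * Editor's note (illustrative, not part of the commit): the copy loop above
 * walks the 4-D window [start, stop) and flattens each coordinate with
 * srcIdx = ((b * C + c) * H + h) * W + w, i.e. w is the fastest-moving axis.
 * For in_size = {W=4, H=3, C=2} the element (b=0, c=1, h=2, w=3) lands at
 * ((0 * 2 + 1) * 3 + 2) * 4 + 3 = 23.
 */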


/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs
    /* Add extra params */
    )
{
    vsi_status status = VSI_FAILURE;
    snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
    kernel->info.function    = _compute;
    kernel->info.parameters  = _slice_kernel_param_def;
    kernel->info.numParams   = _cnt_of_array( _slice_kernel_param_def );
    status = VSI_SUCCESS;

    return status;
} /* _query_kernel() */


static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t              * graph,
    vsi_nn_tensor_t            ** inputs,
    size_t                        input_num,
    vsi_nn_tensor_t            ** outputs,
    size_t                        output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t             * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_SLICE_PARAM_NUM];
    vsi_nn_kernel_node_t node = NULL;

    status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _SLICE_PARAM_NUM,
                    inputs, input_num, outputs, output_num );
            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _SLICE_PARAM_NUM );
        }
    }

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CPU( slice, _setup )
@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"

__BEGIN_DECLS

@ -0,0 +1,297 @@
/****************************************************************************
*
*    Copyright (c) 2020 Vivante Corporation
*
*    Permission is hereby granted, free of charge, to any person obtaining a
*    copy of this software and associated documentation files (the "Software"),
*    to deal in the Software without restriction, including without limitation
*    the rights to use, copy, modify, merge, publish, distribute, sublicense,
*    and/or sell copies of the Software, and to permit persons to whom the
*    Software is furnished to do so, subject to the following conditions:
*
*    The above copyright notice and this permission notice shall be included in
*    all copies or substantial portions of the Software.
*
*    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
*    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
*    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
*    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
*    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
*    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
*    DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
#define _INPUT_NUM          (1)
#define _OUTPUT_NUM         (2)
#define _KERNEL_NAME        CVIVANTE_NAMESPACE("cpu.topk")


/*
 * Kernel params
 */
static vx_param_description_t _topk_kernel_param_def[] =
{
    {VX_INPUT,  VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT,  VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    // Add kernel parameters here
};
#define _TOPK_PARAM_NUM  _cnt_of_array( _topk_kernel_param_def )

static uint32_t _max_comp_func(void* data, int32_t left, int32_t right)
{
    float* fdata = (float*)data;
    if (fdata[left] >= fdata[right])
    {
        return TRUE;
    }
    else
    {
        return FALSE;
    }
}

static void _find_top_k_1d
    (
    float* input,
    uint32_t input_len,
    uint32_t k,
    float* value,
    uint32_t* indices
    )
{
    int32_t low = 0;
    int32_t high = input_len - 1;
    int32_t j;

    for (j = 0; j < (int32_t)input_len; j++)
    {
        indices[j] = j;
    }

    j = vsi_nn_partition(input, low, high, _max_comp_func, FALSE, indices);

    //part_sort
    while (j != (int32_t)k)
    {
        if ((int32_t)k > j)
        {
            low = j + 1;
        }
        else
        {
            high = j;
        }
        j = vsi_nn_partition(input, low, high, _max_comp_func, FALSE, indices);
    }
    //all_sort
    vsi_nn_partition(input, 0, k - 1, _max_comp_func, TRUE, indices);

    for (j = 0; j < (int32_t)k; j++)
    {
        value[j] = input[indices[j]];
    }
}
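
/*
 * Editor's note (illustrative, not part of the commit): the loop above is a
 * quickselect. Each vsi_nn_partition call places one pivot at its final
 * descending-order position j (permuting the index array), and the search
 * window [low, high] is narrowed toward k until the pivot lands exactly at k;
 * at that point the k largest elements occupy indices[0..k-1] in some order.
 * The final call with the sort flag set to TRUE then fully sorts that prefix,
 * so only the top-k slice is ever sorted rather than all input_len elements.
 */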

/*
 * Kernel function
 */
DEF_KERNEL_EXECUTOR(_compute)
    (
    vsi_nn_kernel_node_t                node,
    const vsi_nn_kernel_node_param_t  * param,
    size_t                              param_size
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
    vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
    float *f32_in_buffer[_INPUT_NUM] = {NULL};
    float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
    vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
    vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
    size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
    size_t out_elements[_OUTPUT_NUM] = {0};
    size_t out_bytes[_OUTPUT_NUM] = {0};
    uint32_t i = 0;
    int32_t j = 0;
    int32_t top_k = 0;
    uint32_t block_num = 0;
    uint32_t block_size = 0;
    uint32_t * indices_ptr = NULL;

    /* prepare data */
    for (i = 0; i < _INPUT_NUM; i ++)
    {
        input[i] = (vsi_nn_kernel_tensor_t)param[i];
        in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
        f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
        CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
    }

    for (i = 0; i < _OUTPUT_NUM; i ++)
    {
        output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
        out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
        vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
        out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
        out_bytes[i] = out_elements[i] * sizeof(float);
        f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
        CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
        memset( f32_out_buffer[i], 0, out_bytes[i] );
    }

    status = vsi_nn_kernel_scalar_read_int32( param[3], &top_k );
    CHECK_STATUS_FAIL_GOTO(status, final );

    block_num = in_attr[0]->shape->data[1];
    block_size = in_attr[0]->shape->data[0];
    indices_ptr = (uint32_t*)malloc(block_size * sizeof(uint32_t));
    CHECK_PTR_FAIL_GOTO( indices_ptr, "Create indices buffer fail.", final );

    for (i = 0; i < block_num; i++)
    {
        uint32_t in_index = i * block_size;
        uint32_t out_index = i * top_k;
        _find_top_k_1d(&(f32_in_buffer[0][in_index]),
                block_size, top_k, &(f32_out_buffer[0][out_index]), indices_ptr);

        for (j = 0; j < top_k; j++)
        {
            f32_out_buffer[1][out_index + j] = (float)indices_ptr[j];
        }
    }
    // Handle the 1D input
    if (!block_num)
    {
        _find_top_k_1d(&(f32_in_buffer[0][0]),
                block_size, top_k, &(f32_out_buffer[0][0]), indices_ptr);
        for (j = 0; j < top_k; j++)
        {
            f32_out_buffer[1][j] = (float)indices_ptr[j];
        }
    }

    /* save data */
    for (i = 0; i < _OUTPUT_NUM; i++)
    {
        status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
                f32_out_buffer[i], out_elements[i] );
        CHECK_STATUS_FAIL_GOTO( status, final );
    }

final:
    vsi_nn_safe_free(indices_ptr);
    for (i = 0; i < _INPUT_NUM; i++)
    {
        if (f32_in_buffer[i])
        {
            free(f32_in_buffer[i]);
            f32_in_buffer[i] = NULL;
        }
        if (in_attr[i])
        {
            vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
        }
    }
    for (i = 0; i < _OUTPUT_NUM; i++)
    {
        if (f32_out_buffer[i])
        {
            free(f32_out_buffer[i]);
            f32_out_buffer[i] = NULL;
        }
        if (out_attr[i])
        {
            vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
        }
    }

    return status;
} /* _compute() */
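
/*
 * Editor's note (illustrative, not part of the commit): the input is treated
 * as block_num independent rows of block_size elements (shape data[1] rows of
 * data[0] columns); row i is read at flat offset i * block_size and its k
 * results are written at offset i * top_k in both the value and index outputs.
 */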


/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs
    /* Add extra params */
    )
{
    vsi_status status = VSI_FAILURE;
    snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
    kernel->info.function    = _compute;
    kernel->info.parameters  = _topk_kernel_param_def;
    kernel->info.numParams   = _cnt_of_array( _topk_kernel_param_def );
    status = VSI_SUCCESS;

    return status;
} /* _query_kernel() */


static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t              * graph,
    vsi_nn_tensor_t            ** inputs,
    size_t                        input_num,
    vsi_nn_tensor_t            ** outputs,
    size_t                        output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t             * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_TOPK_PARAM_NUM];
    vsi_nn_kernel_node_t node = NULL;
    int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k");

    status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _TOPK_PARAM_NUM,
                    inputs, input_num, outputs, output_num );
            node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &top_k );
            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _TOPK_PARAM_NUM );

            vsi_nn_kernel_scalar_release( &node_params[3] );
        }
    }

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_CPU( topk, _setup )
@ -44,23 +44,26 @@ typedef enum _internal_img_dim_e
    IMAGE_2D,
} internal_img_dim_e;

#define _BATCH_NORM_KERNEL_SOURCE      "batchnorm_single"
#define SOURCE0      "batchnorm_single"
#define SOURCE1      "batchnorm_single_f32"

#define STR(a) #a

// Add kernel hashtable here
#define BATCH_NORM_HASH_KEY(IN_DTYPE, OUT_DTYPE, BRDCST, _image_2d) \
        ( ( IN_DTYPE << 16 ) | ( OUT_DTYPE << 3) | ( BRDCST << 1) | (_image_2d) )
#define BATCH_NORM_HASH_KEY(IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, _image_2d) \
        ( ( IN_DTYPE << 24 ) | ( GAMMA_TYPE << 16 ) | ( OUT_DTYPE << 3) | ( BRDCST << 1) | (_image_2d) )

#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, BRDCST) \
        { BATCH_NORM_HASH_KEY( IN_DTYPE, OUT_DTYPE, BRDCST, IMAGE), \
          CVIVANTE_NAMESPACE("evis.batch_norm_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_brdcst"#BRDCST), \
          _BATCH_NORM_KERNEL_SOURCE}
#define PACK_KERNEL_MAP( IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, source) \
        { BATCH_NORM_HASH_KEY( IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, IMAGE), \
          CVIVANTE_NAMESPACE("evis.batch_norm_"STR(IN_DTYPE)"_F16_F16_" \
          STR(GAMMA_TYPE)"_F32to"STR(OUT_DTYPE)"_brdcst"#BRDCST), \
          source}

#define PACK_KERNEL_MAP_2D( IN_DTYPE, OUT_DTYPE, BRDCST) \
        { BATCH_NORM_HASH_KEY( IN_DTYPE, OUT_DTYPE, BRDCST, IMAGE_2D), \
          CVIVANTE_NAMESPACE("evis.batch_norm_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_brdcst"#BRDCST"_2D"), \
          _BATCH_NORM_KERNEL_SOURCE}
#define PACK_KERNEL_MAP_2D( IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, source) \
        { BATCH_NORM_HASH_KEY( IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, IMAGE_2D), \
          CVIVANTE_NAMESPACE("evis.batch_norm_"STR(IN_DTYPE)"_F16_F16_" \
          STR(GAMMA_TYPE)"_F32to"STR(OUT_DTYPE)"_brdcst"#BRDCST"_2D"), \
          source}
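
/*
 * Editor's note (illustrative, not part of the commit): the widened hash key
 * gives GAMMA_TYPE its own byte, so the layout becomes
 *   bits 24..31 IN_DTYPE | bits 16..23 GAMMA_TYPE | bits 3..15 OUT_DTYPE |
 *   bit 1 BRDCST | bit 0 image_2d
 * For example, BATCH_NORM_HASH_KEY(a, b, c, 1, 0) expands to
 * (a << 24) | (b << 16) | (c << 3) | 2, which the old two-dtype key had no
 * room to encode.
 */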

typedef struct
{
@ -71,47 +74,89 @@ typedef struct

static const _kernel_map_type _batch_norm_kernel_map[] =
{
    PACK_KERNEL_MAP(F16, F16, 0),
    PACK_KERNEL_MAP(F16, I16, 0),
    PACK_KERNEL_MAP(F16, U8,  0),
    PACK_KERNEL_MAP(F16, I8,  0),
    PACK_KERNEL_MAP(U8,  U8,  0),
    PACK_KERNEL_MAP(U8,  F16, 0),
    PACK_KERNEL_MAP(I8,  I8,  0),
    PACK_KERNEL_MAP(I8,  F16, 0),
    PACK_KERNEL_MAP(I16, I16, 0),
    PACK_KERNEL_MAP(I16, F16, 0),
    PACK_KERNEL_MAP(F16, F16, 1),
    PACK_KERNEL_MAP(F16, I16, 1),
    PACK_KERNEL_MAP(F16, U8,  1),
    PACK_KERNEL_MAP(F16, I8,  1),
    PACK_KERNEL_MAP(U8,  U8,  1),
    PACK_KERNEL_MAP(U8,  F16, 1),
    PACK_KERNEL_MAP(I8,  I8,  1),
    PACK_KERNEL_MAP(I8,  F16, 1),
    PACK_KERNEL_MAP(I16, I16, 1),
    PACK_KERNEL_MAP(I16, F16, 1),
    PACK_KERNEL_MAP(F16, F16, F16, 0, SOURCE0),
    PACK_KERNEL_MAP(F16, F16, I16, 0, SOURCE0),
    PACK_KERNEL_MAP(F16, F16, U8,  0, SOURCE0),
    PACK_KERNEL_MAP(F16, F16, I8,  0, SOURCE0),
    PACK_KERNEL_MAP(U8,  F16, U8,  0, SOURCE0),
    PACK_KERNEL_MAP(U8,  F16, F16, 0, SOURCE0),
    PACK_KERNEL_MAP(I8,  F16, I8,  0, SOURCE0),
    PACK_KERNEL_MAP(I8,  F16, F16, 0, SOURCE0),
    PACK_KERNEL_MAP(I16, F16, I16, 0, SOURCE0),
    PACK_KERNEL_MAP(I16, F16, F16, 0, SOURCE0),
    PACK_KERNEL_MAP(F16, F16, F16, 1, SOURCE0),
    PACK_KERNEL_MAP(F16, F16, I16, 1, SOURCE0),
    PACK_KERNEL_MAP(F16, F16, U8,  1, SOURCE0),
    PACK_KERNEL_MAP(F16, F16, I8,  1, SOURCE0),
    PACK_KERNEL_MAP(U8,  F16, U8,  1, SOURCE0),
    PACK_KERNEL_MAP(U8,  F16, F16, 1, SOURCE0),
    PACK_KERNEL_MAP(I8,  F16, I8,  1, SOURCE0),
    PACK_KERNEL_MAP(I8,  F16, F16, 1, SOURCE0),
    PACK_KERNEL_MAP(I16, F16, I16, 1, SOURCE0),
    PACK_KERNEL_MAP(I16, F16, F16, 1, SOURCE0),

    PACK_KERNEL_MAP_2D(F16, F16, 0),
    PACK_KERNEL_MAP_2D(F16, I16, 0),
    PACK_KERNEL_MAP_2D(F16, U8,  0),
    PACK_KERNEL_MAP_2D(F16, I8,  0),
    PACK_KERNEL_MAP_2D(U8,  U8,  0),
    PACK_KERNEL_MAP_2D(U8,  F16, 0),
    PACK_KERNEL_MAP_2D(I8,  I8,  0),
    PACK_KERNEL_MAP_2D(I8,  F16, 0),
    PACK_KERNEL_MAP_2D(I16, I16, 0),
    PACK_KERNEL_MAP_2D(I16, F16, 0),
    PACK_KERNEL_MAP_2D(F16, F16, 1),
    PACK_KERNEL_MAP_2D(F16, I16, 1),
    PACK_KERNEL_MAP_2D(F16, U8,  1),
    PACK_KERNEL_MAP_2D(F16, I8,  1),
    PACK_KERNEL_MAP_2D(U8,  U8,  1),
    PACK_KERNEL_MAP_2D(U8,  F16, 1),
    PACK_KERNEL_MAP_2D(I8,  I8,  1),
    PACK_KERNEL_MAP_2D(I8,  F16, 1),
    PACK_KERNEL_MAP_2D(I16, I16, 1),
    PACK_KERNEL_MAP_2D(I16, F16, 1),
    PACK_KERNEL_MAP(F16, F32, F16, 0, SOURCE1),
    PACK_KERNEL_MAP(F16, F32, I16, 0, SOURCE1),
    PACK_KERNEL_MAP(F16, F32, U8,  0, SOURCE1),
    PACK_KERNEL_MAP(F16, F32, I8,  0, SOURCE1),
    PACK_KERNEL_MAP(U8,  F32, U8,  0, SOURCE1),
    PACK_KERNEL_MAP(U8,  F32, F16, 0, SOURCE1),
    PACK_KERNEL_MAP(I8,  F32, I8,  0, SOURCE1),
    PACK_KERNEL_MAP(I8,  F32, F16, 0, SOURCE1),
    PACK_KERNEL_MAP(I16, F32, I16, 0, SOURCE1),
    PACK_KERNEL_MAP(I16, F32, F16, 0, SOURCE1),
    PACK_KERNEL_MAP(F16, F32, F16, 1, SOURCE1),
    PACK_KERNEL_MAP(F16, F32, I16, 1, SOURCE1),
    PACK_KERNEL_MAP(F16, F32, U8,  1, SOURCE1),
    PACK_KERNEL_MAP(F16, F32, I8,  1, SOURCE1),
    PACK_KERNEL_MAP(U8,  F32, U8,  1, SOURCE1),
    PACK_KERNEL_MAP(U8,  F32, F16, 1, SOURCE1),
    PACK_KERNEL_MAP(I8,  F32, I8,  1, SOURCE1),
    PACK_KERNEL_MAP(I8,  F32, F16, 1, SOURCE1),
    PACK_KERNEL_MAP(I16, F32, I16, 1, SOURCE1),
    PACK_KERNEL_MAP(I16, F32, F16, 1, SOURCE1),

    PACK_KERNEL_MAP_2D(F16, F16, F16, 0, SOURCE0),
    PACK_KERNEL_MAP_2D(F16, F16, I16, 0, SOURCE0),
    PACK_KERNEL_MAP_2D(F16, F16, U8,  0, SOURCE0),
    PACK_KERNEL_MAP_2D(F16, F16, I8,  0, SOURCE0),
    PACK_KERNEL_MAP_2D(U8,  F16, U8,  0, SOURCE0),
    PACK_KERNEL_MAP_2D(U8,  F16, F16, 0, SOURCE0),
    PACK_KERNEL_MAP_2D(I8,  F16, I8,  0, SOURCE0),
    PACK_KERNEL_MAP_2D(I8,  F16, F16, 0, SOURCE0),
    PACK_KERNEL_MAP_2D(I16, F16, I16, 0, SOURCE0),
    PACK_KERNEL_MAP_2D(I16, F16, F16, 0, SOURCE0),
    PACK_KERNEL_MAP_2D(F16, F16, F16, 1, SOURCE0),
    PACK_KERNEL_MAP_2D(F16, F16, I16, 1, SOURCE0),
    PACK_KERNEL_MAP_2D(F16, F16, U8,  1, SOURCE0),
    PACK_KERNEL_MAP_2D(F16, F16, I8,  1, SOURCE0),
    PACK_KERNEL_MAP_2D(U8,  F16, U8,  1, SOURCE0),
    PACK_KERNEL_MAP_2D(U8,  F16, F16, 1, SOURCE0),
    PACK_KERNEL_MAP_2D(I8,  F16, I8,  1, SOURCE0),
    PACK_KERNEL_MAP_2D(I8,  F16, F16, 1, SOURCE0),
    PACK_KERNEL_MAP_2D(I16, F16, I16, 1, SOURCE0),
    PACK_KERNEL_MAP_2D(I16, F16, F16, 1, SOURCE0),

    PACK_KERNEL_MAP_2D(F16, F32, F16, 0, SOURCE1),
    PACK_KERNEL_MAP_2D(F16, F32, I16, 0, SOURCE1),
    PACK_KERNEL_MAP_2D(F16, F32, U8,  0, SOURCE1),
    PACK_KERNEL_MAP_2D(F16, F32, I8,  0, SOURCE1),
    PACK_KERNEL_MAP_2D(U8,  F32, U8,  0, SOURCE1),
    PACK_KERNEL_MAP_2D(U8,  F32, F16, 0, SOURCE1),
    PACK_KERNEL_MAP_2D(I8,  F32, I8,  0, SOURCE1),
    PACK_KERNEL_MAP_2D(I8,  F32, F16, 0, SOURCE1),
    PACK_KERNEL_MAP_2D(I16, F32, I16, 0, SOURCE1),
    PACK_KERNEL_MAP_2D(I16, F32, F16, 0, SOURCE1),
    PACK_KERNEL_MAP_2D(F16, F32, F16, 1, SOURCE1),
    PACK_KERNEL_MAP_2D(F16, F32, I16, 1, SOURCE1),
    PACK_KERNEL_MAP_2D(F16, F32, U8,  1, SOURCE1),
    PACK_KERNEL_MAP_2D(F16, F32, I8,  1, SOURCE1),
    PACK_KERNEL_MAP_2D(U8,  F32, U8,  1, SOURCE1),
    PACK_KERNEL_MAP_2D(U8,  F32, F16, 1, SOURCE1),
    PACK_KERNEL_MAP_2D(I8,  F32, I8,  1, SOURCE1),
    PACK_KERNEL_MAP_2D(I8,  F32, F16, 1, SOURCE1),
    PACK_KERNEL_MAP_2D(I16, F32, I16, 1, SOURCE1),
    PACK_KERNEL_MAP_2D(I16, F32, F16, 1, SOURCE1),
};

/*
@ -329,6 +374,7 @@ static vsi_status _query_kernel
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e in_dtype;
    vsi_nn_kernel_dtype_e gamma_dtype;
    vsi_nn_kernel_dtype_e out_dtype;
    const _kernel_map_type * kernel_map = _batch_norm_kernel_map;
    size_t kernel_map_size = _cnt_of_array( _batch_norm_kernel_map );

@ -340,6 +386,7 @@ static vsi_status _query_kernel
    uint32_t brdcst = 0;

    in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    gamma_dtype = vsi_nn_kernel_map_dtype( inputs[3]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    if (inputs[BATCHNORM_INPUT]->attr.size[0] != 1 && inputs[BATCHNORM_INPUT_BETA]->attr.size[0] == 1)

@ -347,7 +394,7 @@ static vsi_status _query_kernel
        brdcst = 1;
    }

    key = BATCH_NORM_HASH_KEY(in_dtype, out_dtype, brdcst, image_2d);
    key = BATCH_NORM_HASH_KEY(in_dtype, gamma_dtype, out_dtype, brdcst, image_2d);

    for( i = 0; i < kernel_map_size; i ++ )
    {

@ -397,7 +444,6 @@ static vsi_nn_kernel_node_t _setup
    if ( (inputs[1]->attr.is_const && inputs[2]->attr.is_const)
      || (inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16)
      || (inputs[2]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16)
      || (inputs[3]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16)
      || (inputs[4]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32) )
    {
        return NULL;

@ -241,6 +241,7 @@ static vsi_status _query_kernel
    uint32_t i;

    in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    in_dtype = in_dtype == BOOL8 ? I8 : in_dtype;
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    key = CAST_HASH_KEY( in_dtype, out_dtype, image_2d );

@ -455,6 +455,7 @@ static vsi_status _query_kernel
    input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
    output_dtype = output_dtype == I8 ? BOOL8 : output_dtype;
    key = HASH_COMPARISONS_KEY( operation, input0_dtype, input1_dtype, output_dtype, image_2d );

    for( i = 0; i < _cnt_of_array(_comparisons_evis_kernel_map); i ++ )

@ -0,0 +1,702 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
typedef enum
{
    NORMAL = 0,
    K3_S1,
    K3_S1_D2_D4,
    K1024_SMALL,
    K1024_LARGE,
} _internal_kernel_e;

#define _CONV1D_OVXLIB_KERNEL_SOURCE "conv1d_ovxlib"
#define _CONV1D_OVXLIB_KERNEL_SOURCE_K1024 "conv1d_ovxlib_k1024"

#define STR(a) #a
// Add kernel hashtable here
#define CONV1D_OVXLIB_HASH_KEY( IN_DTYPE, W_DTYPE, B_DTYPE, OUT_DTYPE, KERNEL_TYPE ) \
        (( KERNEL_TYPE << 24 ) | ( IN_DTYPE << 18 ) | ( W_DTYPE << 12 ) | ( B_DTYPE << 6 ) | ( OUT_DTYPE ))
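/* The four dtypes occupy 6-bit fields (bits 0..23) and the _internal_kernel_e
 * variant sits at bit 24 and above, so every (dtype combination, variant)
 * pair hashes to a distinct key, assuming each vsi_nn_kernel_dtype_e value
 * fits in its 6-bit field. */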
#define PACK_KERNEL_MAP( IN_DTYPE, W_DTYPE, B_DTYPE, OUT_DTYPE, KERNEL_TYPE, SOURCE ) \
        { CONV1D_OVXLIB_HASH_KEY( IN_DTYPE, W_DTYPE, B_DTYPE, OUT_DTYPE, KERNEL_TYPE ), \
          CVIVANTE_NAMESPACE(\
          "evis.conv1d_"STR(IN_DTYPE)STR(W_DTYPE)STR(B_DTYPE)"to"STR(OUT_DTYPE)"_"STR(KERNEL_TYPE)), \
          SOURCE }

typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _conv1d_ovxlib_kernel_map[] =
{
    // Register kernel here
    PACK_KERNEL_MAP( U8, U8, I32, U8, K3_S1, _CONV1D_OVXLIB_KERNEL_SOURCE ),
    PACK_KERNEL_MAP( U8, U8, I32, U8, K3_S1_D2_D4, _CONV1D_OVXLIB_KERNEL_SOURCE ),
    PACK_KERNEL_MAP( U8, U8, I32, U8, K1024_SMALL, _CONV1D_OVXLIB_KERNEL_SOURCE_K1024 ),
    PACK_KERNEL_MAP( U8, U8, I32, U8, K1024_LARGE, _CONV1D_OVXLIB_KERNEL_SOURCE_K1024 ),
};


/*
 * Kernel params
 */
static vx_param_description_t _conv1d_ovxlib_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CONV1D_OVXLIB_PARAM_NUM  _cnt_of_array( _conv1d_ovxlib_kernel_param_def )

/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_conv1d_ovxlib_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };
    vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
    vsi_nn_kernel_tensor_attr_t * weights_attr = NULL;
    vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
    vsi_int_array_t * in_shape = NULL;
    vsi_int_array_t * out_shape = NULL;
    vsi_int_array_t * weight_shape = NULL;
    float scaleIn = 1.0f;
    float scaleOut = 1.0f;
    float scaleWeights = 1.0f;
    int32_t input_ZP = 0;
    int32_t weight_ZP = 0;
    float output_ZP = 0;
    int32_t stride = 1;
    int32_t dilation = 0;
    int32_t input_height = 0;
    int32_t input_width = 0;
    int32_t output_width = 0;

    input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );

    weights_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
    CHECK_PTR_FAIL_GOTO( weights_attr, "Create tensor attr buffer fail.", final );

    output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
    CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );

    vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &(stride));
    vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &(dilation));

    in_shape = input_attr->shape;
    out_shape = output_attr->shape;
    weight_shape = weights_attr->shape;

    if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
    {
        input_ZP = input_attr->asymm.zero_point;
        scaleIn = input_attr->asymm.scale;
    }

    if ( VSI_NN_KERNEL_QUANT_ASYMM == weights_attr->quant )
    {
        weight_ZP = weights_attr->asymm.zero_point;
        scaleWeights = weights_attr->asymm.scale;
    }

    if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
    {
        output_ZP = (float)output_attr->asymm.zero_point;
        scaleOut = output_attr->asymm.scale;
    }

    scaleOut = (scaleIn * scaleWeights) / scaleOut;
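    /* Fold the three quantization scales into a single requantization
     * factor: each integer product carries scaleIn * scaleWeights, so
     * dividing by the real output scale gives the multiplier that maps the
     * accumulator straight into the output tensor's quantized space. */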
    input_height = in_shape->data[1];
    input_width = in_shape->data[0];
    output_width = out_shape->data[0];

    if ((U8 == input_attr->dtype) && (U8 == weights_attr->dtype) && (U8 == output_attr->dtype))
    {
        gpu_dp_inst_t uniSumOrderUchar_2x8 = {{
            0x11111111, // TCfg
            0x11110000, // ASelt
            0x0c080400, 0x0c080400, // ABin
            0x22222222, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000400, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000001, 0x00000001, 0x00000001,
            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
        }, GPU_DP_TYPE_16 };

        if ( (3 == weight_shape->data[0]) && (1 == stride) )
        {
            gpu_dp_inst_t uniConv1DK3_Lo0_4x4 = {{
                0x69696969, // TCfg
                0x44444444, // ASelt
                0x41014000, 0x43034202, // ABin
                0x55555555, // BSelt
                0x55405540, 0x55405540, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniConv1DK3_Lo1_4x4 = {{
                0x69696969, // TCfg
                0x44444444, // ASelt
                0x41114010, 0x43134212, // ABin
                0x55555555, // BSelt
                0x55415541, 0x55415541, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniConv1DK3_Lo2_4x4 = {{
                0x69696969, // TCfg
                0x44444444, // ASelt
                0x41214020, 0x43234222, // ABin
                0x55555555, // BSelt
                0x55425542, 0x55425542, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniConv1DK3_Hi0_4x4 = {{
                0x69696969, // TCfg
                0x44444444, // ASelt
                0x45054404, 0x47074606, // ABin
                0x55555555, // BSelt
                0x55405540, 0x55405540, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniConv1DK3_Hi1_4x4 = {{
                0x69696969, // TCfg
                0x44444444, // ASelt
                0x45154414, 0x47174616, // ABin
                0x55555555, // BSelt
                0x55415541, 0x55415541, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniConv1DK3_Hi2_4x4 = {{
                0x69696969, // TCfg
                0x44444444, // ASelt
                0x45254424, 0x47274626, // ABin
                0x55555555, // BSelt
                0x55425542, 0x55425542, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };
            gpu_dp_inst_t uniDataConvK3_2x8 = {{
                0x00111111, // TCfg
                0x00110000, // ASelt
                0x03020100, 0x00000504, // ABin
                0x00222222, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000001, 0x00000001, 0x00000001, 0x00000001,
                0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16 };

            uint32_t conv1dK3D2_Lo1[4] = {0x43134212, 0x45154414, 0x55415541, 0x55415541};
            uint32_t conv1dK3D2_Lo2[4] = {0x45254424, 0x47274626, 0x55425542, 0x55425542};
            uint32_t conv1dK3D2_Hi1[4] = {0x47174616, 0x49194818, 0x55415541, 0x55415541};
            uint32_t conv1dK3D2_Hi2[4] = {0x49294828, 0x4b2b4a2a, 0x55425542, 0x55425542};
            uint32_t conv1dK3D4_Lo1[4] = {0x45154414, 0x47174616, 0x55415541, 0x55415541};
            uint32_t conv1dK3D4_Lo2[4] = {0x49294828, 0x4b2b4a2a, 0x55425542, 0x55425542};
            uint32_t conv1dK3D4_Hi1[4] = {0x49194818, 0x4b1b4a1a, 0x55415541, 0x55415541};
            uint32_t conv1dK3D4_Hi2[4] = {0x4d2d4c2c, 0x4f2f4e2e, 0x55425542, 0x55425542};
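            /* Each table supplies replacement ABin words (data[2], data[3])
             * and BBin words (data[5], data[6]) for one DP instruction.
             * Patching only these lane selectors re-points which input
             * samples the 3-tap MACs read, which is how a single shader
             * source also covers dilation 2 and dilation 4. */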
            if (2 == dilation)
            {
                uniConv1DK3_Lo1_4x4.data[2] = conv1dK3D2_Lo1[0];
                uniConv1DK3_Lo1_4x4.data[3] = conv1dK3D2_Lo1[1];
                uniConv1DK3_Lo1_4x4.data[5] = conv1dK3D2_Lo1[2];
                uniConv1DK3_Lo1_4x4.data[6] = conv1dK3D2_Lo1[3];
                uniConv1DK3_Lo2_4x4.data[2] = conv1dK3D2_Lo2[0];
                uniConv1DK3_Lo2_4x4.data[3] = conv1dK3D2_Lo2[1];
                uniConv1DK3_Lo2_4x4.data[5] = conv1dK3D2_Lo2[2];
                uniConv1DK3_Lo2_4x4.data[6] = conv1dK3D2_Lo2[3];
                uniConv1DK3_Hi1_4x4.data[2] = conv1dK3D2_Hi1[0];
                uniConv1DK3_Hi1_4x4.data[3] = conv1dK3D2_Hi1[1];
                uniConv1DK3_Hi1_4x4.data[5] = conv1dK3D2_Hi1[2];
                uniConv1DK3_Hi1_4x4.data[6] = conv1dK3D2_Hi1[3];
                uniConv1DK3_Hi2_4x4.data[2] = conv1dK3D2_Hi2[0];
                uniConv1DK3_Hi2_4x4.data[3] = conv1dK3D2_Hi2[1];
                uniConv1DK3_Hi2_4x4.data[5] = conv1dK3D2_Hi2[2];
                uniConv1DK3_Hi2_4x4.data[6] = conv1dK3D2_Hi2[3];
            }
            else if (4 == dilation)
            {
                uniConv1DK3_Lo1_4x4.data[2] = conv1dK3D4_Lo1[0];
                uniConv1DK3_Lo1_4x4.data[3] = conv1dK3D4_Lo1[1];
                uniConv1DK3_Lo1_4x4.data[5] = conv1dK3D4_Lo1[2];
                uniConv1DK3_Lo1_4x4.data[6] = conv1dK3D4_Lo1[3];
                uniConv1DK3_Lo2_4x4.data[2] = conv1dK3D4_Lo2[0];
                uniConv1DK3_Lo2_4x4.data[3] = conv1dK3D4_Lo2[1];
                uniConv1DK3_Lo2_4x4.data[5] = conv1dK3D4_Lo2[2];
                uniConv1DK3_Lo2_4x4.data[6] = conv1dK3D4_Lo2[3];
                uniConv1DK3_Hi1_4x4.data[2] = conv1dK3D4_Hi1[0];
                uniConv1DK3_Hi1_4x4.data[3] = conv1dK3D4_Hi1[1];
                uniConv1DK3_Hi1_4x4.data[5] = conv1dK3D4_Hi1[2];
                uniConv1DK3_Hi1_4x4.data[6] = conv1dK3D4_Hi1[3];
                uniConv1DK3_Hi2_4x4.data[2] = conv1dK3D4_Hi2[0];
                uniConv1DK3_Hi2_4x4.data[3] = conv1dK3D4_Hi2[1];
                uniConv1DK3_Hi2_4x4.data[5] = conv1dK3D4_Hi2[2];
                uniConv1DK3_Hi2_4x4.data[6] = conv1dK3D4_Hi2[3];
            }


            status = vsi_nn_kernel_gpu_add_param( node,
                "uniConv1DK3_Lo0_4x4", &uniConv1DK3_Lo0_4x4 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniConv1DK3_Hi0_4x4", &uniConv1DK3_Hi0_4x4 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniConv1DK3_Lo1_4x4", &uniConv1DK3_Lo1_4x4 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniConv1DK3_Lo2_4x4", &uniConv1DK3_Lo2_4x4 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniConv1DK3_Hi1_4x4", &uniConv1DK3_Hi1_4x4 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniConv1DK3_Hi2_4x4", &uniConv1DK3_Hi2_4x4 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniDataConvK3_2x8", &uniDataConvK3_2x8 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniSumOrderUchar_2x8", &uniSumOrderUchar_2x8 );
            status |= vsi_nn_kernel_gpu_add_param( node, "input_ZP", &input_ZP);
            CHECK_STATUS_FAIL_GOTO(status, final );
        }
        else if ( (1024 == weight_shape->data[0]) && (1 == stride) )
        {
            gpu_dp_inst_t uniU8SubZp_lo_2x8= {{
                0x99999999, // TCfg
                0x44444444, // ASelt
                0x03020100, 0x07060504, // ABin
                0xaaaaaaaa, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000600, // AccumType, ConstantType, and PostShift
                0x00010001, 0x00010001, 0x00010001, 0x00010001,
                0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
            }, GPU_DP_TYPE_16};
            gpu_dp_inst_t uniU8SubZp_hi_2x8= {{
                0x99999999, // TCfg
                0x44444444, // ASelt
                0x0b0a0908, 0x0f0e0d0c, // ABin
                0xaaaaaaaa, // BSelt
                0x00000000, 0x00000000, // BBin
                0x00000600, // AccumType, ConstantType, and PostShift
                0x00010001, 0x00010001, 0x00010001, 0x00010001,
                0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
            }, GPU_DP_TYPE_16};
            gpu_dp_inst_t uniU8Conv1d_part0_8x2= {{
                0x55555555, // TCfg
                0x00000000, // ASelt
                0x76543210, 0x87654321, // ABin
                0x55555555, // BSelt
                0x76543210, 0x76543210, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16};
            gpu_dp_inst_t uniU8Conv1d_part1_8x2= {{
                0x55555555, // TCfg
                0x00000000, // ASelt
                0x98765432, 0xa9876543, // ABin
                0x55555555, // BSelt
                0x76543210, 0x76543210, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16};
            gpu_dp_inst_t uniU8Conv1d_part2_8x2= {{
                0x55555555, // TCfg
                0x00000000, // ASelt
                0xba987654, 0xcba98765, // ABin
                0x55555555, // BSelt
                0x76543210, 0x76543210, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16};
            gpu_dp_inst_t uniU8Conv1d_part3_8x2= {{
                0x55555555, // TCfg
                0x00000000, // ASelt
                0xdcba9876, 0xedcba987, // ABin
                0x55555555, // BSelt
                0x76543210, 0x76543210, // BBin
                0x00000400, // AccumType, ConstantType, and PostShift
                0x00000000, 0x00000000, 0x00000000, 0x00000000,
                0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
            }, GPU_DP_TYPE_16};
            int32_t kernel_cnt_x16 = (weight_shape->data[0] + 15) / 16;
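            /* (width + 15) / 16 is a ceiling division: the number of 16-tap
             * groups needed to cover the 1024-tap kernel, passed to the
             * shader as its loop count. */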
            status = vsi_nn_kernel_gpu_add_param( node,
                "kernel_cnt_x16", &kernel_cnt_x16 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniU8SubZp_lo_2x8", &uniU8SubZp_lo_2x8 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniU8SubZp_hi_2x8", &uniU8SubZp_hi_2x8 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniU8Conv1d_part0_8x2", &uniU8Conv1d_part0_8x2 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniU8Conv1d_part1_8x2", &uniU8Conv1d_part1_8x2 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniU8Conv1d_part2_8x2", &uniU8Conv1d_part2_8x2 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniU8Conv1d_part3_8x2", &uniU8Conv1d_part3_8x2 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniSumOrderUchar_2x8", &uniSumOrderUchar_2x8 );
            if (input_width >= GPU_TENSOR_MAX_WIDTH)
            {
                status |= vsi_nn_kernel_gpu_add_param( node, "input_width", &input_width);
                status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &output_width);
            }
            CHECK_STATUS_FAIL_GOTO(status, final );
        }

        status = vsi_nn_kernel_gpu_add_param( node, "weight_ZP", &weight_ZP);
        status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &output_ZP);
        status |= vsi_nn_kernel_gpu_add_param( node, "scaleOut", &scaleOut);
        status |= vsi_nn_kernel_gpu_add_param( node, "input_height", &input_height);
        CHECK_STATUS_FAIL_GOTO(status, final );
    }

    gpu_param.global_scale[0] = 8;
    gpu_param.global_scale[1] = 1;
    gpu_param.dim = 2;
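    /* One work-item covers 8 output columns, so the global sizes below are
     * ceiling divisions of each output extent by its global_scale. */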
    gpu_param.global_size[0] = (
        (out_shape->data[0] + gpu_param.global_scale[0] - 1)
        / gpu_param.global_scale[0]);
    gpu_param.global_size[1] = (
        (out_shape->data[1] + gpu_param.global_scale[1] - 1)
        / gpu_param.global_scale[1]);
    status = vsi_nn_kernel_gpu_config( node, &gpu_param );

final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
    SAFE_FREE_TENSOR_ATTR(input_attr);
    SAFE_FREE_TENSOR_ATTR(weights_attr);
    SAFE_FREE_TENSOR_ATTR(output_attr);

    return status;
} /* _conv1d_ovxlib_initializer() */



/*
 * Query kernel
 */
static vsi_status _query_kernel
    (
    vsi_nn_kernel_t * kernel,
    vsi_nn_tensor_t * const * const inputs,
    vsi_nn_tensor_t * const * const outputs,
    _internal_kernel_e kernel_type
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_dtype_e in_dtype;
    vsi_nn_kernel_dtype_e w_dtype;
    vsi_nn_kernel_dtype_e b_dtype;
    vsi_nn_kernel_dtype_e out_dtype;
    const _kernel_map_type * kernel_map = _conv1d_ovxlib_kernel_map;
    size_t kernel_map_size = _cnt_of_array( _conv1d_ovxlib_kernel_map );
    vx_param_description_t * param_def = _conv1d_ovxlib_kernel_param_def;
    size_t param_def_size = _cnt_of_array( _conv1d_ovxlib_kernel_param_def );
    vx_kernel_initialize_f initializer = _conv1d_ovxlib_initializer;
    uint32_t key;
    uint32_t i;

    in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    w_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
    b_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    key = CONV1D_OVXLIB_HASH_KEY( in_dtype, w_dtype, b_dtype, out_dtype, kernel_type );

    for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
    {
        if ( kernel_map[i].key == key )
        {
            break;
        }
    }
    if ( i < (uint32_t)kernel_map_size )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
        kernel->info.parameters = param_def;
        kernel->info.numParams = (uint32_t)param_def_size;
        kernel->info.initialize = initializer;
        // Register code source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
                "vsi_nn_kernel_header",
                kernel_map[i].source_name );
        // Register binary source
        vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
                kernel_map[i].source_name );
        status = VSI_SUCCESS;
    }
    return status;
} /* _query_kernel() */

static vsi_nn_tensor_t* _create_new_bias_tensor
    (
    vsi_nn_graph_t *graph,
    vsi_nn_tensor_t *input,
    vsi_nn_tensor_t *weight,
    vsi_nn_tensor_t *bias
    )
{
    vsi_nn_tensor_t * new_bias = NULL;
    vsi_nn_tensor_attr_t attr;
    int32_t *new_bias_data_ptr = NULL;
    uint8_t *weight_data = NULL;
    int32_t *bias_data = NULL;
    uint32_t i, j;
    memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
    weight_data = vsi_nn_ConvertTensorToData(graph, weight);

    if (bias == NULL)
    {
        memcpy(&attr, &weight->attr, sizeof(vsi_nn_tensor_attr_t));
        attr.dim_num = 2;
        attr.size[0] = weight->attr.size[2];
        attr.size[1] = 1;
        if (weight->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
        {
            attr.dtype.scale = input->attr.dtype.scale * weight->attr.dtype.scale;
            attr.dtype.zero_point = 0;
            attr.dtype.vx_type = VSI_NN_TYPE_INT32;
        }
    }
    else
    {
        memcpy(&attr, &bias->attr, sizeof(vsi_nn_tensor_attr_t));
        if (attr.dim_num == 1)
        {
            attr.size[1] = 1;
            attr.dim_num = 2;
        }
        bias_data = (int32_t *)vsi_nn_ConvertTensorToData(graph, bias);
    }

    new_bias_data_ptr = (int32_t *)malloc(attr.size[0] * sizeof(int32_t));
    memset((void *)new_bias_data_ptr, 0, sizeof(int32_t) * attr.size[0]);

    if (input->attr.dtype.zero_point != 0)
    {
        for (i = 0; i < weight->attr.size[2]; i++)
        {
            uint8_t *weight_ptr = weight_data + i * weight->attr.size[0] * weight->attr.size[1];
            for (j = 0; j < weight->attr.size[0] * weight->attr.size[1]; j++)
            {
                new_bias_data_ptr[i] += -((int32_t)weight_ptr[j] - weight->attr.dtype.zero_point) \
                    * input->attr.dtype.zero_point;
            }
        }
    }
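    /* Standard zero-point folding for asymmetric U8 convolution:
     * new_bias[i] = bias[i] - zp_in * sum_j (w[i][j] - zp_w), which lets the
     * kernel treat the input as if its zero point were already zero. */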

    if (bias_data != NULL)
    {
        for (i = 0; i < attr.size[0]; i++)
        {
            new_bias_data_ptr[i] += bias_data[i];
        }
    }

    new_bias = vsi_nn_CreateTensorFromData(graph, (uint8_t *)new_bias_data_ptr, &attr);

    vsi_nn_safe_free( new_bias_data_ptr );
    vsi_nn_safe_free( bias_data );
    vsi_nn_safe_free( weight_data );

    return new_bias;
}


static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t ** inputs,
    size_t input_num,
    vsi_nn_tensor_t ** outputs,
    size_t output_num,
    const vsi_nn_kernel_param_t * params,
    vsi_nn_kernel_t * kernel
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_CONV1D_OVXLIB_PARAM_NUM];
    vsi_nn_kernel_node_t node = NULL;
    int32_t j = 0;
    _internal_kernel_e kernel_type = NORMAL;

    int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" );
    int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" );
    int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" );
    int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" );
    int32_t overflow_policy = vsi_nn_kernel_param_get_int32( params, "overflow_policy" );
    vsi_nn_tensor_t *in_tensors[3] = {NULL};
    vsi_nn_tensor_t *new_bias = NULL;

    if (VX_CONVERT_POLICY_SATURATE == overflow_policy)
    {
        overflow_policy = 1;
    }
    else
    {
        overflow_policy = 0;
    }

    if ( 1 == stride )
    {
        if ( 3 == inputs[1]->attr.size[0] )
        {
            if (2 == dilation || 4 == dilation)
            {
                kernel_type = K3_S1_D2_D4;
            }
            else
            {
                kernel_type = K3_S1;
            }
        }
        else if ( 1024 == inputs[1]->attr.size[0] )
        {
            if (inputs[0]->attr.size[0] < 65535)
            {
                kernel_type = K1024_SMALL;
            }
            else if (0 == pad_front && 0 == pad_end)
            {
                kernel_type = K1024_LARGE;
            }
            else
            {
                return NULL;
            }
        }
        else
        {
            return NULL;
        }
    }
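    /* Only stride-1 convolutions with a kernel width of 3 or 1024 are
     * specialized here; other shapes either return NULL above or leave
     * kernel_type at NORMAL, which has no kernel-map entry, so
     * _query_kernel() fails and no node is created. */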

    if (1024 == inputs[1]->attr.size[0])
    {
        new_bias = _create_new_bias_tensor(graph, inputs[0], inputs[1], inputs[2]);
        in_tensors[0] = inputs[0];
        in_tensors[1] = inputs[1];
        in_tensors[2] = new_bias;
    }
    else
    {
        in_tensors[0] = inputs[0];
        in_tensors[1] = inputs[1];
        in_tensors[2] = inputs[2];
    }

    status = _query_kernel( kernel, inputs, outputs, kernel_type );
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if ( node )
        {
            if( pad_front != 0 || pad_end != 0)
            {
                // Set default border mode.
                vx_border_t border;
                border.mode = VX_BORDER_CONSTANT;
                border.constant_value.U8 = (uint8_t)(inputs[0]->attr.dtype.zero_point);
                status |= vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
            }

            /* Set inputs and outputs */
            vsi_nn_kernel_node_pack_io( node_params, _CONV1D_OVXLIB_PARAM_NUM,
                    in_tensors, input_num, outputs, output_num );
            j = (int32_t)(input_num + output_num);
            node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &stride );
            node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &pad_front );
            node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &pad_end );
            node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &dilation );
            node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &overflow_policy );
            /* Pass parameters to node. */
            status = vsi_nn_kernel_node_pass_param( node, node_params, _CONV1D_OVXLIB_PARAM_NUM );
            vsi_nn_kernel_scalar_release( &node_params[--j] );
            vsi_nn_kernel_scalar_release( &node_params[--j] );
            vsi_nn_kernel_scalar_release( &node_params[--j] );
            vsi_nn_kernel_scalar_release( &node_params[--j] );
            vsi_nn_kernel_scalar_release( &node_params[--j] );
        }
    }

    if (new_bias)
    {
        vsi_nn_ReleaseTensor(&new_bias);
    }

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_EVIS( conv1d_ovxlib, _setup )

@ -42,28 +42,44 @@ __BEGIN_DECLS
/*
 * Define kernel meta.
 */
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOU8 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toU8")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOI8 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toI8")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOI16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toI16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toF16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toF16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toF16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI8 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI8")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI16 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toF16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOU8 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toU8")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOU8        CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toU8")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOI8        CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toI8")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOI16      CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toI16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOF16      CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toF16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOF16       CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toF16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOF16      CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toF16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI8       CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI8")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI16      CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOF16       CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toF16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOU8       CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toU8")

#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOU8_BLK2   CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toU8_blk2")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOI8_BLK2   CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toI8_blk2")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOI16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toI16_blk2")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOF16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toF16_blk2")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOF16_BLK2  CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toF16_blk2")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOF16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toF16_blk2")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI8_BLK2  CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI8_blk2")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI16_blk2")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOF16_BLK2  CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toF16_blk2")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOU8_BLK2  CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toU8_blk2")

#define KERNEL_SOURCE_1 "depth2space_crd"

// Add kernel hashtable here
#define HASH_DEPTH2SPACE_CRD_KEY(_input0_type, _output_type, _quant_type) \
    ((_input0_type << 24) | (_output_type << 16) | (_quant_type << 8))
#define HASH_DEPTH2SPACE_CRD_KEY(_input0_type, _output_type, _blk_size) \
    ((_input0_type << 24) | (_output_type << 16) | (_blk_size << 8))

#define TENSOR_DEPTH2SPACE_CRD_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_DEPTH2SPACE_CRD_KEY(IN0_TYPE, OUT_TYPE, 0), \
      VX_KERNEL_NAME_DEPTH2SPACE_CRD_##IN0_TYPE##TO##OUT_TYPE, \
      SOURCE },

#define TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_DEPTH2SPACE_CRD_KEY(IN0_TYPE, OUT_TYPE, 1), \
      VX_KERNEL_NAME_DEPTH2SPACE_CRD_##IN0_TYPE##TO##OUT_TYPE##_BLK2, \
      SOURCE },

static const struct {
    uint32_t key;
    char* function_name;

@ -80,6 +96,17 @@ static const struct {
    TENSOR_DEPTH2SPACE_CRD_KERNELS(F16, I16, KERNEL_SOURCE_1)
    TENSOR_DEPTH2SPACE_CRD_KERNELS(U8, F16, KERNEL_SOURCE_1)
    TENSOR_DEPTH2SPACE_CRD_KERNELS(F16, U8, KERNEL_SOURCE_1)

    TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(U8, U8, KERNEL_SOURCE_1)
    TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(I8, I8, KERNEL_SOURCE_1)
    TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(I16, I16, KERNEL_SOURCE_1)
    TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(F16, F16, KERNEL_SOURCE_1)
    TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(I8, F16, KERNEL_SOURCE_1)
    TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(I16, F16, KERNEL_SOURCE_1)
    TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(F16, I8, KERNEL_SOURCE_1)
    TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(F16, I16, KERNEL_SOURCE_1)
    TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(U8, F16, KERNEL_SOURCE_1)
    TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(F16, U8, KERNEL_SOURCE_1)
};

/*

@ -118,9 +145,10 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
    int32_t output_height = 0;
    int32_t output_chn = 0;
    int32_t src0ZP = 0;
    float src0Scale = 0;
    float src0Scale = 1.0f;
    int32_t dstZP = 0;
    float dstScale = 0;
    float dstScale = 1.0f;
    int32_t block_size = 0;

    uint32_t pack_key = 0;

@ -128,12 +156,15 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &block_size);
    CHECK_STATUS_FAIL_GOTO(status, OnError );

    src0ZP = attr[0]->asymm.zero_point;
    src0Scale = attr[0]->asymm.scale;
    dstZP = attr[1]->asymm.zero_point;
    dstScale = attr[1]->asymm.scale;
    if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
    if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        src0ZP = attr[0]->asymm.zero_point;
        src0Scale = attr[0]->asymm.scale;
    }
    else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        if (attr[0]->dfp.fl > 0)
        {

@ -143,27 +174,35 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
        {
            src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
        }
        src0ZP = 0;
    }
    else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
    else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
    {
        src0Scale = 1;
        src0ZP = 0;
    }

    if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
    if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        dstZP = attr[1]->asymm.zero_point;
        dstScale = attr[1]->asymm.scale;
    }
    else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        if (attr[1]->dfp.fl > 0)
        {
            dstScale = (float)((int64_t)1 << attr[1]->dfp.fl);
            dstScale = (1.0f / (float)((int64_t)1 << attr[1]->dfp.fl));
        }
        else
        {
            dstScale = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl));
            dstScale = (float)((int64_t)1 << -attr[1]->dfp.fl);
        }
        dstScale = 1.0f/dstScale;
        dstZP = 0;
    }
    else if( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE )
    else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE )
    {
        dstScale = 1;
        dstZP = 0;
    }
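    /* Note the convention after this change: for DFP outputs dstScale is
     * stored pre-inverted (2^-fl when fl > 0) and the old trailing
     * "dstScale = 1.0f/dstScale" flip is dropped, so consumers multiply by
     * dstScale instead of dividing. */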

    output_dims = (uint32_t)attr[1]->shape->size;

@ -179,6 +218,17 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
    shaderParam.global_size[1] = output_height;
    shaderParam.global_size[2] = output_chn;

    if (block_size == 2)
    {
        shaderParam.global_scale[0] = 16;
        shaderParam.global_scale[1] = 1;
        shaderParam.global_scale[2] = 1;
        shaderParam.global_size[0] = gpu_align_p2((output_width + shaderParam.global_scale[0] - 1)
            / shaderParam.global_scale[0], 4);
        shaderParam.global_size[1] = output_height;
        shaderParam.global_size[2] = output_chn;
    }

    status = vsi_nn_kernel_gpu_config( node, &shaderParam );
    CHECK_STATUS_FAIL_GOTO(status, OnError);

@ -202,6 +252,43 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
        0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
    }, GPU_DP_TYPE_16 };

    gpu_dp_inst_t uniU8MulAndPostShift_ExLo_2x8 = {{
        0xdddddddd, // TCfg
        0x44444444, // ASelt
        0x19111810, 0x1b131a12, // ABin
        0x11111111, // BSelt
        0x00000000, 0x00000000, // BBin
        0x00005600, // AccumType, ConstantType, and PostShift
        0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
    }, GPU_DP_TYPE_16 };
    gpu_dp_inst_t uniU8MulAndPostShift_ExHi_2x8 = {{
        0xdddddddd, // TCfg
        0x44444444, // ASelt
        0x1d151c14, 0x1f171e16, // ABin
        0x11111111, // BSelt
        0x00000000, 0x00000000, // BBin
        0x00002600, // AccumType, ConstantType, and PostShift
        0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
    }, GPU_DP_TYPE_16 };
    gpu_dp_inst_t uniDepth2SpaceF16Blk2_lo_2x8 = {{
        0x11111111, // TCfg
        0x10101010, // ASelt
        0x01010000, 0x03030202, // ABin
        0x22222222, // BSelt
        0x00000000, 0x00000000, // BBin
        0x00000600, // AccumType, ConstantType, and PostShift
        0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
    }, GPU_DP_TYPE_16 };
    gpu_dp_inst_t uniDepth2SpaceF16Blk2_hi_2x8 = {{
        0x11111111, // TCfg
        0x10101010, // ASelt
        0x05050404, 0x07070606, // ABin
        0x22222222, // BSelt
        0x00000000, 0x00000000, // BBin
        0x00000600, // AccumType, ConstantType, and PostShift
        0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
    }, GPU_DP_TYPE_16 };

    switch( pack_key )
    {
        case _PACK_SELECT_KEY( U8, F16):

@ -213,14 +300,25 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
        case _PACK_SELECT_KEY( U8, U8):
        case _PACK_SELECT_KEY( I8, I8):
        case _PACK_SELECT_KEY( I16, I16):
        case _PACK_SELECT_KEY( F16, F16):
        {
            gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift);
            multAndoutZP0[0] = (uint32_t)(M0);
            multAndoutZP0[1] = (uint32_t)((dstZP << postShift) - src0ZP * M0);
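            /* By its use here, gpu_quantize_multiplier_16bit() factors the
             * float ratio as M0 * 2^-postShift, so the affine requantization
             * dst = (src - src0ZP) * ratio + dstZP collapses to a single
             * multiply-add: dst = (src * M0 + multAndoutZP0[1]) >> postShift. */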

            gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift );
            gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_ExLo_2x8, postShift );
            gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_ExHi_2x8, postShift );
            status = vsi_nn_kernel_gpu_add_param( node,
                "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniU8MulAndPostShift_ExLo_2x8", &uniU8MulAndPostShift_ExLo_2x8 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniU8MulAndPostShift_ExHi_2x8", &uniU8MulAndPostShift_ExHi_2x8 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniDepth2SpaceF16Blk2_lo_2x8", &uniDepth2SpaceF16Blk2_lo_2x8 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniDepth2SpaceF16Blk2_hi_2x8", &uniDepth2SpaceF16Blk2_hi_2x8 );
            status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
            CHECK_STATUS_FAIL_GOTO(status, OnError );
        }

@ -256,7 +354,8 @@ static vsi_status _query_kernel
    vsi_nn_tensor_t* const* const inputs,
    vsi_nn_tensor_t* const* const outputs,
    vsi_nn_kernel_t* kernel,
    const vsi_nn_kernel_param_t * params
    const vsi_nn_kernel_param_t * params,
    int32_t blk_flg
    )
{
    vsi_status status = VSI_FAILURE;

@ -268,16 +367,16 @@ static vsi_status _query_kernel
    input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    key = HASH_DEPTH2SPACE_CRD_KEY( input0_dtype, output_dtype, 0 );
    key = HASH_DEPTH2SPACE_CRD_KEY( input0_dtype, output_dtype, blk_flg );

    for( i = 0; i < _cnt_of_array(depth2space_crd_map); i ++ )
    {
        if( depth2space_crd_map[i].key == key )
        if ( depth2space_crd_map[i].key == key )
        {
            break;
        }
    }
    if( i < _cnt_of_array(depth2space_crd_map) )
    if ( i < _cnt_of_array(depth2space_crd_map) )
    {
        snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", depth2space_crd_map[i].function_name );
        kernel->info.parameters = _depth2space_crd_kernel_param_def;

@ -310,18 +409,19 @@ static vsi_nn_kernel_node_t _setup
    vsi_nn_kernel_node_param_t tmp_params[_DEPTH2SPACE_CRD_PARAM_NUM] = { NULL };
    vsi_nn_kernel_node_t node = NULL;
    int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
    int32_t blk_flg = block_size == 2 ? 1 : 0;

    if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
    if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
                outputs[0]->attr.dim_num ) )
    {
        return NULL;
    }

    status = _query_kernel( inputs, outputs, kernel, params );
    if( VSI_SUCCESS == status)
    status = _query_kernel( inputs, outputs, kernel, params, blk_flg);
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if( node )
        if ( node )
        {
            vsi_nn_kernel_node_pack_io( tmp_params, _DEPTH2SPACE_CRD_PARAM_NUM, inputs, 1, outputs, 1 );
            tmp_params[2] = vsi_nn_kernel_scalar_create( graph, I32, &block_size );

@ -717,12 +717,13 @@ static vsi_nn_kernel_node_t _setup
    int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" );
    int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" );
    int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" );
    int32_t batch = inputs[0]->attr.size[2];
    _internal_kernel_size_e ks = KN;

    if (!((VSI_NN_TYPE_UINT8 == inputs[0]->attr.dtype.vx_type)
    if ( (!((VSI_NN_TYPE_UINT8 == inputs[0]->attr.dtype.vx_type)
        && (VSI_NN_TYPE_UINT8 == inputs[1]->attr.dtype.vx_type)
        && (NULL == inputs[2] || VSI_NN_TYPE_INT32 == inputs[2]->attr.dtype.vx_type)
        && (VSI_NN_TYPE_UINT8 == outputs[0]->attr.dtype.vx_type)))
        && (VSI_NN_TYPE_UINT8 == outputs[0]->attr.dtype.vx_type))) || batch > 1)
    {
        return NULL;
    }

@ -769,18 +770,27 @@ static vsi_nn_kernel_node_t _setup

    status = _query_kernel( kernel, temp_tensor, outputs, dilation, ks);

    if( VSI_SUCCESS == status)
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
        if( node )
        if ( node )
        {
            if( pad_front != 0 && pad_end != 0)
            if ( pad_front != 0 && pad_end != 0)
            {
                // Set default border mode.
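                // For asymmetric U8 input the constant border is set to the
                // input zero point below, so padded samples dequantize to 0
                // instead of skewing edge outputs.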
                vx_border_t border;
                border.mode = VX_BORDER_CONSTANT;
                border.constant_value.U8 = 0;
                border.constant_value.U16 = 0;
                if (VSI_NN_TYPE_UINT8 == inputs[0]->attr.dtype.vx_type &&
                    VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC == inputs[0]->attr.dtype.qnt_type)
                {
                    border.constant_value.U8 = (uint8_t)inputs[0]->attr.dtype.zero_point;
                }
                else
                {
                    border.constant_value.U8 = 0;
                    border.constant_value.U16 = 0;
                }

                status |= vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
            }

@ -48,6 +48,7 @@ typedef enum
    UNARY_NEG,
    UNARY_HSIGMOID,
    UNARY_MISH,
    UNARY_ROUND,
} unary_type_e;

/*

@ -82,6 +83,7 @@ typedef enum
#define NEG_OPERATION neg
#define HSIGMOID_OPERATION hard_sigmoid
#define MISH_OPERATION mish
#define ROUND_OPERATION round

static const struct {
    uint32_t key;

@ -248,6 +250,30 @@ static const struct {
    TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, I8 , KERNEL_SOURCE_2D)
    TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, F16 , KERNEL_SOURCE_2D)
    TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, BF16, BF16 , KERNEL_SOURCE_2D)

    TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, F16 , KERNEL_SOURCE_3D)
    TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, I16 , KERNEL_SOURCE_3D)
    TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, U8 , KERNEL_SOURCE_3D)
    TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, I8 , KERNEL_SOURCE_3D)
    TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I16, I16 , KERNEL_SOURCE_3D)
    TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I16, F16 , KERNEL_SOURCE_3D)
    TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, U8, U8 , KERNEL_SOURCE_3D)
    TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, U8, F16 , KERNEL_SOURCE_3D)
    TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I8, I8 , KERNEL_SOURCE_3D)
    TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I8, F16 , KERNEL_SOURCE_3D)
    TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, BF16, BF16 , KERNEL_SOURCE_3D)

    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16 , KERNEL_SOURCE_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I16 , KERNEL_SOURCE_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, U8 , KERNEL_SOURCE_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I8 , KERNEL_SOURCE_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, I16 , KERNEL_SOURCE_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, F16 , KERNEL_SOURCE_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8 , KERNEL_SOURCE_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, F16 , KERNEL_SOURCE_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, I8 , KERNEL_SOURCE_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, F16 , KERNEL_SOURCE_2D)
    TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16 , KERNEL_SOURCE_2D)
};

#undef SIN_OPERATION

@ -257,6 +283,7 @@ static const struct {
#undef NEG_OPERATION
#undef HSIGMOID_OPERATION
#undef MISH_OPERATION
#undef ROUND_OPERATION

/*
 * Kernel params

@ -375,6 +402,7 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
        case _PACK_SELECT_KEY( UNARY_NEG, BF16, BF16 ):
        case _PACK_SELECT_KEY( UNARY_HSIGMOID, BF16, BF16 ):
        case _PACK_SELECT_KEY( UNARY_MISH, BF16, BF16 ):
        case _PACK_SELECT_KEY( UNARY_ROUND, BF16, BF16 ):
        {
            gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
                0x11111111, // TCfg

@ -653,6 +681,7 @@ REGISTER_ELTWISE_UNARY_BACKEND_EVIS( elu, UNARY_ELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( neg, UNARY_NEG )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_sigmoid, UNARY_HSIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( mish, UNARY_MISH )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( round, UNARY_ROUND )


__END_DECLS

@ -0,0 +1,428 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"

__BEGIN_DECLS

/*
 * Define kernel meta.
 */
#define HASH_UNARY_KEY(_input_type, _output_type, _image_2d) \
    ( (_input_type << 12) | (_output_type << 4) | (_image_2d))

#define KERNEL_SOURCE "erf",

#define HASH_UNARY_SH_KERNEL_NAME( SRC_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("evis.erf_"#SRC_TYPE"to"#DST_TYPE)

#define TENSOR_UNARY_KERNELS(SRC_TYPE, OUT_TYPE) \
    { HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 0), \
      HASH_UNARY_SH_KERNEL_NAME(SRC_TYPE, OUT_TYPE), \
      KERNEL_SOURCE },

#define HASH_UNARY_SH_KERNEL_2D_NAME(SRC_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("evis.erf_"#SRC_TYPE"to"#DST_TYPE"_2D")

#define TENSOR_UNARY_KERNELS_2D(SRC_TYPE, OUT_TYPE) \
    { HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 1), \
      HASH_UNARY_SH_KERNEL_2D_NAME(SRC_TYPE, OUT_TYPE), \
      KERNEL_SOURCE },


typedef struct
{
    uint32_t key;
    char * function_name;
    const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _erf_kernel_map[] =
{
    // Register kernel here
    TENSOR_UNARY_KERNELS(F16, F16 )
    TENSOR_UNARY_KERNELS(F16, I16 )
    TENSOR_UNARY_KERNELS(F16, U8 )
    TENSOR_UNARY_KERNELS(F16, I8 )
    TENSOR_UNARY_KERNELS(I16, I16 )
    TENSOR_UNARY_KERNELS(I16, F16 )
    TENSOR_UNARY_KERNELS(U8, U8 )
    TENSOR_UNARY_KERNELS(U8, F16 )
    TENSOR_UNARY_KERNELS(I8, I8 )
    TENSOR_UNARY_KERNELS(I8, F16 )
    TENSOR_UNARY_KERNELS(BF16, BF16)

    TENSOR_UNARY_KERNELS_2D(F16, F16 )
    TENSOR_UNARY_KERNELS_2D(F16, I16 )
    TENSOR_UNARY_KERNELS_2D(F16, U8 )
    TENSOR_UNARY_KERNELS_2D(F16, I8 )
    TENSOR_UNARY_KERNELS_2D(I16, I16 )
    TENSOR_UNARY_KERNELS_2D(I16, F16 )
    TENSOR_UNARY_KERNELS_2D(U8, U8 )
    TENSOR_UNARY_KERNELS_2D(U8, F16 )
    TENSOR_UNARY_KERNELS_2D(I8, I8 )
    TENSOR_UNARY_KERNELS_2D(I8, F16 )
    TENSOR_UNARY_KERNELS_2D(BF16, BF16)
};


/*
 * Kernel params
 */
static vx_param_description_t _erf_kernel_param_def[] =
{
    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ERF_PARAM_NUM  _cnt_of_array( _erf_kernel_param_def )

/*
 * Kernel initializer
 */
DEF_KERNEL_INITIALIZER(_erf_initializer)
    (
    vsi_nn_kernel_node_t node,
    const vsi_nn_kernel_node_param_t * param,
    size_t param_size
    )
{
    vsi_status status = VSI_FAILURE;
    gpu_param_t gpu_param = {
        3,
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0},
        {0, 0, 0}
        };
    vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL };
    vsi_int_array_t * out_shape = NULL;
    float inputScale = 1.0f;
    float inputTail = 0;
    float outputScale = 1.0f;
    float outputZP = 0;
    uint32_t pack_key;

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

    out_shape = attr[1]->shape;

    if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
    {
        int32_t fl = attr[0]->dfp.fl;
        if (fl > 0)
        {
            inputScale = 1.0f / (float) ((int64_t)1 << fl);
        }
        else
        {
            inputScale = (float)((int64_t)1 << -fl);
        }
    }
    else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
    {
        inputScale = attr[0]->asymm.scale;
        inputTail = 0 - attr[0]->asymm.zero_point * inputScale;
    }
|
||||
|
||||
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
|
||||
{
|
||||
int32_t fl = attr[1]->dfp.fl;
|
||||
if (fl > 0)
|
||||
{
|
||||
outputScale = (float)((int64_t)1 << fl);
|
||||
}
|
||||
else
|
||||
{
|
||||
outputScale = (float)1.0f / (float) ((int64_t)1 << -fl);
|
||||
}
|
||||
}
|
||||
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
|
||||
{
|
||||
outputScale = (float)1.0f / attr[1]->asymm.scale;
|
||||
outputZP = (float)attr[1]->asymm.zero_point;
|
||||
}
|
||||
|
||||
#define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \
|
||||
( ( IN_TYPE << 16) | ( OUT_TYPE << 8))
|
||||
|
||||
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype );
|
||||
|
||||
gpu_param.global_scale[0] = 4;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
|
||||
gpu_param.global_size[0] = gpu_align_p2(
|
||||
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
|
||||
/ gpu_param.global_scale[0], 4);
|
||||
gpu_param.global_size[1] = (
|
||||
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
|
||||
/ gpu_param.global_scale[1]);
|
||||
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
|
||||
|
||||
switch ( pack_key )
|
||||
{
|
||||
case _PACK_SELECT_KEY( BF16, BF16 ):
|
||||
{
|
||||
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
|
||||
0x11111111, // TCfg
|
||||
0x01010101, // ASelt
|
||||
0x01050004, 0x03070206, // ABin
|
||||
0x22222222, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000600, // AccumType, ConstantType, and PostShift
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001,
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
gpu_dp_inst_t uniExtractOddData_2x8 = {{
|
||||
0x11111111, // TCfg
|
||||
0x11110000, // ASelt
|
||||
0x07050301, 0x07050301, // ABin
|
||||
0x22222222, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000600, // AccumType, ConstantType, and PostShift
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001,
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
|
||||
status = vsi_nn_kernel_gpu_add_param( node,
|
||||
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
}
|
||||
break;
|
||||
default:
|
||||
{
|
||||
gpu_dp_inst_t uniExtractHalf8_2x8 = {{
|
||||
0x11111111, // TCfg
|
||||
0x11110000, // ASelt
|
||||
0x06040200, 0x06040200, // ABin
|
||||
0x22222222, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000100, // AccumType, ConstantType, and PostShift
|
||||
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
|
||||
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
gpu_dp_inst_t uniExtractInteger_2x8 = {{
|
||||
0x33333333, // TCfg
|
||||
0x11110000, // ASelt
|
||||
0x03020100, 0x03020100, // ABin
|
||||
0x00000000, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00002400, // AccumType, ConstantType, and PostShift
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
gpu_dp_inst_t uniDatatoFp32Part0_4x4 = {{
|
||||
0x01010101, // TCfg
|
||||
0x00000000, // ASelt
|
||||
0x00010000, 0x00030002, // ABin
|
||||
0x02020202, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000100, // AccumType, ConstantType, and PostShift
|
||||
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
|
||||
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
|
||||
status = vsi_nn_kernel_gpu_add_param( node,
|
||||
"uniDatatoFp32Part0_4x4", &uniDatatoFp32Part0_4x4 );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"inputScale", &inputScale );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"inputTail", &inputTail );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"outputScale", &outputScale );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"outputZP", &outputZP );
|
||||
|
||||
if (attr[1]->dtype == F16)
|
||||
{
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"uniExtract8Data_2x8", &uniExtractHalf8_2x8 );
|
||||
}
|
||||
else
|
||||
{
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"uniExtract8Data_2x8", &uniExtractInteger_2x8 );
|
||||
}
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
#undef _PACK_SELECT_KEY
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
|
||||
final:
|
||||
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
|
||||
SAFE_FREE_TENSOR_ATTR(attr[0]);
|
||||
SAFE_FREE_TENSOR_ATTR(attr[1]);
|
||||
#undef SAFE_FREE_TENSOR_ATTR
|
||||
|
||||
return status;
|
||||
} /* _erf_initializer() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs,
|
||||
vsi_bool image_2d
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_dtype_e in_dtype;
|
||||
vsi_nn_kernel_dtype_e out_dtype;
|
||||
const _kernel_map_type * kernel_map = _erf_kernel_map;
|
||||
size_t kernel_map_size = _cnt_of_array( _erf_kernel_map );
|
||||
vx_param_description_t * param_def = _erf_kernel_param_def;
|
||||
vx_kernel_initialize_f initializer = _erf_initializer;
|
||||
|
||||
uint32_t key;
|
||||
uint32_t i;
|
||||
|
||||
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
key = HASH_UNARY_KEY( in_dtype, out_dtype, image_2d );
|
||||
|
||||
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
|
||||
{
|
||||
if ( kernel_map[i].key == key )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ( i < (uint32_t)kernel_map_size )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
|
||||
kernel->info.parameters = param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _erf_kernel_param_def );
|
||||
kernel->info.initialize = initializer;
|
||||
// Register code source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
|
||||
"vsi_nn_kernel_header",
|
||||
kernel_map[i].source_name );
|
||||
// Register binary source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
kernel_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_ERF_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
vsi_nn_tensor_t* rs_tensors[2] = { NULL };
|
||||
int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
|
||||
int32_t new_rank = 0;
|
||||
vsi_bool image_2d = FALSE;
|
||||
vsi_bool ret = FALSE;
|
||||
|
||||
ret = vsi_nn_kernel_optimize_element_shape(
|
||||
(int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num,
|
||||
shape, &new_rank );
|
||||
if ( ret )
|
||||
{
|
||||
rs_tensors[0] = vsi_nn_reshape_tensor( graph,
|
||||
inputs[0], (uint32_t*)shape, new_rank );
|
||||
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
|
||||
outputs[0], (uint32_t*)shape, new_rank );
|
||||
}
|
||||
|
||||
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[0]->attr.size,
|
||||
rs_tensors[0]->attr.dim_num ) )
|
||||
{
|
||||
goto OnError;
|
||||
}
|
||||
|
||||
image_2d = (rs_tensors[0]->attr.dim_num == 2 || rs_tensors[0]->attr.size[2] == 1);
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs, image_2d );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _ERF_PARAM_NUM,
|
||||
rs_tensors, 1, &rs_tensors[1], 1 );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _ERF_PARAM_NUM );
|
||||
}
|
||||
}
|
||||
|
||||
OnError:
|
||||
if (rs_tensors[0])
|
||||
{
|
||||
vsi_nn_ReleaseTensor( &rs_tensors[0] );
|
||||
}
|
||||
|
||||
if (rs_tensors[1])
|
||||
{
|
||||
vsi_nn_ReleaseTensor( &rs_tensors[1] );
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_EVIS( erf, _setup )
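Note on the quantization arithmetic in _erf_initializer: the DFP branches implement real = q * 2^-fl (so the factor flips to a multiply by 2^fl when fl is negative), and the asymmetric branch folds the zero point into a tail so the shader can compute real = q * scale + tail. A minimal standalone sketch of the same math; the helper names are illustrative, not ovxlib API:

    #include <stdint.h>

    /* Input-side DFP dequantize factor, mirroring the fl > 0 / fl <= 0 split. */
    static float dfp_input_scale(int32_t fl)
    {
        return fl > 0 ? 1.0f / (float)((int64_t)1 << fl)
                      : (float)((int64_t)1 << -fl);
    }

    /* Asymmetric input: real = q * scale + tail, with tail = -zp * scale. */
    static float asymm_input_tail(int32_t zero_point, float scale)
    {
        return 0.0f - (float)zero_point * scale;
    }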
@ -64,39 +64,60 @@ __BEGIN_DECLS
#define VX_KERNEL_NAME_GATHER_AXIS0_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_U8toF16_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_F16toU8_axis0")

#define VX_KERNEL_NAME_GATHER_ARRAY_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_U8toU8_array")
#define VX_KERNEL_NAME_GATHER_ARRAY_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_I8toI8_array")
#define VX_KERNEL_NAME_GATHER_ARRAY_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_I16toI16_array")
#define VX_KERNEL_NAME_GATHER_ARRAY_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_F16toF16_array")

#define VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_U8toU8_axis0_array")
#define VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_I8toI8_axis0_array")
#define VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_I16toI16_axis0_array")
#define VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_F16toF16_axis0_array")

#define KERNEL_SOURCE_1 "gather"
#define KERNEL_SOURCE_2 "gather_mix"
#define KERNEL_SOURCE_3 "gather_array"

// Add kernel hashtable here
#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _is_axis0) \
    ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_is_axis0))
#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _is_axis0, _is_max) \
    ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_is_axis0 << 4) | (_is_max))

#define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
    { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0), \
    { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0), \
      VX_KERNEL_NAME_GATHER_##IN0_TYPE##TO##OUT_TYPE, \
      SOURCE },

#define TENSOR_GATHER_AXIS0_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
    { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1), \
    { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 0), \
      VX_KERNEL_NAME_GATHER_AXIS0_##IN0_TYPE##TO##OUT_TYPE, \
      SOURCE },

#define TENSOR_GATHER_ARRAY_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
    { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 1), \
      VX_KERNEL_NAME_GATHER_ARRAY_##IN0_TYPE##TO##OUT_TYPE, \
      SOURCE },

#define TENSOR_GATHER_AXIS0_ARRAY_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
    { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 1), \
      VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_##IN0_TYPE##TO##OUT_TYPE, \
      SOURCE },

static const struct {
        uint32_t key;
        char* function_name;
        const char* source_name;
    } gather_map[] =
{
    TENSOR_GATHER_KERNELS(U8, I32, U8, KERNEL_SOURCE_1)
    TENSOR_GATHER_KERNELS(I8, I32, I8, KERNEL_SOURCE_1)
    TENSOR_GATHER_KERNELS(I16, I32, I16, KERNEL_SOURCE_1)
    TENSOR_GATHER_KERNELS(F16, I32, F16, KERNEL_SOURCE_1)
    TENSOR_GATHER_KERNELS(I8, I32, F16, KERNEL_SOURCE_2)
    TENSOR_GATHER_KERNELS(I16, I32, F16, KERNEL_SOURCE_2)
    TENSOR_GATHER_KERNELS(F16, I32, I8, KERNEL_SOURCE_2)
    TENSOR_GATHER_KERNELS(F16, I32, I16, KERNEL_SOURCE_2)
    TENSOR_GATHER_KERNELS(U8, I32, F16, KERNEL_SOURCE_2)
    TENSOR_GATHER_KERNELS(F16, I32, U8, KERNEL_SOURCE_2)
    TENSOR_GATHER_AXIS0_KERNELS(U8, I32, U8, KERNEL_SOURCE_1)
    TENSOR_GATHER_AXIS0_KERNELS(I8, I32, I8, KERNEL_SOURCE_1)
    TENSOR_GATHER_AXIS0_KERNELS(I16, I32, I16, KERNEL_SOURCE_1)

@ -107,6 +128,14 @@ static const struct {
    TENSOR_GATHER_AXIS0_KERNELS(F16, I32, I16, KERNEL_SOURCE_2)
    TENSOR_GATHER_AXIS0_KERNELS(U8, I32, F16, KERNEL_SOURCE_2)
    TENSOR_GATHER_AXIS0_KERNELS(F16, I32, U8, KERNEL_SOURCE_2)
    TENSOR_GATHER_ARRAY_KERNELS(U8, I32, U8, KERNEL_SOURCE_3)
    TENSOR_GATHER_ARRAY_KERNELS(I8, I32, I8, KERNEL_SOURCE_3)
    TENSOR_GATHER_ARRAY_KERNELS(I16, I32, I16, KERNEL_SOURCE_3)
    TENSOR_GATHER_ARRAY_KERNELS(F16, I32, F16, KERNEL_SOURCE_3)
    TENSOR_GATHER_AXIS0_ARRAY_KERNELS(U8, I32, U8, KERNEL_SOURCE_3)
    TENSOR_GATHER_AXIS0_ARRAY_KERNELS(I8, I32, I8, KERNEL_SOURCE_3)
    TENSOR_GATHER_AXIS0_ARRAY_KERNELS(I16, I32, I16, KERNEL_SOURCE_3)
    TENSOR_GATHER_AXIS0_ARRAY_KERNELS(F16, I32, F16, KERNEL_SOURCE_3)
};
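The reworked HASH_GATHER_KEY shifts the axis0 flag up by four bits and reserves bit 0 for the array variant, so the image and array shaders of one dtype triple get distinct keys. A worked key, under the usual assumption that each dtype enum value fits in a byte:

    /* in0 dtype | index dtype | out dtype | axis0 flag | array flag */
    uint32_t key = (U8 << 24) | (I32 << 16) | (U8 << 8) | (1 << 4) | 1;
    /* resolves to the TENSOR_GATHER_AXIS0_ARRAY_KERNELS(U8, I32, U8, ...) entry */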
/*

@ -129,7 +158,8 @@ static vsi_status get_gather_tensor_reshape_size
    vsi_nn_tensor_t ** inputs,
    int32_t sizes[VSI_NN_MAX_DIM_NUM],
    uint32_t block_size,
    uint32_t idxFlg
    uint32_t idxFlg,
    int32_t* arrayFlg
    )
{
    vsi_status status = VSI_FAILURE;

@ -157,12 +187,13 @@ static vsi_status get_gather_tensor_reshape_size
    }
    else
    {
        if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
        sizes[0] = block_size;
        sizes[1] = elementCnt / block_size;
        if ((elementCnt / block_size) > VSI_NN_MAX_IMAGE_WIDTH)
        {
            sizes[0] = block_size;
            sizes[1] = elementCnt / block_size;
            status = VSI_SUCCESS;
            arrayFlg[0] = 1;
        }
        status = VSI_SUCCESS;
    }
#undef VSI_NN_MAX_IMAGE_WIDTH
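A worked case of the new branch, assuming the width cap is 65536 (the same bound VSI_NN_MAX_BLOCK_SIZE uses below): with block_size = 4 and elementCnt = 1 << 22, the flattened height is (1 << 22) / 4 = 1 << 20 rows, which exceeds the cap, so arrayFlg[0] is set and the caller falls back to the gather_array shaders that address storage linearly instead of as a 2D image.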
@ -535,10 +566,11 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
            CHECK_STATUS_FAIL_GOTO(status, OnError );
        }
        break;
    case _PACK_SELECT_KEY( I16, I16):
    case _PACK_SELECT_KEY( I8, I8):
    case _PACK_SELECT_KEY( U8, U8):
    case _PACK_SELECT_KEY( F16, F16):
    case _PACK_SELECT_KEY( I16, I16):
    case _PACK_SELECT_KEY( I8, I8):
    case _PACK_SELECT_KEY( U8, U8):
    case _PACK_SELECT_KEY( F16, F16):
    case _PACK_SELECT_KEY( BF16, BF16):
        {
            status = vsi_nn_kernel_gpu_add_param( node,
                "uniExtraCopyDpKeepinEvis_2x8", &uniExtraCopyDpKeepinEvis_2x8 );

@ -583,7 +615,8 @@ static vsi_status _query_kernel
    vsi_nn_tensor_t* const* const outputs,
    vsi_nn_kernel_t* kernel,
    const vsi_nn_kernel_param_t * params,
    int32_t axis
    int32_t axis,
    int32_t is_array
    )
{
    vsi_status status = VSI_FAILURE;

@ -595,7 +628,16 @@ static vsi_status _query_kernel
    input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis );
    if (input0_dtype == BF16)
    {
        input0_dtype = F16;
    }
    if (output_dtype == BF16)
    {
        output_dtype = F16;
    }

    key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis, is_array);

    for( i = 0; i < _cnt_of_array(gather_map); i ++ )
    {

@ -640,6 +682,7 @@ static vsi_nn_kernel_node_t _setup
    vsi_nn_kernel_t * kernel
    )
{
#define VSI_NN_MAX_BLOCK_SIZE (65536)
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t tmp_params[_GATHER_PARAM_NUM] = { NULL };
    vsi_nn_kernel_node_t node = NULL;

@ -649,21 +692,23 @@ static vsi_nn_kernel_node_t _setup
    int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" );
    int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
    int32_t axis0_flg = 0;
    int32_t is_array = block_size > VSI_NN_MAX_BLOCK_SIZE ? 1 : 0;

    if (axis == 0)
    {
        status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, 0);
        status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1);
        status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], 0);
        status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, 0, &is_array);
        status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1, &is_array);
        status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], 0, &is_array);
        axis0_flg = 1;
    }
    else
    {
        status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0);
        status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1);
        status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0);
        status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0, &is_array);
        status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1, &is_array);
        status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &is_array);
        axis0_flg = 0;
    }
#undef VSI_NN_MAX_BLOCK_SIZE
    if (status != VSI_SUCCESS)
    {
        return NULL;
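Note that is_array has two sources: it is seeded from the caller's block size and may also be promoted to 1 by any of the three get_gather_tensor_reshape_size calls through &is_array, so a gather that is too wide or too tall lands on an *_array kernel either way. The flag then feeds the hash lookup, e.g. HASH_GATHER_KEY(U8, I32, U8, 0, 1) resolves to evis.gather_U8toU8_array in the map above.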
@ -675,7 +720,7 @@ static vsi_nn_kernel_node_t _setup
        return NULL;
    }

    status = _query_kernel( inputs, outputs, kernel, params, axis0_flg);
    status = _query_kernel( inputs, outputs, kernel, params, axis0_flg, is_array);
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );

@ -387,6 +387,15 @@ static vsi_status _query_kernel

    input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
    if (input0_dtype == BF16)
    {
        input0_dtype = F16;
    }
    if (output_dtype == BF16)
    {
        output_dtype = F16;
    }

    if(coord_dim == 1)
    {
        coord_type = _1D;
File diff suppressed because it is too large
@ -53,6 +53,10 @@ typedef enum
#define KERNEL_SOURCE_2 "instance_normalization_u8"
#define KERNEL_SOURCE_3 "instance_normalization_i16"
#define KERNEL_SOURCE_4 "instance_normalization_f16"
#define KERNEL_SOURCE_5 "instance_normalization_u8_f16"
#define KERNEL_SOURCE_6 "instance_normalization_scale_f32"
#define KERNEL_SOURCE_7 "instance_normalization_scale_f32_f16"
#define KERNEL_SOURCE_8 "instance_normalization_scale_f32_bf16"

#define HASH_INSTANCENORM_MEAN_VARI_SH_KERNEL_NAME(SRC0_TYPE) \
    CVIVANTE_NAMESPACE("evis.instance_norm_meanvari_"#SRC0_TYPE)

@ -66,6 +70,12 @@ typedef enum
#define HASH_INSTANCENORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D")

#define HASH_INSTANCENORM_SCALE_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"F32to"#DST_TYPE)

#define HASH_INSTANCENORM_SCALE_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"F32to"#DST_TYPE"_2D")

// Add kernel hashtable here
// mean vari
#define HASH_INSTANCENORM_MEAN_VARI_KEY(_input0_type, _output_type, _reshape_flag) \

@ -82,19 +92,29 @@ typedef enum
      SOURCE },

// normalization
#define HASH_INSTANCENORM_KEY(_input0_type, _output_type, _reshape_flag) \
    ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8))
#define HASH_INSTANCENORM_KEY(_input0_type, _input1_type, _output_type, _reshape_flag) \
    ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_reshape_flag << 4))

#define TENSOR_INSTANCENORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_INSTANCENORM_KEY(IN0_TYPE, OUT_TYPE, 0), \
    { HASH_INSTANCENORM_KEY(IN0_TYPE, F16, OUT_TYPE, 0), \
      HASH_INSTANCENORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
      SOURCE },

#define TENSOR_INSTANCENORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_INSTANCENORM_KEY(IN0_TYPE, OUT_TYPE, 1), \
    { HASH_INSTANCENORM_KEY(IN0_TYPE, F16, OUT_TYPE, 1), \
      HASH_INSTANCENORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \
      SOURCE },

#define TENSOR_INSTANCENORM_SCALE_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_INSTANCENORM_KEY(IN0_TYPE, F32, OUT_TYPE, 0), \
      HASH_INSTANCENORM_SCALE_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
      SOURCE },

#define TENSOR_INSTANCENORM_SCALE_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_INSTANCENORM_KEY(IN0_TYPE, F32, OUT_TYPE, 1), \
      HASH_INSTANCENORM_SCALE_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \
      SOURCE },

typedef struct
{
    uint32_t key;

@ -113,6 +133,8 @@ static const _kernel_map_type _instancenorm_mean_vari_kernel_map[] =
    TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( I16, F32, KERNEL_SOURCE_3 )
    TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( F16, F32, KERNEL_SOURCE_4 )
    TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( F16, F32, KERNEL_SOURCE_4 )
    TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( BF16, F32, KERNEL_SOURCE_8 )
    TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( BF16, F32, KERNEL_SOURCE_8 )
};
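For reference, the operator behind these two kernel maps is split into a meanvari pass, which reduces each channel to sum and sum-of-squares partials, and a normalize pass that consumes them. A hypothetical scalar equivalent of the combined math (not ovxlib API), using the same E[x^2] - E[x]^2 variance form the partials imply:

    #include <math.h>

    static void instance_norm_ref(const float *x, float *y, int n,
                                  float gamma, float beta, float eps)
    {
        float sum = 0.0f, sqr = 0.0f, mean, vari, s;
        int i;
        for (i = 0; i < n; i++) { sum += x[i]; sqr += x[i] * x[i]; }
        mean = sum / (float)n;
        vari = sqr / (float)n - mean * mean;  /* E[x^2] - E[x]^2 */
        s = gamma / sqrtf(vari + eps);
        for (i = 0; i < n; i++) y[i] = (x[i] - mean) * s + beta;
    }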
static const _kernel_map_type _instancenorm_kernel_map[] =

@ -125,8 +147,8 @@ static const _kernel_map_type _instancenorm_kernel_map[] =

    TENSOR_INSTANCENORM_KERNELS( U8, U8, KERNEL_SOURCE_2 )
    TENSOR_INSTANCENORM_KERNELS_2D( U8, U8, KERNEL_SOURCE_2 )
    TENSOR_INSTANCENORM_KERNELS( U8, F16, KERNEL_SOURCE_2 )
    TENSOR_INSTANCENORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_2 )
    TENSOR_INSTANCENORM_KERNELS( U8, F16, KERNEL_SOURCE_5 )
    TENSOR_INSTANCENORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_5 )

    TENSOR_INSTANCENORM_KERNELS( I16, I16, KERNEL_SOURCE_3 )
    TENSOR_INSTANCENORM_KERNELS_2D( I16, I16, KERNEL_SOURCE_3 )

@ -135,6 +157,21 @@ static const _kernel_map_type _instancenorm_kernel_map[] =

    TENSOR_INSTANCENORM_KERNELS( F16, F16, KERNEL_SOURCE_4 )
    TENSOR_INSTANCENORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_4 )

    TENSOR_INSTANCENORM_SCALE_KERNELS( U8, U8, KERNEL_SOURCE_6 )
    TENSOR_INSTANCENORM_SCALE_KERNELS_2D( U8, U8, KERNEL_SOURCE_6 )

    TENSOR_INSTANCENORM_SCALE_KERNELS( I8, I8, KERNEL_SOURCE_6 )
    TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I8, I8, KERNEL_SOURCE_6 )

    TENSOR_INSTANCENORM_SCALE_KERNELS( I16, I16, KERNEL_SOURCE_6 )
    TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I16, I16, KERNEL_SOURCE_6 )

    TENSOR_INSTANCENORM_SCALE_KERNELS( F16, F16, KERNEL_SOURCE_7 )
    TENSOR_INSTANCENORM_SCALE_KERNELS_2D( F16, F16, KERNEL_SOURCE_7 )

    TENSOR_INSTANCENORM_SCALE_KERNELS( BF16, BF16, KERNEL_SOURCE_8 )
    TENSOR_INSTANCENORM_SCALE_KERNELS_2D( BF16, BF16, KERNEL_SOURCE_8 )
};

/*

@ -254,7 +291,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
    {
        shaderParam.global_size[0] = (width + 255) / 256 * 16;
    }
    else if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
    else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == BF16)
    {
        shaderParam.global_size[0] = (width + 127) / 128 * 16;
    }
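The work-size expressions above are plain ceiling divisions: a group of 16 threads covers 256 elements for 8-bit inputs and 128 for 16-bit and BF16 inputs, matching the 16 and 8 elements-per-thread split used elsewhere in this file. For example, width = 300 on a U8 tensor gives

    vx_uint32 groups = (300 + 255) / 256;   /* = 2, i.e. ceil(300 / 256) */

so global_size[0] = 32 threads, with the tail past element 299 left to the shader to guard.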
@ -350,6 +387,32 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
        status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2);
        CHECK_STATUS_FAIL_GOTO(status, OnError );
    }
    else if (attr[0]->dtype == BF16)
    {
        gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
            0x11111111, // TCfg
            0x01010101, // ASelt
            0x01050004, 0x03070206, // ABin
            0x22222222, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000600, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000001, 0x00000001, 0x00000001,
            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
            0x11111111, // TCfg
            0x01010101, // ASelt
            0x05050404, 0x07070606, // ABin
            0x22222222, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000600, // AccumType, ConstantType, and PostShift
            0x00000001, 0x00000001, 0x00000001, 0x00000001,
            0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
        }, GPU_DP_TYPE_16};
        status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
        status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8);
        CHECK_STATUS_FAIL_GOTO(status, OnError );
    }
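The uniConvBF16toF32_* patterns registered above widen bfloat16 lanes to float32; numerically that conversion is just placing the 16 payload bits in the high half of the float's bit pattern. A scalar sketch of the same conversion (hypothetical helper, not ovxlib API):

    #include <stdint.h>
    #include <string.h>

    static float bf16_to_f32(uint16_t b)
    {
        uint32_t bits = (uint32_t)b << 16;  /* bf16 is the top half of an f32 */
        float f;
        memcpy(&f, &bits, sizeof f);
        return f;
    }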
    status = vsi_nn_kernel_gpu_add_param(node, "width", &width);
    status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);

@ -385,15 +448,13 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
        {0, 0, 0}, // localWorkSize: local group size in thread
        {0, 0, 0}}; // globalWorkSize: image size in thread

    vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL};
    vsi_nn_kernel_tensor_attr_t* attr[4] = {NULL, NULL};
    vsi_int_array_t * input_shape = NULL;
    float scaleIn = 1.0f;
    float scaleOut = 1.0f;
    float reScaleOut_u8 = 1.0f;
    float scale_inOut = 1.0f;
    int32_t output_zp = 0;
    int32_t input_zp = 0;
    float in_scale_fl = 1, out_scale_fl = 1, inOut_fl_scale = 1;
    float dimRatio = 0;
    vx_uint32 group_num = 0;
    vx_int32 height = 0, width = 0, chn = 0;

@ -401,10 +462,12 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)

    attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
    attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
    attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] );
    attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
    CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );
    attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] );
    CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", OnError );

    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &rsFlg);
    CHECK_STATUS_FAIL_GOTO(status, OnError );

@ -420,43 +483,39 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
    {
        if (attr[0]->dfp.fl > 0)
        {
            in_scale_fl = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
            scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
        }
        else
        {
            in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
            scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
        }
        input_zp = 0;
    }

    if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
    if (attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
    {
        output_zp = attr[2]->asymm.zero_point;
        scaleOut = attr[2]->asymm.scale;
        reScaleOut_u8 = 1 / scaleOut;
        output_zp = attr[3]->asymm.zero_point;
        scaleOut = attr[3]->asymm.scale;
        scaleOut = 1 / scaleOut;
    }
    else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP)
    else if (attr[3]->quant == VSI_NN_KERNEL_QUANT_DFP)
    {
        if (attr[2]->dfp.fl > 0)
        if (attr[3]->dfp.fl > 0)
        {
            out_scale_fl = (float)((int64_t)1 << attr[2]->dfp.fl);
            scaleOut = (float)((int64_t)1 << attr[3]->dfp.fl);
        }
        else
        {
            out_scale_fl = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
            scaleOut = (1.0f / (float)((int64_t)1 << -attr[3]->dfp.fl));
        }
        output_zp = 0;
    }

    if ((attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
        && (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP))
    {
        inOut_fl_scale = in_scale_fl * out_scale_fl;
    }
    scale_inOut = scaleIn * scaleOut;

    width = input_shape->data[0];
    height = input_shape->data[1];
    chn = attr[1]->shape->data[1];
    chn = attr[2]->shape->data[1];
    if (rsFlg)
    {
        height = height / chn;

@ -467,7 +526,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
    group_num = (width + 255) / 256;

    shaderParam.global_scale[0] = 16;
    if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
    if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == BF16)
    {
        shaderParam.global_scale[0] = 8;
        group_num = (width + 127) / 128;
@ -630,23 +689,52 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
        0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
    }, GPU_DP_TYPE_16 };

    uint32_t pack_key = 0;
#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \
        (IN0_TYPE | (OUT_TYPE << 8))
    gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
        0x11111111, // TCfg
        0x01010101, // ASelt
        0x01050004, 0x03070206, // ABin
        0x22222222, // BSelt
        0x00000000, 0x00000000, // BBin
        0x00000600, // AccumType, ConstantType, and PostShift
        0x00000001, 0x00000001, 0x00000001, 0x00000001,
        0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
    }, GPU_DP_TYPE_16};
    gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
        0x11111111, // TCfg
        0x01010101, // ASelt
        0x05050404, 0x07070606, // ABin
        0x22222222, // BSelt
        0x00000000, 0x00000000, // BBin
        0x00000600, // AccumType, ConstantType, and PostShift
        0x00000001, 0x00000001, 0x00000001, 0x00000001,
        0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
    }, GPU_DP_TYPE_16};
    gpu_dp_inst_t uniExtractOddData_2x8 = {{
        0x11111111, // TCfg
        0x11110000, // ASelt
        0x07050301, 0x07050301, // ABin
        0x22222222, // BSelt
        0x00000000, 0x00000000, // BBin
        0x00000600, // AccumType, ConstantType, and PostShift
        0x00000001, 0x00000001, 0x00000001, 0x00000001,
        0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
    }, GPU_DP_TYPE_16};

    pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype );
    uint32_t pack_key = 0;
#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \
        (IN0_TYPE | (IN1_TYPE << 8) | (OUT_TYPE << 16))

    pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, attr[3]->dtype );

    status = vsi_nn_kernel_gpu_add_param(node, "height", &height);
    status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio);
    status |= vsi_nn_kernel_gpu_add_param(node, "group_num", &group_num);
    status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4);
    status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", &uniConvertHalfToFp16_2x8);
    CHECK_STATUS_FAIL_GOTO(status, OnError );

    switch( pack_key )
    {
    case _PACK_SELECT_KEY( I8, I8 ):
    case _PACK_SELECT_KEY( I8, F16 ):
    case _PACK_SELECT_KEY( I8, F16, I8 ):
    case _PACK_SELECT_KEY( I8, F16, F16 ):
        {
            status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8",
                &uniConvertInt32toUint8_2x8);

@ -658,15 +746,42 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
                &uniConvertTrdUint8Fp32_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFthInt8Fp32_4x4",
                &uniConvertFthUint8Fp32_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8",
                &uniConvertHalfToFp16_2x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
                &UniFP16toFP32Lo4_dp4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &scaleIn);

            status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &out_scale_fl);
            status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &inOut_fl_scale);
            status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &scaleOut);
            status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &scale_inOut);
            CHECK_STATUS_FAIL_GOTO(status, OnError );
        }
        break;
    case _PACK_SELECT_KEY( U8, U8 ):
    case _PACK_SELECT_KEY( U8, F16 ):
    case _PACK_SELECT_KEY( U8, F16, U8 ):
        {
            status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8",
                &uniConvertInt32toUint8_2x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4",
                &uniConvert1stUint8SubZpToFp32_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4",
                &uniConvert2ndUint8SubZpToFp32_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4",
                &uniConvert3rdUint8SubZpToFp32_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4",
                &uniConvert4thUint8SubZpToFp32_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
                &UniFP16toFP32Lo4_dp4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
            status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);

            status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp);
            status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut);
            status |= vsi_nn_kernel_gpu_add_param(node, "scale_inOut", &scale_inOut);
            CHECK_STATUS_FAIL_GOTO(status, OnError );
        }
        break;
    case _PACK_SELECT_KEY( U8, F32, U8 ):
    case _PACK_SELECT_KEY( I8, F32, I8 ):
        {
            status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8",
                &uniConvertInt32toUint8_2x8);

@ -679,37 +794,85 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
            status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4",
                &uniConvert4thUint8SubZpToFp32_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
            status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);

            status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp);
            status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &reScaleOut_u8);

            scale_inOut = reScaleOut_u8 * scaleIn;
            status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut);
            status |= vsi_nn_kernel_gpu_add_param(node, "scale_inOut", &scale_inOut);
            CHECK_STATUS_FAIL_GOTO(status, OnError );
        }
        break;
    case _PACK_SELECT_KEY( I16, I16 ):
    case _PACK_SELECT_KEY( I16, F16 ):
    case _PACK_SELECT_KEY( U8, F16, F16 ):
        {
            status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4",
                &uniConvert1stUint8SubZpToFp32_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4",
                &uniConvert2ndUint8SubZpToFp32_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4",
                &uniConvert3rdUint8SubZpToFp32_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4",
                &uniConvert4thUint8SubZpToFp32_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8",
                &uniConvertHalfToFp16_2x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
                &UniFP16toFP32Lo4_dp4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
            status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
            CHECK_STATUS_FAIL_GOTO(status, OnError );
        }
        break;
    case _PACK_SELECT_KEY( I16, F16, I16 ):
    case _PACK_SELECT_KEY( I16, F16, F16 ):
        {
            status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Fst_4x4",
                &uniConvertInt16Fp32Fst_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Secd_4x4",
                &uniConvertInt16Fp32Secd_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl);
            status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &scaleIn);

            status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toInt16_2x8",
                &uniConvertInt32toInt16_2x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &out_scale_fl);

            status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &inOut_fl_scale);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8",
                &uniConvertHalfToFp16_2x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
                &UniFP16toFP32Lo4_dp4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &scaleOut);
            status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &scale_inOut);
            CHECK_STATUS_FAIL_GOTO(status, OnError );
        }
        break;
    case _PACK_SELECT_KEY( F16, F16 ):
    case _PACK_SELECT_KEY( I16, F32, I16 ):
        {
            status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Fst_4x4",
                &uniConvertInt16Fp32Fst_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Secd_4x4",
                &uniConvertInt16Fp32Secd_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toInt16_2x8",
                &uniConvertInt32toInt16_2x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &scaleOut);
            status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &scale_inOut);
            CHECK_STATUS_FAIL_GOTO(status, OnError );
        }
        break;
    case _PACK_SELECT_KEY( F16, F16, F16 ):
    case _PACK_SELECT_KEY( F16, F32, F16 ):
        {
            status = vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt16Fp32_4x4",
                &uniConvertEndInt16Fp32_4x4);
            status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8",
                &uniConvertHalfToFp16_2x8);
            status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
                &UniFP16toFP32Lo4_dp4x4);
            CHECK_STATUS_FAIL_GOTO(status, OnError );
        }
        break;
    case _PACK_SELECT_KEY( BF16, F32, BF16 ):
        {
            status = vsi_nn_kernel_gpu_add_param( node,
                "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
            status |= vsi_nn_kernel_gpu_add_param( node,
                "uniExtractOddData_2x8", &uniExtractOddData_2x8 );
            CHECK_STATUS_FAIL_GOTO(status, OnError );
        }
        break;

@ -736,6 +899,11 @@ OnError:
        vsi_nn_kernel_tensor_attr_release( &attr[2] );
        attr[2] = NULL;
    }
    if (attr[3])
    {
        vsi_nn_kernel_tensor_attr_release( &attr[3] );
        attr[3] = NULL;
    }

    return status;
}
@ -826,11 +994,13 @@ static vsi_nn_kernel_node_t _setup
    vsi_nn_kernel_node_t tmp_node = NULL;
    vsi_nn_kernel_node_t node = NULL;
    vsi_nn_kernel_dtype_e in0_dtype = U8;
    vsi_nn_kernel_dtype_e in1_dtype = F16;
    vsi_nn_kernel_dtype_e out_dtype = U8;
    vsi_nn_tensor_attr_t attr;
    vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL };
    vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL };
    vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL;
    int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
    uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 };
    uint32_t hashkey = 0;
    int32_t i = 0;

@ -851,29 +1021,12 @@ static vsi_nn_kernel_node_t _setup
        ikernels[i]->unique_id = kernel->unique_id;
    }

    memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) );
    attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
    attr.is_const = FALSE;
    attr.vtl = TRUE;

    attr.size[0] = ((inputs[0]->attr.size[0] + 255) / 256) * 4;

    if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16
        || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16)
    {
        attr.size[0] = ((inputs[0]->attr.size[0] + 127) / 128) * 4;
    }
    attr.size[1] = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1;
    attr.size[2] = 1;
    attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
    attr.dim_num = 4;
    tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr );

    in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    in1_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    hashkeys[MEAN_VARI_INDEX]= HASH_INSTANCENORM_MEAN_VARI_KEY( in0_dtype, F32, reshape_flg );
    hashkey = HASH_INSTANCENORM_KEY( in0_dtype, out_dtype, reshape_flg );
    hashkey = HASH_INSTANCENORM_KEY( in0_dtype, in1_dtype, out_dtype, reshape_flg );

    status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI );
    if ( VSI_SUCCESS != status )

@ -888,22 +1041,54 @@ static vsi_nn_kernel_node_t _setup

    if (reshape_flg)
    {
        int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
        shape[0] = inputs[0]->attr.size[0];
        shape[1] = inputs[0]->attr.size[1] * inputs[0]->attr.size[2];
        shape[2] = 1;
        shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
        rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 );

        shape[0] = outputs[0]->attr.size[0];
        shape[1] = outputs[0]->attr.size[1] * outputs[0]->attr.size[2];
        shape[2] = 1;
        shape[3] = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1;
        rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 );
    }
    else if (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] < GPU_TENSOR_MAX_WIDTH)
    {
        shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1];
        shape[1] = 1;
        shape[2] = inputs[0]->attr.size[2];
        shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
        rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 );
        rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 );
    }
    else if (inputs[0]->attr.size[0] < inputs[0]->attr.size[1])
    {
        shape[0] = inputs[0]->attr.size[1];
        shape[1] = inputs[0]->attr.size[0];
        shape[2] = inputs[0]->attr.size[2];
        shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
        rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 );
        rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 );
    }
    else
    {
        shape[0] = inputs[0]->attr.size[0];
    }

    memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) );
    attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
    attr.is_const = FALSE;
    attr.vtl = TRUE;
    attr.size[0] = ((shape[0] + 255) / 256) * 4;
    if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16
        || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16)
    {
        attr.size[0] = ((shape[0] + 127) / 128) * 4;
    }
    attr.size[1] = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1;
    attr.size[2] = 1;
    attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
    attr.dim_num = 4;
    tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr );

    if (inputs[1]->attr.dim_num < 2)
    {
        int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
        shape[0] = inputs[1]->attr.size[0];
        shape[1] = 1;
        shape[2] = 1;

@ -912,7 +1097,6 @@ static vsi_nn_kernel_node_t _setup
    }
    if (inputs[2]->attr.dim_num < 2)
    {
        int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
        shape[0] = inputs[2]->attr.size[0];
        shape[1] = 1;
        shape[2] = 1;

@ -925,7 +1109,7 @@ static vsi_nn_kernel_node_t _setup
    if (tmp_node)
    {
        uint32_t index = 0;
        if (reshape_flg)
        if (rs_input)
        {
            mean_vari_node_params[index++] = rs_input;
            vsi_nn_kernel_node_pack_io( &mean_vari_node_params[index],

@ -967,7 +1151,7 @@ static vsi_nn_kernel_node_t _setup
    if (node)
    {
        uint32_t index = 0;
        if (reshape_flg)
        if (rs_input)
        {
            node_params[index++] = rs_input;
        }

@ -992,7 +1176,7 @@ static vsi_nn_kernel_node_t _setup
        node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t;
    }
    node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t;
    if (reshape_flg)
    if (rs_output)
    {
        node_params[index++] = rs_output;
    }

@ -1034,9 +1218,12 @@ final:
    {
        vsi_nn_kernel_tensor_release( &rs_gamma );
    }
    if (reshape_flg)
    if (rs_input)
    {
        vsi_nn_kernel_tensor_release( &rs_input );
    }
    if (rs_output)
    {
        vsi_nn_kernel_tensor_release( &rs_output );
    }
    for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )
@ -60,6 +60,9 @@ __BEGIN_DECLS
#define KERNEL_SOURCE_5 "layer_normalization_wh_f16"
#define KERNEL_SOURCE_6 "layer_normalization_i16"
#define KERNEL_SOURCE_7 "layer_normalization_wh_i16"
#define KERNEL_SOURCE_8 "layer_normalization_scale_f32"
#define KERNEL_SOURCE_9 "layer_normalization_scale_f32_2d"
#define KERNEL_SOURCE_10 "layer_normalization_scale_f32_bf16"


#define HASH_LAYERNORM_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \

@ -68,20 +71,36 @@ __BEGIN_DECLS
#define HASH_LAYERNORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D")

#define HASH_LAYERNORM_SH_KERNEL_SCALE_NAME(SRC0_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"F32to"#DST_TYPE)

#define HASH_LAYERNORM_SH_KERNEL_SCALE_2D_NAME(SRC0_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"F32to"#DST_TYPE"_2D")

// normalization
#define HASH_LAYERNORM_KEY(_input0_type, _output_type, _reshape_flag) \
    ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8))
#define HASH_LAYERNORM_KEY(_input0_type, _input2_type, _output_type, _reshape_flag) \
    ((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | _reshape_flag)

#define TENSOR_LAYERNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_KERNEL), \
    { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_KERNEL), \
      HASH_LAYERNORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
      SOURCE },

#define TENSOR_LAYERNORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_2D_KERNEL), \
    { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_2D_KERNEL), \
      HASH_LAYERNORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \
      SOURCE },

#define TENSOR_LAYERNORM_SCALE_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_LAYERNORM_KEY(IN0_TYPE, F32, OUT_TYPE, LAYERNORM_KERNEL), \
      HASH_LAYERNORM_SH_KERNEL_SCALE_NAME(IN0_TYPE, OUT_TYPE), \
      SOURCE },

#define TENSOR_LAYERNORM_SCALE_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_LAYERNORM_KEY(IN0_TYPE, F32, OUT_TYPE, LAYERNORM_2D_KERNEL), \
      HASH_LAYERNORM_SH_KERNEL_SCALE_2D_NAME(IN0_TYPE, OUT_TYPE), \
      SOURCE },

// greater than max size
#define HASH_SUMSQR_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \
    CVIVANTE_NAMESPACE("evis.layernorm_wh_sumSqr_"#SRC0_TYPE"to"#DST_TYPE)

@ -96,22 +115,22 @@ __BEGIN_DECLS
    CVIVANTE_NAMESPACE("evis.layernorm_wh_"#SRC0_TYPE"to"#DST_TYPE"_2D")

#define TENSOR_SUMSQR_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, SUMSQR_KERNEL), \
    { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, SUMSQR_KERNEL), \
      HASH_SUMSQR_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
      SOURCE },

#define TENSOR_SUMSQR_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, SUMSQR_2D_KERNEL), \
    { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, SUMSQR_2D_KERNEL), \
      HASH_SUMSQR_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \
      SOURCE },

#define TENSOR_LAYERNORM_WH_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_WH_KERNEL), \
    { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_WH_KERNEL), \
      HASH_LAYERNORM_WH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
      SOURCE },

#define TENSOR_LAYERNORM_WH_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
    { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_WH_2D_KERNEL), \
    { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_WH_2D_KERNEL), \
      HASH_LAYERNORM_WH_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \
      SOURCE },

@ -136,6 +155,17 @@ static const _kernel_map_type _layernorm_kernel_map[] =
    TENSOR_LAYERNORM_KERNELS_2D( F16, U8, KERNEL_SOURCE_2 )
    TENSOR_LAYERNORM_KERNELS( I16, I16, KERNEL_SOURCE_6 )
    TENSOR_LAYERNORM_KERNELS_2D( I16, I16, KERNEL_SOURCE_6 )

    TENSOR_LAYERNORM_SCALE_KERNELS( U8, U8, KERNEL_SOURCE_8 )
    TENSOR_LAYERNORM_SCALE_KERNELS_2D( U8, U8, KERNEL_SOURCE_9 )
    TENSOR_LAYERNORM_SCALE_KERNELS( I8, I8, KERNEL_SOURCE_8 )
    TENSOR_LAYERNORM_SCALE_KERNELS_2D( I8, I8, KERNEL_SOURCE_9 )
    TENSOR_LAYERNORM_SCALE_KERNELS( I16, I16, KERNEL_SOURCE_8 )
    TENSOR_LAYERNORM_SCALE_KERNELS_2D( I16, I16, KERNEL_SOURCE_9 )
    TENSOR_LAYERNORM_SCALE_KERNELS( F16, F16, KERNEL_SOURCE_8 )
    TENSOR_LAYERNORM_SCALE_KERNELS_2D( F16, F16, KERNEL_SOURCE_9 )
    TENSOR_LAYERNORM_SCALE_KERNELS( BF16, BF16, KERNEL_SOURCE_10 )
    TENSOR_LAYERNORM_SCALE_KERNELS_2D( BF16, BF16, KERNEL_SOURCE_10 )
};
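The new *_SCALE_* entries select shaders whose gamma input is float32 rather than float16 (the F32 slot in the key and the "F32to" infix in the shader names); the normalization itself is unchanged. A hypothetical scalar reference for one row of n elements (not ovxlib API):

    #include <math.h>

    static void layer_norm_ref(const float *x, float *y, int n,
                               const float *gamma, const float *beta, float eps)
    {
        float sum = 0.0f, sqr = 0.0f, mean, vari, inv;
        int i;
        for (i = 0; i < n; i++) { sum += x[i]; sqr += x[i] * x[i]; }
        mean = sum / (float)n;
        vari = sqr / (float)n - mean * mean;
        inv = 1.0f / sqrtf(vari + eps);
        for (i = 0; i < n; i++)
            y[i] = (x[i] - mean) * inv * gamma[i] + beta[i];
    }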
|
||||
|
||||
static const _kernel_map_type _sumsqr_kernel_map[] =
|
||||
|
|
@ -295,8 +325,7 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer)
|
|||
shaderParam.global_scale[1] = 1;
|
||||
shaderParam.global_scale[2] = 1;
|
||||
shaderParam.global_size[0] = 1;
|
||||
shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1)
|
||||
/ shaderParam.global_scale[1], 4);
|
||||
shaderParam.global_size[1] = height;
|
||||
shaderParam.global_size[2] = chn;
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
|
||||
|
|
@ -424,6 +453,37 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer)
|
|||
0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
|
||||
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
|
||||
0x11111111, // TCfg
|
||||
0x01010101, // ASelt
|
||||
0x01050004, 0x03070206, // ABin
|
||||
0x22222222, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000600, // AccumType, ConstantType, and PostShift
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001,
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
|
||||
0x11111111, // TCfg
|
||||
0x01010101, // ASelt
|
||||
0x05050404, 0x07070606, // ABin
|
||||
0x22222222, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000600, // AccumType, ConstantType, and PostShift
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001,
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
gpu_dp_inst_t uniExtractOddData_2x8 = {{
|
||||
0x11111111, // TCfg
|
||||
0x11110000, // ASelt
|
||||
0x07050301, 0x07050301, // ABin
|
||||
0x22222222, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000600, // AccumType, ConstantType, and PostShift
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001,
|
||||
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
|
||||
uint32_t pack_key = 0;
|
||||
#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \
|
||||
(IN0_TYPE | (IN1_TYPE << 16) | (OUT_TYPE << 8))
|
||||
|
|
@ -432,9 +492,6 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer)

status = vsi_nn_kernel_gpu_add_param(node, "width", &width);
status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", &uniConvertSecFp16Fp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
CHECK_STATUS_FAIL_GOTO(status, OnError );

switch( pack_key )

@ -453,6 +510,11 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer)
&uniConvert3rdUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4",
&uniConvert4thUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4",
&uniConvertSecFp16Fp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
&UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp);

@ -481,6 +543,11 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer)
&uniConvert3rdUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4",
&uniConvert4thUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4",
&uniConvertSecFp16Fp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
&UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp);

@ -501,7 +568,11 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer)
&uniConvert2ndUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8",
&uniConvertInt32toUint8_2x8);

status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4",
&uniConvertSecFp16Fp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
&UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);

@ -510,6 +581,70 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U8, F32, U8 ):
case _PACK_SELECT_KEY( F16, F32, F16 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2",
&uniFp16SumSqr_dp8x2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_dp4x4",
&uniExtractHalf4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8",
&uniConvertInt32toUint8_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1);
status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4",
&uniConvert1stUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4",
&uniConvert2ndUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4",
&uniConvert3rdUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4",
&uniConvert4thUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
&UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp);
status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1);
status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp2", &tmpZp2);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( I16, F32, I16 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2",
&uniInt16SumSqr_dp8x2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4",
&uniConvert1stUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4",
&uniConvert2ndUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8",
&uniConvertInt32toUint8_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
&UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut);
status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio_scale", &dimRatio_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, F32, BF16 ):
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
default:
VSI_ASSERT( FALSE );
return VSI_FAILURE;
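/* Each _PACK_SELECT_KEY case above uploads only the DP instructions and
 * quantization scalars (zero points, scales) that its shader variant
 * references; the middle dtype in the key appears to be the scale (gamma)
 * input, which these cases take as F32. */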
@ -949,6 +1084,7 @@ static vsi_status _query_kernel
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e input2_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
int i = 0;

@ -960,9 +1096,10 @@ static vsi_status _query_kernel
}

input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

key = HASH_LAYERNORM_KEY( input0_dtype, output_dtype, kernel_type );
key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, output_dtype, kernel_type );

for( i = 0; i < _cnt_of_array(_layernorm_kernel_map); i ++ )
{

@ -1000,14 +1137,16 @@ static vsi_status _query_kernel_wh
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e input2_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
int i = 0;

input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

key = HASH_LAYERNORM_KEY( input0_dtype, F32, is2D_sumsqr );
key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, F32, is2D_sumsqr );

for( i = 0; i < _cnt_of_array(_sumsqr_kernel_map); i ++ )
{

@ -1031,7 +1170,7 @@ static vsi_status _query_kernel_wh
}


key = HASH_LAYERNORM_KEY( input0_dtype, output_dtype, is2D_wh );
key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, output_dtype, is2D_wh );

for( i = 0; i < _cnt_of_array(_sumsqr_kernel_map); i ++ )
{

@ -1256,17 +1395,25 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_LAYERNORM_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL;
int32_t rs_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" );
int32_t wh_flg = vsi_nn_kernel_param_get_int32( params, "wh_flg" );
int32_t optFlg = rs_flg || (outputs[0]->attr.dim_num < 3);
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
uint32_t *input_size = inputs[0]->attr.size;
uint32_t dims_num = inputs[0]->attr.dim_num;
int32_t rs_flg = 0;
int32_t optFlg = 0;

if (wh_flg)
if (input_size[0] >= GPU_TENSOR_MAX_WIDTH)
{
node = _setup_wh(graph, inputs, input_num, outputs, output_num, params, kernel);
goto final;
}

if ((input_size[1] * input_size[2] < GPU_TENSOR_MAX_WIDTH)
&& dims_num > 2)
{
rs_flg = 1;
}
optFlg = rs_flg || (outputs[0]->attr.dim_num < 3);

status = _query_kernel( inputs, outputs, kernel, optFlg);
if (VSI_SUCCESS != status)
{
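/* Dispatch summary, as far as this hunk shows: rows wider than
 * GPU_TENSOR_MAX_WIDTH are routed to the _setup_wh path, and the 2D
 * reshape fast path (rs_flg) is only taken when H*W still fits within
 * a single GPU image row. */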
@ -241,7 +241,7 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
pack_key = _PACK_SELECT_KEY( attr[0]->dtype,
attr[1]->dtype, attr[2]->dtype );

if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16)
if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == BF16)
|| ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) )
{
gpu_param.global_scale[0] = 8;

@ -241,7 +241,7 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
pack_key = _PACK_SELECT_KEY( attr[0]->dtype,
attr[1]->dtype, attr[2]->dtype );

if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16)
if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == BF16)
|| ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) )
{
gpu_param.global_scale[0] = 8;
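/* global_scale[0] is the element count handled per thread, so the
 * 16-bit output types (F16/I16, and BF16 with this change) take the
 * 8-wide path while the 8-bit cases elsewhere in these initializers
 * use 16. */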
@ -0,0 +1,460 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util.h"

__BEGIN_DECLS

#define _ONE_HOT_KERNEL_SOURCE "one_hot"

// Add kernel hashtable here
#define HASH_ONE_HOT_SH_KERNEL_NAME(SRC_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.one_hot_"#SRC_TYPE"to"#DST_TYPE)

#define ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE, IMG_2D ) \
(( IN_DTYPE << 9 ) | ( OUT_DTYPE << 1) | (IMG_2D))
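/* Key layout: input dtype from bit 9 upward, output dtype in bits 1..8,
 * and bit 0 for the 2D-image flag; the dtype enum values are small
 * enough that the fields cannot collide. */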
#define PACK_ONE_HOT_KERNEL_3D( IN_DTYPE, OUT_DTYPE ) \
{ ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0 ), \
CVIVANTE_NAMESPACE("evis.one_hot_"#IN_DTYPE"to"#OUT_DTYPE), \
_ONE_HOT_KERNEL_SOURCE }

#define PACK_ONE_HOT_KERNEL_2D( IN_DTYPE, OUT_DTYPE ) \
{ ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1 ), \
CVIVANTE_NAMESPACE("evis.one_hot_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \
_ONE_HOT_KERNEL_SOURCE }

typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _one_hot_kernel_map[] =
{
// Register kernel here
PACK_ONE_HOT_KERNEL_3D( U8, U8 ),
PACK_ONE_HOT_KERNEL_3D( U8, F16 ),
PACK_ONE_HOT_KERNEL_3D( I8, I8 ),
PACK_ONE_HOT_KERNEL_3D( I8, F16 ),
PACK_ONE_HOT_KERNEL_3D( I16, I16 ),
PACK_ONE_HOT_KERNEL_3D( I16, F16 ),
PACK_ONE_HOT_KERNEL_3D( F16, F16 ),
PACK_ONE_HOT_KERNEL_3D( F16, I16 ),
PACK_ONE_HOT_KERNEL_3D( F16, U8 ),
PACK_ONE_HOT_KERNEL_3D( F16, I8 ),

PACK_ONE_HOT_KERNEL_2D( U8, U8 ),
PACK_ONE_HOT_KERNEL_2D( U8, F16 ),
PACK_ONE_HOT_KERNEL_2D( I8, I8 ),
PACK_ONE_HOT_KERNEL_2D( I8, F16 ),
PACK_ONE_HOT_KERNEL_2D( I16, I16 ),
PACK_ONE_HOT_KERNEL_2D( I16, F16 ),
PACK_ONE_HOT_KERNEL_2D( F16, F16 ),
PACK_ONE_HOT_KERNEL_2D( F16, I16 ),
PACK_ONE_HOT_KERNEL_2D( F16, U8 ),
PACK_ONE_HOT_KERNEL_2D( F16, I8 ),
};


/*
* Kernel params
*/
static vx_param_description_t _one_hot_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define SCALAR_INPUT_SUFFIX_SIZE (2)
#define SCALAR_INPUT_ON_VALUE (3)
#define SCALAR_INPUT_OFF_VALUE (4)
#define _ONE_HOT_PARAM_NUM _cnt_of_array( _one_hot_kernel_param_def )

/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_one_hot_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
2, // workdim
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in thread
{0, 0, 0} // globalWorkSize: image size in thread
};

vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_int_array_t * in_shape = NULL;
int32_t suffix_size = 0;
int32_t depth = 0;
int32_t input_zp = 0;
float scaleIn = 1.0f;
int32_t srcFixPointPos = 0;
vsi_nn_kernel_dtype_e input_dtype = F16;

attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );

vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_SUFFIX_SIZE], &(suffix_size));

in_shape = attr[0]->shape;
depth = attr[1]->shape->data[1];
input_dtype = attr[0]->dtype;

if (VSI_NN_KERNEL_QUANT_DFP == attr[0]->quant)
{
srcFixPointPos = attr[0]->dfp.fl;
}
else if (VSI_NN_KERNEL_QUANT_ASYMM == attr[0]->quant)
{
input_zp = attr[0]->asymm.zero_point;
scaleIn = attr[0]->asymm.scale;
}

if (suffix_size == 1)
{
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;

depth = attr[1]->shape->data[0];
}
else
{
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
}

gpu_param.global_size[0] = gpu_align_p2(
(in_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = in_shape->data[1];
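/* Scheduling sketch: the 2D case handles four class indices per thread
 * (global_scale[0] = 4) and reads depth from dim 0 of the reshaped
 * output, while gpu_align_p2 rounds the X thread count up to a multiple
 * of 4. */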
switch (input_dtype)
{
case I16:
case I8:
case F16:
{
gpu_dp_inst_t uniDataConvert_0_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniDataConvert_1_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractInteger_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };

gpu_dp_inst_update_postshfit( &uniDataConvert_0_4x4, srcFixPointPos );
gpu_dp_inst_update_postshfit( &uniDataConvert_1_4x4, srcFixPointPos );

status = vsi_nn_kernel_gpu_add_param( node,
"uniDataConvert_0_4x4", &uniDataConvert_0_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniDataConvert_1_4x4", &uniDataConvert_1_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtract8Data_2x8", &uniExtractInteger_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"depth", &depth );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case U8:
{
gpu_dp_inst_t uniDataConvert_0_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniDataConvert_1_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractInteger_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
float input_tail = 0 - (float)input_zp * scaleIn;

status = vsi_nn_kernel_gpu_add_param( node,
"uniDataConvert_0_4x4", &uniDataConvert_0_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniDataConvert_1_4x4", &uniDataConvert_1_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtract8Data_2x8", &uniExtractInteger_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"input_scale", &scaleIn );
status |= vsi_nn_kernel_gpu_add_param( node,
"input_tail", &input_tail );
status |= vsi_nn_kernel_gpu_add_param( node,
"depth", &depth );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
default:
break;
}

status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(attr[0]);
SAFE_FREE_TENSOR_ATTR(attr[1]);
#undef SAFE_FREE_TENSOR_ATTR

return status;
} /* _one_hot_initializer() */


/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool image_2d
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _one_hot_kernel_map;
size_t kernel_map_size = _cnt_of_array( _one_hot_kernel_map );
vx_param_description_t * param_def = _one_hot_kernel_param_def;
vx_kernel_initialize_f initializer = _one_hot_initializer;

uint32_t key;
uint32_t i;

in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

key = ONE_HOT_HASH_KEY( in_dtype, out_dtype, image_2d );

for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _one_hot_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}

return status;
} /* _query_kernel() */

static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_ONE_HOT_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_nn_tensor_t* rs_tensors[2] = { NULL };
int32_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }};
int32_t i = 0;
vsi_bool image_2d = FALSE;
int32_t num_elements = vsi_nn_vxGetTensorElementNum(&inputs[0]->attr);
int32_t prefix_dim_size = 1;
int32_t suffix_dim_size = 0;
int32_t depth = vsi_nn_kernel_param_get_int32( params, "depth" );
uint32_t data_u32[2] = {0};
float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" );
float off_value = vsi_nn_kernel_param_get_float32( params, "off_value" );
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );

vsi_nn_Float32ToDtype(on_value, (uint8_t*)&data_u32[0], &outputs[0]->attr.dtype);
vsi_nn_Float32ToDtype(off_value, (uint8_t*)&data_u32[1], &outputs[0]->attr.dtype);

axis = axis == -1 ? (int32_t)inputs[0]->attr.dim_num : (int32_t)inputs[0]->attr.dim_num - axis;
for (i = 0; i < axis; i++)
{
prefix_dim_size *= inputs[0]->attr.size[i];
}

suffix_dim_size = num_elements / prefix_dim_size;
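/* Flattening scheme: dimensions outside the one-hot axis are folded
 * into prefix_dim_size and the remainder into suffix_dim_size, so the
 * kernel only ever sees a [suffix, prefix] view with depth inserted
 * between them. The axis is remapped first because attr.size stores
 * dimensions innermost-first while the operation counts the axis from
 * the outermost dimension. Example: a [4, 3] input with axis = -1 gives
 * prefix_dim_size = 12 and suffix_dim_size = 1, i.e. the 2D path. */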
if (suffix_dim_size == 1)
{
shape[0][0] = prefix_dim_size;
shape[0][1] = 1;
shape[1][0] = depth;
shape[1][1] = prefix_dim_size;
shape[1][2] = 1;
}
else
{
shape[0][0] = suffix_dim_size;
shape[0][1] = prefix_dim_size;
shape[1][0] = suffix_dim_size;
shape[1][1] = depth;
shape[1][2] = prefix_dim_size;
}

rs_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], (uint32_t*)shape[0], 2 );
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], (uint32_t*)shape[1], 3 );

if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[1]->attr.size,
rs_tensors[1]->attr.dim_num ) )
{
goto final;
}

image_2d = suffix_dim_size == 1;

status = _query_kernel( kernel, inputs, outputs, image_2d );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _ONE_HOT_PARAM_NUM,
&rs_tensors[0], input_num, &rs_tensors[1], output_num );
node_params[SCALAR_INPUT_SUFFIX_SIZE] = vsi_nn_kernel_scalar_create(
graph, I32, &suffix_dim_size );
node_params[SCALAR_INPUT_ON_VALUE] = vsi_nn_kernel_scalar_create(
graph, U32, &data_u32[0] );
node_params[SCALAR_INPUT_OFF_VALUE] = vsi_nn_kernel_scalar_create(
graph, U32, &data_u32[1] );

/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _ONE_HOT_PARAM_NUM );
CHECK_STATUS_FAIL_GOTO( status, final );
}
}
final:
if (rs_tensors[0])
{
vsi_nn_ReleaseTensor( &rs_tensors[0] );
}

if (rs_tensors[1])
{
vsi_nn_ReleaseTensor( &rs_tensors[1] );
}

for (i = SCALAR_INPUT_SUFFIX_SIZE; i < _ONE_HOT_PARAM_NUM; i++)
{
if (node_params[i])
{
vsi_nn_kernel_scalar_release( &node_params[i] );
}
}

return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_EVIS( one_hot, _setup )
@ -202,8 +202,28 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractRtoF32_part1_4x4 = {{
0x01010101, // TCfg
0x01010100, // ASelt
0x0000000c, 0x00060003, // ABin
0x01010000, // ASelt
0x000f000c, 0x00050002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractRtoF32_part2_4x4 = {{
0x01010101, // TCfg
0x01000000, // ASelt
0x000b0008, 0x0001000e, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractRtoF32_part3_4x4 = {{
0x01010101, // TCfg
0x01010101, // ASelt
0x00070004, 0x000d000a, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift

@ -223,7 +243,27 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
gpu_dp_inst_t uniExtractGtoF32_part1_4x4 = {{
0x01010101, // TCfg
0x01010100, // ASelt
0x0001000d, 0x00070004, // ABin
0x0000000d, 0x00060003, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractGtoF32_part2_4x4 = {{
0x01010101, // TCfg
0x01000000, // ASelt
0x000c0009, 0x0002000f, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractGtoF32_part3_4x4 = {{
0x01010101, // TCfg
0x01010101, // ASelt
0x00080005, 0x000e000b, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift

@ -243,7 +283,27 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
gpu_dp_inst_t uniExtractBtoF32_part1_4x4 = {{
0x01010101, // TCfg
0x01010100, // ASelt
0x0002000e, 0x00080005, // ABin
0x0001000e, 0x00070004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractBtoF32_part2_4x4 = {{
0x01010101, // TCfg
0x01010000, // ASelt
0x000d000a, 0x00030000, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractBtoF32_part3_4x4 = {{
0x01010101, // TCfg
0x01010101, // ASelt
0x00090006, 0x000f000c, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
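/* Judging by the ABin byte indices, the uniExtract{R,G,B}toF32_part0..3
 * tables gather every third byte of the packed RGB stream (offsets 0, 1
 * and 2 for R, G and B) and widen four values per issue; the ASelt/ABin
 * edits above re-index which source bytes feed each lane. */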
@ -358,7 +418,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
case _PACK_SELECT_KEY( 1, 0, 0): // copy
case _PACK_SELECT_KEY( 1, 2, 0): // copy reorder
{
shaderParam.global_scale[0] = 8;
if (attr[0]->dtype == I8 || attr[0]->dtype == U8)
{
shaderParam.global_scale[0] = 16;
}
else
{
shaderParam.global_scale[0] = 8;
}
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)

@ -366,7 +433,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
shaderParam.global_size[1] = height;
shaderParam.global_size[2] = 1;

if(attr[0]->dtype == F16)
if (attr[0]->dtype == F16)
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8);
}

@ -376,10 +443,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
}
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractRtoF32_part0_4x4", &uniExtractRtoF32_part0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractRtoF32_part1_4x4", &uniExtractRtoF32_part1_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractRtoF32_part2_4x4", &uniExtractRtoF32_part2_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractRtoF32_part3_4x4", &uniExtractRtoF32_part3_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGtoF32_part0_4x4", &uniExtractGtoF32_part0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGtoF32_part1_4x4", &uniExtractGtoF32_part1_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGtoF32_part2_4x4", &uniExtractGtoF32_part2_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGtoF32_part3_4x4", &uniExtractGtoF32_part3_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part0_4x4", &uniExtractBtoF32_part0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part1_4x4", &uniExtractBtoF32_part1_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part2_4x4", &uniExtractBtoF32_part2_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part3_4x4", &uniExtractBtoF32_part3_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "r_order", &reorder);
status |= vsi_nn_kernel_gpu_add_param(node, "b_order", &order1);
CHECK_STATUS_FAIL_GOTO(status, OnError);
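/* With the branch above, the copy variants process 16 pixels per thread
 * for 8-bit outputs and keep the 8-wide path for F16. */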
@ -43,6 +43,7 @@ __BEGIN_DECLS
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toI8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toF16")

#define KERNEL_SOURCE_1 "pre_process_yuv420_scale_u8",
#define KERNEL_SOURCE_2 "pre_process_yuv420_copy_u8",

@ -77,6 +78,7 @@ static const struct {
TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_5)
TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, F16, COPY, KERNEL_SOURCE_2)
};

static vx_param_description_t vxPreProcessYuv420Kernel_param_def[] =

@ -155,10 +157,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
}

shaderParam.global_scale[0] = 16;
if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
{
shaderParam.global_scale[0] = 8;
}
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)

@ -418,6 +416,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
switch( attr[0]->dtype )
{
case U8:
case F16:
{
// R
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpR1st_4x4", &uniCalculateTmpR1st_4x4);

@ -866,7 +865,7 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

if (enable_copy && output_dtype == U8)
if (enable_copy && (output_dtype == U8 || output_dtype == F16))
{
convert_type = COPY;
}

@ -890,7 +889,7 @@ static vsi_status _query_kernel
kernel->info.parameters = vxPreProcessYuv420Kernel_param_def;
kernel->info.numParams = _cnt_of_array( vxPreProcessYuv420Kernel_param_def );

if (enable_copy && output_dtype == U8)
if (enable_copy && (output_dtype == U8 || output_dtype == F16))
{
kernel->info.initialize = _pre_process_yuv420_copy_initializer;
}
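/* The U8-to-F16 copy case reuses the existing yuv420 copy source and
 * initializer; only the dtype gates and the kernel table entry change.
 * The yuv444 file below receives the same treatment. */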
@ -43,6 +43,7 @@ __BEGIN_DECLS
#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toI8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_copy_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_copy_U8toF16")

#define KERNEL_SOURCE_1 "pre_process_yuv444_scale",
#define KERNEL_SOURCE_3 "pre_process_yuv444_scale_fp16",

@ -75,6 +76,7 @@ static const struct {
TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, COPY, KERNEL_SOURCE_4)
TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, F16, COPY, KERNEL_SOURCE_4)
};

static vx_param_description_t vxPreProcessYuv444Kernel_param_def[] =

@ -145,10 +147,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
}

shaderParam.global_scale[0] = 16;
if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
{
shaderParam.global_scale[0] = 8;
}
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)

@ -400,6 +398,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
switch( attr[0]->dtype )
{
case U8:
case F16:
{
// R
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpR1st_4x4", &uniCalculateTmpR1st_4x4);

@ -841,7 +840,7 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

if (enable_copy && output_dtype == U8)
if (enable_copy && (output_dtype == U8 || output_dtype == F16))
{
convert_type = COPY;
}

@ -865,7 +864,7 @@ static vsi_status _query_kernel
kernel->info.parameters = vxPreProcessYuv444Kernel_param_def;
kernel->info.numParams = _cnt_of_array( vxPreProcessYuv444Kernel_param_def );

if (enable_copy && output_dtype == U8)
if (enable_copy && (output_dtype == U8 || output_dtype == F16))
{
kernel->info.initialize = _pre_process_yuv444_copy_initializer;
}
@ -0,0 +1,609 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"

__BEGIN_DECLS

/*
* Define kernel meta.
*/

#define KERNEL_SOURCE_1 "repeat"
#define KERNEL_SOURCE_2 "repeat_axis1"

#define HASH_PREPROCESS_STARTID_SH_KERNEL_NAME \
CVIVANTE_NAMESPACE("evis.preprocess_start_idx")

#define HASH_REPEAT_SH_KERNEL_1D_NAME(SRC0_TYPE) \
CVIVANTE_NAMESPACE("evis.repeat_"#SRC0_TYPE"_1D")

#define HASH_REPEAT_SH_KERNEL_NAME(SRC0_TYPE, AXIS) \
CVIVANTE_NAMESPACE("evis.repeat_"#SRC0_TYPE"_axis"#AXIS)

// Add kernel hashtable here
#define HASH_PREPROCESS_KEY(_input0_type, _output_type) \
((_input0_type << 24) | (_output_type << 16))

#define HASH_REPEAT_KEY(_input0_type, _output_type, _is1d, _axis) \
((_input0_type << 24) | (_output_type << 16) | (_is1d << 8) | _axis)
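/* Key layout: input dtype in bits 24..31, output dtype in bits 16..23,
 * the 1D flag in bits 8..15, and the repeat axis in the low byte. */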
#define TENSOR_PREPROCESS_STARTID_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_PREPROCESS_KEY(IN0_TYPE, OUT_TYPE), \
HASH_PREPROCESS_STARTID_SH_KERNEL_NAME, \
SOURCE },

#define TENSOR_REPEAT_KERNELS(IN0_TYPE, OUT_TYPE, AXIS, SOURCE) \
{ HASH_REPEAT_KEY(IN0_TYPE, OUT_TYPE, 0, AXIS), \
HASH_REPEAT_SH_KERNEL_NAME(IN0_TYPE, AXIS), \
SOURCE },

#define TENSOR_REPEAT_1D_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_REPEAT_KEY(IN0_TYPE, OUT_TYPE, 1, 0), \
HASH_REPEAT_SH_KERNEL_1D_NAME(IN0_TYPE), \
SOURCE },

typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;

static const _kernel_map_type _preprocess_kernel_map[] =
{
// Register kernel here
TENSOR_PREPROCESS_STARTID_KERNELS( I32, I32, KERNEL_SOURCE_1 )
};

static const _kernel_map_type _repeat_kernel_map[] =
{
// Register kernel here
TENSOR_REPEAT_KERNELS( U8, U8, 0, KERNEL_SOURCE_1 )
TENSOR_REPEAT_KERNELS( U8, U8, 1, KERNEL_SOURCE_2 )
TENSOR_REPEAT_KERNELS( U8, U8, 2, KERNEL_SOURCE_1 )
TENSOR_REPEAT_KERNELS( I16, I16, 0, KERNEL_SOURCE_1 )
TENSOR_REPEAT_KERNELS( I16, I16, 1, KERNEL_SOURCE_2 )
TENSOR_REPEAT_KERNELS( I16, I16, 2, KERNEL_SOURCE_1 )

TENSOR_REPEAT_1D_KERNELS( U8, U8, KERNEL_SOURCE_1 )
TENSOR_REPEAT_1D_KERNELS( I16, I16, KERNEL_SOURCE_1 )
};

/*
* Kernel params
*/
static vx_param_description_t _preprocess_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _REPEAT_PREPROCESS_PARAM_NUM _cnt_of_array( _preprocess_kernel_param_def )

static vx_param_description_t _repeat_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _REPEAT_PARAM_NUM _cnt_of_array( _repeat_kernel_param_def )

/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_preprocess_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in thread
{0, 0, 0}}; // globalWorkSize: image size in thread

vsi_nn_kernel_tensor_attr_t* attr[1] = {NULL};
int32_t width = 0;

attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );

width = attr[0]->shape->data[0];

shaderParam.global_scale[0] = 16;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.local_size[0] = 32;
shaderParam.local_size[1] = 1;
shaderParam.local_size[2] = 1;
shaderParam.global_size[0] = 32;
shaderParam.global_size[1] = 1;
shaderParam.global_size[2] = 1;

status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);

{
gpu_dp_inst_t uniIntegralHorAcc_4x4 = {{
0xff3f0f03, // TCfg
0x00000000, // ASelt
0x00100000, 0x32100210, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param(node, "uniIntegralHorAcc_4x4", &uniIntegralHorAcc_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}

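/* This preprocess pass appears designed to turn the per-row repeat
 * counts into running start indices: uniIntegralHorAcc_4x4 accumulates
 * horizontal partial sums, and a single 32-thread workgroup walks the
 * whole length tensor so the repeat kernel later knows where each
 * output row begins. */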
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}

return status;
}

DEF_KERNEL_INITIALIZER(_repeat_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in thread
{0, 0, 0}}; // globalWorkSize: image size in thread

vsi_nn_kernel_tensor_attr_t* attr[1] = {NULL};
vsi_int_array_t * input_shape = NULL;
int32_t height = 0, width = 0, chn = 0;
int32_t is1d = 0;
int32_t axis = 0;

attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );

status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis);
CHECK_STATUS_FAIL_GOTO(status, OnError );

input_shape = attr[0]->shape;
width = input_shape->data[0];
height = input_shape->data[1];
if (height == 1 && input_shape->size == 2)
{
is1d = 1;
}
chn = input_shape->size > 2 ? input_shape->data[2] : 1;

if ((axis == 0 && is1d == 0) || axis == 2)
{
shaderParam.global_scale[0] = 16;
if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
{
shaderParam.global_scale[0] = 8;
}

shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
}
else if (is1d)
{
shaderParam.global_scale[0] = 1;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
}
else if (axis == 1)
{
shaderParam.global_scale[0] = 1;
shaderParam.global_scale[1] = 8;
shaderParam.global_scale[2] = 1;
}
shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = (height + shaderParam.global_scale[1] - 1)
/ shaderParam.global_scale[1];
shaderParam.global_size[2] = chn;

status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);

{
gpu_dp_inst_t uniExtract1to8Short_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x00000000, 0x00000000, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };

status = vsi_nn_kernel_gpu_add_param(node, "uniExtract1to8Short_2x8", &uniExtract1to8Short_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}

OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}

return status;
}

/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel_preprocess,
vsi_nn_kernel_t* kernel,
int32_t axis
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e input1_dtype = I32;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
int32_t is1d = inputs[0]->attr.dim_num == 1 ? 1 : 0;
int i = 0;

input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

if (input0_dtype == F16)
{
input0_dtype = I16;
}
if (output_dtype == F16)
{
output_dtype = I16;
}

if (input0_dtype == I8)
{
input0_dtype = U8;
}
if (output_dtype == I8)
{
output_dtype = U8;
}

key = HASH_PREPROCESS_KEY( input1_dtype, I32 );

for( i = 0; i < _cnt_of_array(_preprocess_kernel_map); i ++ )
{
if ( _preprocess_kernel_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(_preprocess_kernel_map) )
{
snprintf( kernel_preprocess->info.name, VX_MAX_KERNEL_NAME, "%s", _preprocess_kernel_map[i].function_name );
kernel_preprocess->info.parameters = _preprocess_kernel_param_def;
kernel_preprocess->info.numParams = _REPEAT_PREPROCESS_PARAM_NUM;
kernel_preprocess->info.initialize = _preprocess_initializer;

vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
_preprocess_kernel_map[i].source_name );
vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
_preprocess_kernel_map[i].source_name );
}


key = HASH_REPEAT_KEY( input0_dtype, output_dtype, is1d, axis );

for( i = 0; i < _cnt_of_array(_repeat_kernel_map); i ++ )
{
if ( _repeat_kernel_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(_repeat_kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _repeat_kernel_map[i].function_name );
kernel->info.parameters = _repeat_kernel_param_def;
kernel->info.numParams = _REPEAT_PARAM_NUM;
kernel->info.initialize = _repeat_initializer;

vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
_repeat_kernel_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
_repeat_kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */

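/* Since repeat only moves elements, F16 is aliased to I16 and I8 to U8
 * before hashing: one shader per element width is enough. */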
static int32_t _optimize_repeat_shape
(
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
int32_t* axis,
int32_t* opt_shape_in,
int32_t* opt_shape_out,
int32_t* new_rank
)
{
vsi_status status = VSI_SUCCESS;

if (inputs[0]->attr.dim_num == 1)
{
opt_shape_in[0] = inputs[0]->attr.size[0];
opt_shape_in[1] = 1;
opt_shape_out[0] = outputs[0]->attr.size[0];
opt_shape_out[1] = 1;
new_rank[0] = 2;
new_rank[1] = 2;
}
else if (axis[0] == 3)
{
vsi_nn_kernel_optimize_element_shape( (int32_t*)inputs[0]->attr.size, 3, opt_shape_in, new_rank );
if (opt_shape_in[1] == 1)
{
opt_shape_in[1] = inputs[0]->attr.size[3];
opt_shape_out[0] = opt_shape_in[0];
opt_shape_out[1] = outputs[0]->attr.size[3];
axis[0] = 0;
new_rank[0] = 2;
new_rank[1] = 2;
}
else if (new_rank[0] == 2)
{
opt_shape_in[2] = inputs[0]->attr.size[3];
opt_shape_out[0] = opt_shape_in[0];
opt_shape_out[1] = opt_shape_in[1];
opt_shape_out[2] = outputs[0]->attr.size[3];
axis[0] = 2;
new_rank[0] = 3;
new_rank[1] = 3;
}
else
{
status = VSI_FAILURE;
}
}

return status;
}

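/* Illustrative example: a rank-4 input of size [2, 3, 4, 5] with axis 3
 * collapses the first three dimensions, yielding an input view of
 * [24, 5], an output view of [24, outputs[0]->attr.size[3]] and the
 * axis remapped to 0, so the generic axis-0 kernel can serve the
 * rank-4 case. */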
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t preprocess_node_params[_REPEAT_PREPROCESS_PARAM_NUM] = { NULL };
|
||||
vsi_nn_kernel_node_param_t node_params[_REPEAT_PARAM_NUM] = { NULL };
|
||||
vsi_nn_kernel_node_t tmp_node = NULL;
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
vsi_nn_tensor_attr_t attr;
|
||||
vsi_nn_kernel_t * kernel_preprocess = NULL;
|
||||
vsi_nn_tensor_t * tensor_preprocess = NULL;
|
||||
vsi_nn_kernel_tensor_t rs_input = NULL, rs_input1 = NULL, rs_output = NULL;
|
||||
int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }};
|
||||
int32_t new_rank[2] = {0, 0};
|
||||
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
|
||||
|
||||
// Check if gpu can support the size
|
||||
if ( !vsi_nn_kernel_gpu_check_shape(
|
||||
(int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) )
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (axis > 2 || outputs[0]->attr.dim_num == 1)
|
||||
{
|
||||
status = _optimize_repeat_shape(inputs, outputs, &axis, new_shape[0], new_shape[1], new_rank);
|
||||
if ( VSI_SUCCESS != status )
|
||||
{
|
||||
goto final;
|
||||
}
|
||||
rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], new_rank[0]);
|
||||
rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], new_rank[1]);
|
||||
}
|
||||
|
||||
if (inputs[1]->attr.dim_num == 1)
|
||||
{
|
||||
new_shape[0][0] = inputs[1]->attr.size[0];
|
||||
new_shape[0][1] = 1;
|
||||
rs_input1 = vsi_nn_kernel_tensor_reshape(inputs[1]->t, new_shape[0], 2);
|
||||
}
|
||||
|
||||
kernel_preprocess = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
|
||||
// Assign unique_id
|
||||
kernel_preprocess->unique_id = kernel->unique_id;
|
||||
|
||||
status = _query_kernel( inputs, outputs, kernel_preprocess, kernel, axis );
|
||||
if ( VSI_SUCCESS != status )
|
||||
{
|
||||
goto final;
|
||||
}
|
||||
|
||||
memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) );
|
||||
attr.dtype.vx_type = VSI_NN_TYPE_INT32;
|
||||
attr.is_const = FALSE;
|
||||
attr.vtl = TRUE;
|
||||
attr.size[0] = inputs[1]->attr.size[0];
|
||||
attr.size[1] = 1;
|
||||
attr.dim_num = 2;
|
||||
tensor_preprocess = vsi_nn_CreateTensor( graph, &attr );
|
||||
|
||||
// preprocess
|
||||
tmp_node = vsi_nn_kernel_create_node( graph, kernel_preprocess );
|
||||
if (tmp_node)
|
||||
{
|
||||
uint32_t index = 0;
|
||||
if (rs_input1)
|
||||
{
|
||||
preprocess_node_params[index++] = rs_input1;
|
||||
}
|
||||
else
|
||||
{
|
||||
preprocess_node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t;
|
||||
}
|
||||
preprocess_node_params[index++] = (vsi_nn_kernel_node_param_t)tensor_preprocess->t;
|
||||
|
||||
status = vsi_nn_kernel_node_pass_param( tmp_node, preprocess_node_params,
|
||||
_REPEAT_PREPROCESS_PARAM_NUM );
|
||||
CHECK_STATUS(status);
|
||||
{
|
||||
// Set default border mode.
|
||||
vx_border_t border;
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U8 = 0;
|
||||
border.constant_value.U16 = 0;
|
||||
border.constant_value.S32 = 0;
|
||||
if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8)
|
||||
{
|
||||
border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point;
|
||||
}
|
||||
status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) );
|
||||
CHECK_STATUS(status);
|
||||
}
|
||||
}

    // repeat
    node = vsi_nn_kernel_create_node( graph, kernel );
    if (node)
    {
        uint32_t index = 0;
        if (rs_input)
        {
            node_params[index++] = rs_input;
        }
        else
        {
            node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t;
        }
        if (rs_input1)
        {
            node_params[index++] = rs_input1;
        }
        else
        {
            node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t;
        }
        node_params[index++] = (vsi_nn_kernel_node_param_t)tensor_preprocess->t;
        if (rs_output)
        {
            node_params[index++] = rs_output;
        }
        else
        {
            node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t;
        }
        node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis );

        status = vsi_nn_kernel_node_pass_param( node, node_params,
                _REPEAT_PARAM_NUM );
        CHECK_STATUS(status);
        vsi_nn_kernel_scalar_release( &node_params[4] );
        {
            // Set default border mode.
            vx_border_t border;
            border.mode = VX_BORDER_REPLICATE;
            status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
            CHECK_STATUS(status);
        }
    }

final:
    if (rs_input)
    {
        vsi_nn_kernel_tensor_release( &rs_input );
    }
    if (rs_input1)
    {
        vsi_nn_kernel_tensor_release( &rs_input1 );
    }
    if (rs_output)
    {
        vsi_nn_kernel_tensor_release( &rs_output );
    }
    if ( kernel_preprocess )
    {
        vsi_nn_kernel_release( &kernel_preprocess );
    }
    if ( tensor_preprocess )
    {
        vsi_nn_ReleaseTensor( &tensor_preprocess );
    }
    if (tmp_node)
    {
        vsi_nn_kernel_node_release( &tmp_node );
    }

    return node;
} /* _setup() */

__END_DECLS

REGISTER_BACKEND_EVIS( repeat, _setup )
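
/* For reference, a minimal sketch of how an op implementation would reach
 * this backend through the kernel framework (illustrative only; the exact
 * call in the REPEAT op may differ):
 *
 *     vsi_nn_kernel_param_t * param = vsi_nn_kernel_param_create();
 *     vsi_nn_kernel_param_add_int32( param, "axis", axis );
 *     self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "repeat",
 *             inputs, 2, outputs, 1, param );
 *     vsi_nn_kernel_param_release( &param );
 */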

@ -49,11 +49,13 @@ typedef enum
    UP,
    UP_OPT,
    UP_2X_HALF,
    UP_3X_HALF,
    UP_4X_HALF,
} _internal_scale_e;

#define _RESIZE_BILINEAR_KERNEL_SOURCE(_input_type) "resize_bilinear_"#_input_type
#define _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(_input_type) "resize_bilinear_"#_input_type"_opt"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_2X(_input_type) "resize_bilinear_"#_input_type"_UP_2X"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers"

#define STR(a) #a
// Add kernel hashtable here

@ -77,8 +79,21 @@ typedef enum

#define PACK_KERNEL_MAP_UP_2X_HALF( IN_DTYPE, OUT_DTYPE ) \
        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF ), \
          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP_2X_half"), \
          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_2X(IN_DTYPE) }
          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
          "_SAME_2x_upsample_half_pixel_centers"), \
          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) }

#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \
        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF ), \
          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
          "_SAME_4x_upsample_half_pixel_centers"), \
          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) }

#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \
        { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF ), \
          CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
          "_SAME_3x_upsample_half_pixel_centers"), \
          _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) }

typedef struct
{

@ -103,6 +118,8 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] =
    PACK_KERNEL_MAP_UP(BF16, BF16),
    PACK_KERNEL_MAP_UP_OPT(U8, U8),
    PACK_KERNEL_MAP_UP_2X_HALF(U8, U8),
    PACK_KERNEL_MAP_UP_3X_HALF(U8, U8),
    PACK_KERNEL_MAP_UP_4X_HALF(U8, U8),
};

@ -203,8 +220,10 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
    uint32_t out_height;
    float half_pixel_value = 0.0f;
    vsi_bool is_use_scale_kernel = (vsi_bool)(_RESIZE_BILINEAR_PARAM_NUM == param_size);
    vsi_bool is_use_2x_up_half_kernel = FALSE;

    vsi_bool is_half_pixel_centers = FALSE;
    vsi_bool is_2x_up_kernel = FALSE;
    vsi_bool is_3x_up_kernel = FALSE;
    vsi_bool is_4x_up_kernel = FALSE;

    input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
    CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );

@ -254,11 +273,13 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
        half_pixel_value = 0.0f;
    }

    if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr)))
    is_half_pixel_centers = (!align_corners) && (half_pixel_centers);

    if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr)) && is_half_pixel_centers)
    {
        is_use_2x_up_half_kernel = (!align_corners) && (half_pixel_centers);
        is_use_2x_up_half_kernel = is_use_2x_up_half_kernel && \
            (2 * in_width == out_width) && (2 * in_height == out_height);
        is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height);
        is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height);
        is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height);
    }

    if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )

@ -309,11 +330,17 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
        outputZP = 0;
    }

    if (is_use_2x_up_half_kernel)
    if (is_2x_up_kernel || is_4x_up_kernel)
    {
        gpu_param.global_scale[0] = 8;
        gpu_param.global_scale[0] = 16;
        gpu_param.global_scale[1] = 1;
    }
    else if (is_3x_up_kernel)
    {
        gpu_param.global_scale[0] = 15;
        gpu_param.global_scale[1] = 6;
        gpu_param.global_scale[2] = 1;
    }
    else
    {
        gpu_param.global_scale[0] = 4;

@ -321,28 +348,134 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
        gpu_param.global_scale[2] = 1;
    }

    if (is_use_2x_up_half_kernel)
    if (is_2x_up_kernel)
    {
        gpu_dp_inst_t uniResize2xUp_4x8 = {{
        gpu_dp_inst_t uniResize2xUp_0_4x8 = {{
            0x55555555, 0x55555555, // TCfg
            0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect
            0x00000400, // AccumType, ConstantType, and PostShift
            0x00000704, // AccumType, ConstantType, and PostShift
            0x09030301, 0x03090103, 0x09030301, 0x03090103,
            0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniResize2xUpRound_2x8 = {{
            0x55555555, // TCfg
            0x44444444, // ASelt
            0x03020100, 0x07060504, // ABin
            0xaaaaaaaa, // BSelt
            0x00000000, 0x00000000, // BBin
        gpu_dp_inst_t uniResize2xUp_1_4x8 = {{
            0x55555555, 0x55555555, // TCfg
            0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect
            0x00000704, // AccumType, ConstantType, and PostShift
            0x00010001, 0x00010001, 0x00010001, 0x00010001,
            0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
            0x09030301, 0x03090103, 0x09030301, 0x03090103,
            0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
        }, GPU_DP_TYPE_16};

        status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_4x8", &uniResize2xUp_4x8);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUpRound_2x8", &uniResize2xUpRound_2x8);
        status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8);
        status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
        CHECK_STATUS_FAIL_GOTO(status, final );
    }
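    /* Sanity check on the 2x weights: with half_pixel_centers each output
     * sample lands 1/4 or 3/4 of the way between two input samples, so the
     * separable bilinear weights are {3/4, 1/4} per axis and the 2-D
     * products are {9, 3, 3, 1}/16, exactly the byte weights packed into
     * the Constant words 0x09030301/0x03090103. The post-shift encoded in
     * 0x00000704 then appears to rescale the accumulated sum by 1/16. */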
    else if (is_3x_up_kernel)
    {
        gpu_dp_inst_t uniResize3xUp_l00_2x8 = {{
            0x15515515, // TCfg
            0x00000000, // ASelt
            0x21210110, 0x03323202, // ABin
            0x2aa2aa2a, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000610, // AccumType, ConstantType, and PostShift
            0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555,
            0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniResize3xUp_l01_2x8 = {{
            0x05155155, // TCfg
            0x00000000, // ASelt
            0x54044343, 0x00650554, // ABin
            0x0a2aa2aa, // BSelt
            0x00000000, 0x00000000, // BBin
            0x00000610, // AccumType, ConstantType, and PostShift
            0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa,
            0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniResize3xUp_l10_4x4 = {{
            0x55551155, // TCfg
            0x50501050, // ASelt
            0x01011010, 0x21212121, // ABin
            0xaaaa22aa, // BSelt
            0x00000000, 0x00000000, // BBin
            0x0000060f, // AccumType, ConstantType, and PostShift
            0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
            0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniResize3xUp_l11_4x4 = {{
            0x11555511, // TCfg
            0x10505010, // ASelt
            0x32320202, 0x03033232, // ABin
            0x22aaaa22, // BSelt
            0x00000000, 0x00000000, // BBin
            0x0000060f, // AccumType, ConstantType, and PostShift
            0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72,
            0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniResize3xUp_l12_4x4 = {{
            0x55115555, // TCfg
            0x50105050, // ASelt
            0x43434343, 0x54540404, // ABin
            0xaa22aaaa, // BSelt
            0x00000000, 0x00000000, // BBin
            0x0000060f, // AccumType, ConstantType, and PostShift
            0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39,
            0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniResize3xUp_l13_4x4 = {{
            0x00551155, // TCfg
            0x00501050, // ASelt
            0x05055454, 0x00006565, // ABin
            0x00aa22aa, // BSelt
            0x00000000, 0x00000000, // BBin
            0x0000060f, // AccumType, ConstantType, and PostShift
            0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
            0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant
        }, GPU_DP_TYPE_16};

        status = vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4);
        CHECK_STATUS_FAIL_GOTO(status, final );
    }
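    /* For 3x upsampling with half-pixel centers every output sample is
     * either aligned with an input sample or offset by 1/3, so the per-axis
     * weights are {2/3, 1/3} plus an exact-copy tap; the Constant fields
     * hold these fractions in fixed point (0x5556 ~ 2/3 and 0x2aab ~ 1/3 at
     * the 4x4 post-shift of 15, with the 2-D products 4/9, 2/9 and 1/9
     * appearing as 0x38e4, 0x1c72 and 0x0e39). The global_scale of {15, 6}
     * above presumably makes each work-item cover a 15x6 output tile, i.e.
     * 5 input columns and 2 input rows expanded 3x. */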
    else if (is_4x_up_kernel)
    {
        gpu_dp_inst_t uniResize4xUp_l00_4x8 = {{
            0x55555555, 0x55555555, // TCfg
            0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
            0x00000406, // AccumType, ConstantType, and PostShift
            0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
            0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniResize4xUp_l01_4x8 = {{
            0x55555555, 0x55555555, // TCfg
            0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
            0x00000406, // AccumType, ConstantType, and PostShift
            0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
            0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniResize4xUp_l10_4x8 = {{
            0x55555555, 0x55555555, // TCfg
            0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
            0x00000406, // AccumType, ConstantType, and PostShift
            0x23150503, 0x31070701, 0x07310107, 0x15230305,
            0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
        }, GPU_DP_TYPE_16};
        gpu_dp_inst_t uniResize4xUp_l11_4x8 = {{
            0x55555555, 0x55555555, // TCfg
            0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
            0x00000406, // AccumType, ConstantType, and PostShift
            0x23150503, 0x31070701, 0x07310107, 0x15230305,
            0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
        }, GPU_DP_TYPE_16};

        status = vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8);
        status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8);
        status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
        CHECK_STATUS_FAIL_GOTO(status, final );
    }
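    /* For 4x upsampling the fractional offsets are 1/8 and 3/8 on either
     * side of an input sample, giving per-axis weights from {7/8, 5/8, 3/8,
     * 1/8}; the Constant bytes are the 2-D products scaled by 64 (e.g.
     * 0x19 = 25 = 64*(5/8)(5/8), 0x23 = 35 = 64*(7/8)(5/8), 0x0f = 15 =
     * 64*(5/8)(3/8)), and the post-shift in 0x00000406 appears to divide
     * by 64. With global_scale[0] = 16, each work-item emits 16 output
     * pixels per row, i.e. 4 input pixels expanded 4x. */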

@ -832,13 +965,13 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
        goto final;
    }

    if (!is_use_2x_up_half_kernel)
    if (!is_2x_up_kernel && !is_3x_up_kernel && !is_4x_up_kernel)
    {
        status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value);
        CHECK_STATUS_FAIL_GOTO(status, final );
    }

    if (is_use_2x_up_half_kernel)
    if (is_2x_up_kernel || is_4x_up_kernel)
    {
        gpu_param.global_size[0] = gpu_align_p2((out_width + \
            gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);

@ -860,8 +993,6 @@ final:
    return status;
} /* _resize_bilinear_initializer() */


/*
 * Query kernel
 */

@ -872,7 +1003,8 @@ static vsi_status _query_kernel
    vsi_nn_tensor_t * const * const outputs,
    vsi_bool is_same_type,
    vsi_bool is_evis2,
    vsi_bool is_2x_up_half,
    int32_t align_corners,
    int32_t half_pixel_centers,
    vsi_bool *is_run_opt_kernel
    )
{

@ -886,17 +1018,35 @@ static vsi_status _query_kernel
    vx_kernel_initialize_f initializer = _resize_bilinear_initializer;
    uint32_t key;
    uint32_t i;
    vsi_bool is_2x_upsample = (2 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
        && (2 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
    vsi_bool is_3x_upsample = (3 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
        && (3 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
    vsi_bool is_4x_upsample = (4 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
        && (4 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
    _internal_scale_e scale_flag = UP;

    in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
    out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );

    is_2x_upsample &= (in_dtype == U8);
    is_3x_upsample &= (in_dtype == U8);
    is_4x_upsample &= (in_dtype == U8);

    if (outputs[0]->attr.size[0] > inputs[0]->attr.size[0])
    {
        if (is_2x_up_half)
        if (is_same_type && (!align_corners) && (half_pixel_centers) && is_2x_upsample)
        {
            scale_flag = UP_2X_HALF;
        }
        else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_3x_upsample)
        {
            scale_flag = UP_3X_HALF;
        }
        else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_4x_upsample)
        {
            scale_flag = UP_4X_HALF;
        }
        else if (is_same_type && is_evis2)
        {
            scale_flag = UP_OPT;

@ -920,19 +1070,6 @@ static vsi_status _query_kernel
        }
    }

    if ((UP_2X_HALF == scale_flag) && (i >= kernel_map_size) && is_same_type && is_evis2)
    {
        scale_flag = UP_OPT;
        key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag );
        for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
        {
            if( kernel_map[i].key == key )
            {
                break;
            }
        }
    }

    if ((UP_OPT == scale_flag) && (i >= kernel_map_size))
    {
        scale_flag = UP;

@ -1109,9 +1246,6 @@ OnError:
    return scale;
}


static vsi_nn_kernel_node_t _setup
    (
    vsi_nn_graph_t * graph,

@ -1131,14 +1265,10 @@ static vsi_nn_kernel_node_t _setup
    vsi_bool is_same_type = vsi_nn_is_same_type(inputs[0], outputs[0]);
    vsi_bool is_evis2 = (vsi_bool)(graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_2);
    vsi_bool is_run_opt_kernel = FALSE;
    vsi_bool is_2x_up_half = FALSE;
    vsi_nn_tensor_t* scale = NULL;

    is_2x_up_half = is_same_type && (!align_corners) && (half_pixel_centers);
    is_2x_up_half = is_2x_up_half && (2 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
        && (2 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
    status = _query_kernel( kernel, inputs, outputs, is_same_type, is_evis2,
        is_2x_up_half, &is_run_opt_kernel);
        align_corners, half_pixel_centers, &is_run_opt_kernel);
    if( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );

Some files were not shown because too many files have changed in this diff.