Update internal to 1.1.32.1

SHA: 215204
Signed-off-by: Kainan Cha <kainan.zha@verisilicon.com>

parent 8fb3a7e6fb  commit 4d4bc08d6a
@@ -195,14 +195,6 @@ cc_library(
"src/kernel/vsi_nn_kernel_param.c",
"src/kernel/vsi_nn_gpu.c",
"src/kernel/vsi_nn_kernel_gpu_shape_optimize.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c",
"src/libnnext/vsi_nn_libnnext_resource.c",
"src/libnnext/vsi_nn_vxkernel.c",
] + [":kernel_srcs"]
@@ -156,3 +156,5 @@ DEF_OP(ERF)
DEF_OP(ONE_HOT)
DEF_OP(NMS)
DEF_OP(GROUPED_CONV1D)
DEF_OP(SCATTER_ND_UPDATE)
DEF_OP(GELU)
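For context (not part of the diff): in ovxlib this ops list is consumed by redefining DEF_OP before including it, so each new entry expands into an operation id. A minimal sketch of that convention, assuming the stock macro layout and that the list lives in the usual ops .def file:

/* sketch only: typical pattern for consuming an ops .def list */
#define DEF_OP( NAME )  VSI_NN_OP_##NAME,
typedef enum {
    #include "interface/ops.def"   /* DEF_OP(GELU) would contribute VSI_NN_OP_GELU here */
} vsi_nn_op_t;
#undef DEF_OP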
@@ -0,0 +1,37 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#ifndef _VSI_NN_OP_GELU_H
#define _VSI_NN_OP_GELU_H

#include "vsi_nn_types.h"

typedef struct _vsi_nn_gelu_param
{
vsi_bool approximate;
} vsi_nn_gelu_param;


#endif
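A minimal usage sketch, not part of this commit: assuming DEF_OP(GELU) yields the VSI_NN_OP_GELU operation id and the node comes from the standard ovxlib graph API, the new field would be set roughly like this (graph creation and tensor wiring omitted):

/* hypothetical call site */
vsi_nn_node_t * node = vsi_nn_AddNode( graph, VSI_NN_OP_GELU, 1, 1, NULL );
node->nn_param.gelu.approximate = TRUE;  /* request the tanh-approximation path */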
@@ -0,0 +1,43 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_SCATTER_ND_UPDATE_H
#define _VSI_NN_OP_SCATTER_ND_UPDATE_H

#include "vsi_nn_types.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct _vsi_nn_scatter_nd_update_param
{
vsi_bool use_locking;
} vsi_nn_scatter_nd_update_param;

#ifdef __cplusplus
}
#endif

#endif
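Purely as an illustrative sketch (the field mirrors TensorFlow's scatter_nd_update attribute; the call site below is hypothetical, not defined by this diff):

node->nn_param.scatter_nd_update.use_locking = FALSE;  /* hypothetical node set-up */
/* reference semantics: output starts as a copy of input, then output[indices[i]] = updates[i] */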
@@ -53,7 +53,11 @@ typedef struct _vsi_nn_signalframe_param
uint32_t window_length;
uint32_t step;
uint32_t pad_end;
uint32_t pad;
union
{
uint32_t pad;
float pad_value;
};
uint32_t axis;
} vsi_nn_signalframe_param;
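One observation on the change above: the standalone uint32_t pad field becomes an anonymous union, so pad and pad_value share storage and existing code that writes the integer flag keeps compiling. A small sketch, assuming the toolchain accepts anonymous unions (C11 or a common compiler extension):

vsi_nn_signalframe_param p;
p.pad = 1;          /* legacy integer field, same bytes as ...      */
p.pad_value = 0.0f; /* ...the new float padding value for pad_end   */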
@@ -55,8 +55,7 @@ typedef struct _vsi_nn_spatial_transformer_param
float theta_2_1;
float theta_2_2;
float theta_2_3;

vsi_nn_spatial_transformer_lcl_data lcl;
vsi_bool align_corners;
} vsi_nn_spatial_transformer_param;

#ifdef __cplusplus

@@ -64,4 +63,3 @@ typedef struct _vsi_nn_spatial_transformer_param
#endif

#endif
@@ -63,8 +63,16 @@ typedef struct _vsi_nn_hw_config_t
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
uint32_t subGroupSize;
#endif
uint32_t use_40bits_va;
} vsi_nn_hw_config_t;

typedef struct _vsi_nn_runtime_option_t
{
int32_t enable_shader;
int32_t enable_opcheck;
int32_t enable_concat_optimize;
} vsi_nn_runtime_option_t;

/**
 * Ovxlib NN runtime context.
 */

@@ -72,6 +80,7 @@ typedef struct _vsi_nn_context_t
{
vx_context c;
vsi_nn_hw_config_t config;
vsi_nn_runtime_option_t options;
} *vsi_nn_context_t;

/**
@@ -87,7 +87,8 @@ vsi_nn_internal_tensor_t* vsi_nn_internal_create_zero_bias_tensor
(
vsi_nn_node_t* node,
vsi_nn_tensor_attr_t* input_attr,
vsi_nn_tensor_attr_t* weight_attr
vsi_nn_tensor_attr_t* weight_attr,
vsi_bool use_virtual_tensor
);
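Every caller now has to pass the extra flag; a hypothetical call site (variable names invented for illustration) would look like:

vsi_nn_internal_tensor_t * zero_bias = vsi_nn_internal_create_zero_bias_tensor(
    self, &inputs[0]->attr, &weights->attr, TRUE /* use_virtual_tensor */ );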

vsi_status vsi_nn_internal_deinit_node
@@ -170,6 +170,8 @@
#include "ops/vsi_nn_op_one_hot.h"
#include "ops/vsi_nn_op_nms.h"
#include "ops/vsi_nn_op_grouped_conv1d.h"
#include "ops/vsi_nn_op_scatter_nd_update.h"
#include "ops/vsi_nn_op_gelu.h"
/* custom node head define define */
#include "custom/vsi_nn_custom_node_type.h"

@@ -326,6 +328,8 @@ typedef union _vsi_nn_nn_param
vsi_nn_one_hot_param one_hot;
vsi_nn_nms_param nms;
vsi_nn_grouped_conv1d_param grouped_conv1d;
vsi_nn_scatter_nd_update_param scatter_nd_update;
vsi_nn_gelu_param gelu;
uint8_t client_param[128];

/* custom node data struct define */
@@ -721,6 +721,13 @@ vsi_status vsi_nn_SwapHandle
void ** old_ptr
);

vsi_bool vsi_nn_ConvertTensor
(
vsi_nn_graph_t* graph,
vsi_nn_tensor_t* input,
vsi_nn_tensor_t* output
);

#ifdef __cplusplus
}
#endif
@@ -33,7 +33,7 @@ extern "C"{

#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 1
#define VSI_NN_VERSION_PATCH 32
#define VSI_NN_VERSION_PATCH 33
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
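As a quick sanity check on the bump (plain arithmetic from the defines above): the packed value goes from 1 * 10000 + 1 * 100 + 32 = 10132 to 1 * 10000 + 1 * 100 + 33 = 10133, so downstream code could gate on the new ops roughly like this:

#if VSI_NN_VERSION >= 10133   /* first ovxlib version carrying SCATTER_ND_UPDATE / GELU */
/* ... */
#endif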
@@ -35,7 +35,6 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
//#include "libnnext/vx_lib_nnext.h"

#define _CPU_ARG_NUM (1)
#define _CPU_INPUT_NUM (1)

@@ -35,7 +35,6 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
//#include "libnnext/vx_lib_nnext.h"

#define _CPU_ARG_NUM (1)
#define _CPU_INPUT_NUM (1)

@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

@@ -284,4 +283,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( add_mean_std_norm, _setup )
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

@@ -280,4 +279,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( cast, _setup )
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

@@ -223,7 +222,6 @@ static vsi_status _query_kernel
}

return status;

} /* _query_kernel() */


@@ -303,4 +301,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( clip, _setup )
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

@@ -191,7 +190,6 @@ static vsi_status _query_kernel
{
*is_use_u8_kernel = FALSE;
param_def_size = _DETECT_POST_BOX_F32_PARAM_NUM;

}

key = DETECT_POST_BOX_HASH_KEY( in0_dtype, in1_dtype, out_dtype );

@@ -311,4 +309,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( detect_post_box, _setup )
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS
#if 0

@@ -188,4 +187,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( detect_post_nms, _setup )
@@ -49,6 +49,8 @@ typedef enum
UNARY_HSIGMOID,
UNARY_MISH,
UNARY_ROUND,
UNARY_GELU,
UNARY_HGELU
} unary_type_e;

/*

@@ -94,6 +96,8 @@ typedef enum
#define HSIGMOID_OPERATION hard_sigmoid
#define MISH_OPERATION mish
#define ROUND_OPERATION round
#define GELU_OPERATION gelu
#define HGELU_OPERATION hard_gelu

static const struct {
uint32_t key;

@@ -117,6 +121,10 @@ static const struct {
TENSOR_UNARY_KERNELS_FLOAT(MISH_OPERATION, UNARY_MISH, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(ROUND_OPERATION, UNARY_ROUND, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(ROUND_OPERATION, UNARY_ROUND, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(GELU_OPERATION, UNARY_GELU, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(GELU_OPERATION, UNARY_GELU, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(HGELU_OPERATION, UNARY_HGELU, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(HGELU_OPERATION, UNARY_HGELU, F16, F16)

TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F16, F16)

@@ -134,6 +142,10 @@ static const struct {
TENSOR_UNARY_KERNELS_FLOAT_2D(MISH_OPERATION, UNARY_MISH, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(ROUND_OPERATION, UNARY_ROUND, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(GELU_OPERATION, UNARY_GELU, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(GELU_OPERATION, UNARY_GELU, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(HGELU_OPERATION, UNARY_HGELU, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(HGELU_OPERATION, UNARY_HGELU, F16, F16)

TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, U8, U8)
TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, U8, U8)

@@ -143,6 +155,8 @@ static const struct {
TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, U8, U8)
TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, U8, U8)
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, U8, U8)
TENSOR_UNARY_KERNELS(HGELU_OPERATION, UNARY_HGELU, U8, U8)

TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8)

@@ -152,6 +166,8 @@ static const struct {
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8)
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8)

TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, I32, I32)


@@ -166,6 +182,8 @@ static const struct {
#undef HSIGMOID_OPERATION
#undef MISH_OPERATION
#undef ROUND_OPERATION
#undef GELU_OPERATION
#undef HGELU_OPERATION
/*
* Kernel params
*/

@@ -417,4 +435,6 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( neg, UNARY_NEG )
REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_sigmoid, UNARY_HSIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_CL( mish, UNARY_MISH )
REGISTER_ELTWISE_UNARY_BACKEND_CL( round, UNARY_ROUND )
REGISTER_ELTWISE_UNARY_BACKEND_CL( gelu, UNARY_GELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_gelu, UNARY_HGELU )
__END_DECLS
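For readers unfamiliar with the activations being registered above, reference formulas in plain C (standard definitions, not taken from this diff; whether "hard_gelu" maps to the tanh approximation is an assumption):

#include <math.h>
/* exact GELU: 0.5 * x * (1 + erf(x / sqrt(2))) */
static float gelu_ref( float x )
{
    return 0.5f * x * ( 1.0f + erff( x * 0.70710678f ) );
}
/* common tanh approximation of GELU */
static float gelu_tanh_ref( float x )
{
    return 0.5f * x * ( 1.0f + tanhf( 0.79788456f * ( x + 0.044715f * x * x * x ) ) );
}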
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

@@ -210,4 +209,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( grucell_activation, _setup )
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

@@ -210,4 +209,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( grucell_activation_sma, _setup )
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

@@ -331,4 +330,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( l2normalizescale, _setup )
@@ -35,7 +35,6 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

@@ -240,4 +239,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS

REGISTER_BACKEND_CL( logical_not, _setup )
@@ -307,7 +307,8 @@ static vsi_status _query_kernel
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_size;
kernel->info.initialize = initializer;
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
@ -0,0 +1,376 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "kernel/vsi_nn_kernel_eltwise.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define KERNEL_SOURCE_1 "scatter_nd_update"
|
||||
|
||||
#define HASH_SCATTER_ND_UPDATE_KEY(_input0_type, _input2_type, _output_type, _coord_dim) \
|
||||
((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | (_coord_dim))
|
||||
|
||||
#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_NAME(SRC0_TYPE, SRC2_TYPE, DST_TYPE) \
|
||||
CVIVANTE_NAMESPACE("cl.scatter_nd_update_"#SRC0_TYPE#SRC2_TYPE"to"#DST_TYPE)
|
||||
|
||||
#define TENSOR_SCATTER_ND_UPDATE_KERNELS(IN0_TYPE, IN1_TYPE, IN2_TYPE, OUT_TYPE, SOURCE) \
|
||||
{ HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, IN2_TYPE, OUT_TYPE, 0), \
|
||||
HASH_SCATTER_ND_UPDATE_SH_KERNEL_NAME(IN0_TYPE, IN2_TYPE, OUT_TYPE), \
|
||||
SOURCE },
|
||||
|
||||
static const struct {
|
||||
uint32_t key;
|
||||
char* function_name;
|
||||
const char* source_name;
|
||||
} scatter_nd_update_map[] =
|
||||
{
|
||||
TENSOR_SCATTER_ND_UPDATE_KERNELS(I32, I32, I32, I32, KERNEL_SOURCE_1)
|
||||
TENSOR_SCATTER_ND_UPDATE_KERNELS(U32, I32, U32, U32, KERNEL_SOURCE_1)
|
||||
TENSOR_SCATTER_ND_UPDATE_KERNELS(F32, I32, F32, F32, KERNEL_SOURCE_1)
|
||||
};
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _scatter_nd_update_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
|
||||
#define _SCATTER_ND_UPDATE_PARAM_NUM _cnt_of_array(_scatter_nd_update_kernel_param_def)
|
||||
|
||||
static vsi_status cal_scatter_nd_update_tensor_reshape_size
|
||||
(
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
int32_t sizes[VSI_NN_MAX_DIM_NUM],
|
||||
uint32_t block_size,
|
||||
uint32_t coordDim,
|
||||
uint32_t* width,
|
||||
uint32_t* area,
|
||||
uint32_t* vol,
|
||||
int32_t* newDim
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
uint32_t dims_num = inputs[0]->attr.dim_num;
|
||||
uint32_t *input_size = inputs[0]->attr.size;
|
||||
uint32_t i = 0;
|
||||
uint32_t elementCnt = 1;
|
||||
|
||||
if (coordDim != 0 && (width == NULL || area == NULL))
|
||||
{
|
||||
return status;
|
||||
}
|
||||
|
||||
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
|
||||
|
||||
newDim[0] = 0;
|
||||
for(i = 0; i < dims_num; ++i)
|
||||
{
|
||||
elementCnt *= input_size[i];
|
||||
}
|
||||
|
||||
for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
|
||||
{
|
||||
sizes[i] = 1;
|
||||
}
|
||||
|
||||
if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
|
||||
{
|
||||
sizes[0] = block_size;
|
||||
sizes[1] = elementCnt / block_size;
|
||||
status = VSI_SUCCESS;
|
||||
newDim[0] = 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
return status;
|
||||
}
|
||||
|
||||
if (coordDim == 1) // index shape
|
||||
{
|
||||
*width = 0;
|
||||
*area = 0;
|
||||
}
|
||||
else if (coordDim == 2)
|
||||
{
|
||||
*width = input_size[dims_num - 2];
|
||||
*area = 0;
|
||||
}
|
||||
else if (coordDim == 3)
|
||||
{
|
||||
*width = input_size[dims_num - 3];
|
||||
*area = input_size[dims_num - 3] * input_size[dims_num - 2];
|
||||
}
|
||||
else if (coordDim == 4)
|
||||
{
|
||||
*width = input_size[dims_num - 4];
|
||||
*area = input_size[dims_num - 4] * input_size[dims_num - 3];
|
||||
*vol = input_size[dims_num - 4] * input_size[dims_num - 3] * input_size[dims_num - 2];
|
||||
}
|
||||
else if (coordDim == 5)
|
||||
{
|
||||
*width = input_size[dims_num - 5];
|
||||
*area = input_size[dims_num - 5] * input_size[dims_num - 4];
|
||||
*vol = input_size[dims_num - 5] * input_size[dims_num - 4] * input_size[dims_num - 3];
|
||||
}
|
||||
#undef VSI_NN_MAX_IMAGE_WIDTH
|
||||
|
||||
return status;
|
||||
} /* _get_EltOP_tensor_reshape_size */
|
||||
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t gpu_param = {
|
||||
3,
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0}
|
||||
};
|
||||
|
||||
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
|
||||
int32_t block_size = 0;
|
||||
int32_t height = 0;
|
||||
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
|
||||
|
||||
block_size = attr[0]->shape->data[0];
|
||||
height = attr[0]->shape->data[1];
|
||||
|
||||
gpu_param.global_scale[0] = 1;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
|
||||
gpu_param.global_size[0] = block_size;
|
||||
gpu_param.global_size[1] = height;
|
||||
gpu_param.global_size[2] = 1;
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final);
|
||||
|
||||
final:
|
||||
if (attr[0])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[0] );
|
||||
attr[0] = NULL;
|
||||
}
|
||||
return status;
|
||||
} /* _scatter_nd_update_initializer() */
|
||||
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs,
|
||||
int32_t coord_dim
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_dtype_e input0_dtype = U8;
|
||||
vsi_nn_kernel_dtype_e input2_dtype = U8;
|
||||
vsi_nn_kernel_dtype_e output_dtype = U8;
|
||||
uint32_t key = 0;
|
||||
int i = 0;
|
||||
|
||||
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
|
||||
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, input2_dtype, output_dtype, 0 );
|
||||
|
||||
for( i = 0; i < _cnt_of_array(scatter_nd_update_map); i ++ )
|
||||
{
|
||||
if ( scatter_nd_update_map[i].key == key )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ( i < _cnt_of_array(scatter_nd_update_map) )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", scatter_nd_update_map[i].function_name );
|
||||
kernel->info.parameters = _scatter_nd_update_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _scatter_nd_update_kernel_param_def );
|
||||
kernel->info.initialize = _scatter_nd_update_initializer;
|
||||
// Register code source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
|
||||
"eltwise_ops_helper",
|
||||
scatter_nd_update_map[i].source_name );
|
||||
// Register binary source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
scatter_nd_update_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_SCATTER_ND_UPDATE_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
|
||||
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
|
||||
int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" );
|
||||
int32_t idx_num = vsi_nn_kernel_param_get_int32( params, "idx_num" );
|
||||
int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0;
|
||||
uint32_t width = 0, area = 0, vol = 0;
|
||||
int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0;
|
||||
|
||||
status = cal_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0],
|
||||
coord_dim, 0, NULL, NULL, NULL, &rs_in_dim);
|
||||
status |= cal_scatter_nd_update_tensor_reshape_size(&inputs[2], shapes[1],
|
||||
block_size, 0, NULL, NULL, NULL, &rs_idx_dim);
|
||||
status |= cal_scatter_nd_update_tensor_reshape_size(&outputs[0], shapes[2],
|
||||
block_size, coord_dim, &width, &area, &vol, &rs_out_dim);
|
||||
if (status != VSI_SUCCESS)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
|
||||
outputs[0]->attr.dim_num ) )
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (coord_dim == 5)
|
||||
{
|
||||
offset_idx = 1;
|
||||
}
|
||||
if (coord_dim == 4 || coord_dim == 5)
|
||||
{
|
||||
offsetX = vol;
|
||||
offsetY = area;
|
||||
offsetZ = width;
|
||||
offsetW = 1;
|
||||
}
|
||||
else if (coord_dim == 3)
|
||||
{
|
||||
offsetX = area;
|
||||
offsetY = width;
|
||||
offsetZ = 1;
|
||||
offsetW = 0;
|
||||
}
|
||||
else if (coord_dim == 2)
|
||||
{
|
||||
offsetX = width;
|
||||
offsetY = 1;
|
||||
offsetZ = 0;
|
||||
offsetW = 0;
|
||||
}
|
||||
else if (coord_dim == 1)
|
||||
{
|
||||
offsetX = 1;
|
||||
offsetY = 0;
|
||||
offsetZ = 0;
|
||||
offsetW = 0;
|
||||
}
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs, coord_dim );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
uint32_t index = 0;
|
||||
/* Pass parameters to node. */
|
||||
node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim );
|
||||
node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_in_dim );
|
||||
node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_idx_dim );
|
||||
node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &offsetX );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &offsetY );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &offsetZ );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &offsetW );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &offset_idx );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim );
|
||||
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &idx_num );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _SCATTER_ND_UPDATE_PARAM_NUM );
|
||||
CHECK_STATUS(status);
|
||||
vsi_nn_kernel_tensor_release( &node_params[0] );
|
||||
vsi_nn_kernel_tensor_release( &node_params[1] );
|
||||
vsi_nn_kernel_tensor_release( &node_params[2] );
|
||||
vsi_nn_kernel_tensor_release( &node_params[3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[4] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[5] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[6] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[7] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[8] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[9] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[10] );
|
||||
}
|
||||
}
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CL( scatter_nd_update, _setup )
|
||||
|
||||
|
|
@ -0,0 +1,298 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
#define SIGNAL_FRAME_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
|
||||
( ( IN_DTYPE << 8 ) | ( OUT_DTYPE ) )
|
||||
#define SIGNAL_FRAME_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
|
||||
{ SIGNAL_FRAME_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
|
||||
CVIVANTE_NAMESPACE("cl.signal_frame_"#IN_DTYPE"to"#OUT_DTYPE), \
|
||||
"signal_frame" }
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint32_t key;
|
||||
char * function_name;
|
||||
const char * source_name;
|
||||
} _kernel_map_type;
|
||||
|
||||
static const _kernel_map_type _signal_frame_kernel_map[] =
|
||||
{
|
||||
// Register kernel here
|
||||
SIGNAL_FRAME_KERNEL_MAP( F32, F32 ),
|
||||
|
||||
SIGNAL_FRAME_KERNEL_MAP( U8, U8)
|
||||
};
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _signal_frame_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
// Add kererl parameters here
|
||||
};
|
||||
#define _SIGNAL_FRAME_PARAM_NUM _cnt_of_array( _signal_frame_kernel_param_def )
|
||||
#define FRAME_STEP (2)
|
||||
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_signal_frame_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
gpu_param_t gpu_param = {
|
||||
3, // workdim
|
||||
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
|
||||
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
|
||||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0} // globalWorkSize: image size in thread
|
||||
};
|
||||
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
|
||||
vsi_int_array_t * out_shape = NULL;
|
||||
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
|
||||
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
|
||||
|
||||
out_shape = attr[1]->shape;
|
||||
|
||||
gpu_param.global_scale[0] = 1;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
gpu_param.global_size[0] = gpu_align_p2(
|
||||
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
|
||||
/ gpu_param.global_scale[0], 4);
|
||||
gpu_param.global_size[1] = (
|
||||
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
|
||||
/ gpu_param.global_scale[1]);
|
||||
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
|
||||
final:
|
||||
#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
|
||||
SAFE_FREE_TENSOR_ATTR(attr[0]);
|
||||
SAFE_FREE_TENSOR_ATTR(attr[1]);
|
||||
#undef SAFE_FREE_TENSOR_ATTR
|
||||
|
||||
return status;
|
||||
} /* _erf_initializer() */
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_dtype_e in_dtype;
|
||||
vsi_nn_kernel_dtype_e out_dtype;
|
||||
uint32_t key = 0;
|
||||
uint32_t i = 0;
|
||||
|
||||
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
in_dtype = in_dtype == F16 ? F32 : in_dtype;
|
||||
out_dtype = out_dtype == F16 ? F32 : out_dtype;
|
||||
key = SIGNAL_FRAME_HASH_KEY( in_dtype, out_dtype );
|
||||
|
||||
for ( i = 0; i < _cnt_of_array(_signal_frame_kernel_map); i ++ )
|
||||
{
|
||||
if ( _signal_frame_kernel_map[i].key == key )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( i < _cnt_of_array(_signal_frame_kernel_map) )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _signal_frame_kernel_map[i].function_name );
|
||||
kernel->info.parameters = _signal_frame_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _signal_frame_kernel_param_def );
|
||||
kernel->info.initialize = _signal_frame_initializer;
|
||||
// Register code source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
|
||||
_signal_frame_kernel_map[i].source_name );
|
||||
// Register binary source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
_signal_frame_kernel_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_SIGNAL_FRAME_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
int32_t frame_length = vsi_nn_kernel_param_get_int32( params, "frame_length" );
|
||||
int32_t frame_step = vsi_nn_kernel_param_get_int32( params, "frame_step" );
|
||||
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
|
||||
int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" );
|
||||
float pad_value = vsi_nn_kernel_param_get_float32( params, "pad_val" );
|
||||
int32_t num_frames = outputs[0]->attr.size[axis + 1];
|
||||
int32_t rank = inputs[0]->attr.dim_num;
|
||||
int32_t inner = 1;
|
||||
int32_t outer = 1;
|
||||
int32_t length_samples = inputs[0]->attr.size[axis];
|
||||
int32_t i = 0;
|
||||
vsi_nn_tensor_t* rs_tensors[2] = { NULL };
|
||||
int32_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }};
|
||||
|
||||
for (i = 0; i < axis; i++)
|
||||
{
|
||||
inner *= inputs[0]->attr.size[i];
|
||||
}
|
||||
|
||||
for (i = axis + 1; i < rank; i++)
|
||||
{
|
||||
outer *= inputs[0]->attr.size[i];
|
||||
}
|
||||
|
||||
shape[0][0] = inner;
|
||||
shape[0][1] = length_samples;
|
||||
shape[0][2] = 1;
|
||||
shape[0][3] = outer;
|
||||
|
||||
shape[1][0] = inner;
|
||||
shape[1][1] = frame_length;
|
||||
shape[1][2] = num_frames;
|
||||
shape[1][3] = outer;
|
||||
|
||||
rs_tensors[0] = vsi_nn_reshape_tensor( graph,
|
||||
inputs[0], (uint32_t*)shape[0], 4 );
|
||||
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
|
||||
outputs[0], (uint32_t*)shape[1], 4 );
|
||||
|
||||
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[1]->attr.size,
|
||||
rs_tensors[1]->attr.dim_num ) )
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status )
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
if ( pad_end )
|
||||
{
|
||||
// Set default border mode.
|
||||
vx_border_t border;
|
||||
uint32_t data = 0;
|
||||
uint32_t dsize = 1;
|
||||
|
||||
vsi_nn_Float32ToDtype(pad_value, (uint8_t*)&data, &outputs[0]->attr.dtype);
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
dsize = vsi_nn_GetTypeBytes( inputs[0]->attr.dtype.vx_type );
|
||||
if ( dsize == 1 )
|
||||
{
|
||||
border.constant_value.U8 = (uint8_t)data;
|
||||
}
|
||||
else if ( dsize == 4 )
|
||||
{
|
||||
border.constant_value.U32 = data;
|
||||
}
|
||||
else
|
||||
{
|
||||
border.constant_value.U16 = (uint16_t)data;
|
||||
}
|
||||
|
||||
status |= vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
|
||||
}
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _SIGNAL_FRAME_PARAM_NUM,
|
||||
rs_tensors, 1, &rs_tensors[1], 1 );
|
||||
|
||||
node_params[FRAME_STEP] = vsi_nn_kernel_scalar_create(
|
||||
graph, I32, &frame_step );
|
||||
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _SIGNAL_FRAME_PARAM_NUM );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
}
|
||||
|
||||
final:
|
||||
if (rs_tensors[0])
|
||||
{
|
||||
vsi_nn_ReleaseTensor( &rs_tensors[0] );
|
||||
}
|
||||
|
||||
if (rs_tensors[1])
|
||||
{
|
||||
vsi_nn_ReleaseTensor( &rs_tensors[1] );
|
||||
}
|
||||
|
||||
if (node_params[FRAME_STEP])
|
||||
{
|
||||
vsi_nn_kernel_scalar_release( &node_params[FRAME_STEP] );
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CL( signal_frame, _setup )
|
||||
|
|
@@ -22,7 +22,6 @@
*
*****************************************************************************/


#include <stdint.h>
#include <stdlib.h>
#include <string.h>

@@ -66,7 +65,6 @@ __BEGIN_DECLS
{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 ), \
SLICE_SH_KERNEL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE }


typedef struct
{
uint32_t key;

@@ -221,7 +219,6 @@ static vsi_status _query_kernel
return status;
} /* _query_kernel() */


static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,

@@ -268,7 +265,7 @@ static vsi_nn_kernel_node_t _setup
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size,
inputs[0]->attr.dim_num ) || input_batch != output_batch )
{
return NULL;
goto final;
}

image_2d = (rank[0] < 3 || shapes[0][2] == 1);

@@ -300,6 +297,13 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
}
}

final:
for (i = 0; i < _IO_NUM; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}

return node;
} /* _setup() */
@ -0,0 +1,535 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _INPUT_NUM (3)
|
||||
#define _OUTPUT_NUM (4)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.box_with_nms_limit")
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _box_with_nms_limit_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _BOX_WITH_NMS_LIMIT_PARAM_NUM _cnt_of_array( _box_with_nms_limit_kernel_param_def )
|
||||
#define SCORE_THRESHOLD (7)
|
||||
#define MAX_NUM_DETECTIONS (8)
|
||||
#define NMS_KERNEL_METHOD (9)
|
||||
#define IOU_THRESHOLD (10)
|
||||
#define SIGMA (11)
|
||||
#define NMS_SCORE_THRESHOLD (12)
|
||||
|
||||
static float hard_nms_kernel
|
||||
(
|
||||
float iou,
|
||||
float iouThreshold
|
||||
)
|
||||
{
|
||||
return iou < iouThreshold ? 1.0f : 0.0f;
|
||||
}
|
||||
|
||||
static float linear_nms_kernel
|
||||
(
|
||||
float iou,
|
||||
float iouThreshold
|
||||
)
|
||||
{
|
||||
return iou < iouThreshold ? 1.0f : 1.0f - iou;
|
||||
}
|
||||
|
||||
static float gaussian_nms_kernel
|
||||
(
|
||||
float iou,
|
||||
float sigma
|
||||
)
|
||||
{
|
||||
return (float)(exp(-1.0f * iou * iou / sigma));
|
||||
}
|
||||
|
||||
void swap_element
|
||||
(
|
||||
uint32_t* list,
|
||||
uint32_t first,
|
||||
uint32_t second
|
||||
)
|
||||
{
|
||||
uint32_t temp = list[first];
|
||||
list[first] = list[second];
|
||||
list[second] = temp;
|
||||
}
|
||||
|
||||
uint32_t max_element
|
||||
(
|
||||
float* data,
|
||||
uint32_t* index_list,
|
||||
uint32_t len
|
||||
)
|
||||
{
|
||||
uint32_t i;
|
||||
uint32_t max_index = 0;
|
||||
float max_val = data[index_list[0]];
|
||||
for(i = 1; i < len; i++)
|
||||
{
|
||||
float val = data[index_list[i]];
|
||||
if (max_val < val)
|
||||
{
|
||||
max_val = val;
|
||||
max_index = i;
|
||||
}
|
||||
}
|
||||
return max_index;
|
||||
}
|
||||
|
||||
static uint32_t max_comp_func
|
||||
(
|
||||
void* data,
|
||||
int32_t left,
|
||||
int32_t right
|
||||
)
|
||||
{
|
||||
float* fdata = (float*)data;
|
||||
return fdata[left] >= fdata[right];
|
||||
}
|
||||
|
||||
void sort_element_by_score
|
||||
(
|
||||
float* data,
|
||||
uint32_t* index_list,
|
||||
uint32_t len
|
||||
)
|
||||
{
|
||||
vsi_nn_partition(data, 0, len - 1, max_comp_func, TRUE, index_list);
|
||||
}
|
||||
|
||||
typedef struct
|
||||
{
|
||||
float* fdata;
|
||||
uint32_t numClasses;
|
||||
} class_comp_param;
|
||||
|
||||
static uint32_t class_comp_func
|
||||
(
|
||||
void* data,
|
||||
int32_t left,
|
||||
int32_t right
|
||||
)
|
||||
{
|
||||
class_comp_param *p = (class_comp_param*)data;
|
||||
float* fdata = p->fdata;
|
||||
uint32_t numClasses = p->numClasses;
|
||||
uint32_t lhsClass = left % numClasses, rhsClass = right % numClasses;
|
||||
return lhsClass == rhsClass ? fdata[left] > fdata[right]
|
||||
: lhsClass < rhsClass;
|
||||
}
|
||||
|
||||
static void sort_element_by_class
|
||||
(
|
||||
float* data,
|
||||
uint32_t* index_list,
|
||||
uint32_t len,
|
||||
uint32_t numClasses
|
||||
)
|
||||
{
|
||||
class_comp_param class_comp;
|
||||
class_comp.fdata = data;
|
||||
class_comp.numClasses = numClasses;
|
||||
vsi_nn_partition(&class_comp, 0, len - 1, class_comp_func, TRUE, index_list);
|
||||
}
|
||||
|
||||
// Taking two indices of bounding boxes, return the intersection-of-union.
|
||||
float getIoUAxisAligned
|
||||
(
|
||||
const float* roi1,
|
||||
const float* roi2
|
||||
)
|
||||
{
|
||||
const float area1 = (roi1[2] - roi1[0]) * (roi1[3] - roi1[1]);
|
||||
const float area2 = (roi2[2] - roi2[0]) * (roi2[3] - roi2[1]);
|
||||
const float x1 = vsi_nn_max(roi1[0], roi2[0]);
|
||||
const float x2 = vsi_nn_min(roi1[2], roi2[2]);
|
||||
const float y1 = vsi_nn_max(roi1[1], roi2[1]);
|
||||
const float y2 = vsi_nn_min(roi1[3], roi2[3]);
|
||||
const float w = vsi_nn_max(x2 - x1, 0.0f);
|
||||
const float h = vsi_nn_max(y2 - y1, 0.0f);
|
||||
const float areaIntersect = w * h;
|
||||
const float areaUnion = area1 + area2 - areaIntersect;
|
||||
return areaIntersect / areaUnion;
|
||||
}
|
||||
/*
|
||||
* Kernel function
|
||||
*/
|
||||
DEF_KERNEL_EXECUTOR(_compute)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
|
||||
float *f32_in_buffer[_INPUT_NUM] = {NULL};
|
||||
int32_t* int32_in_buffer[_INPUT_NUM] = {NULL};
|
||||
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
|
||||
int32_t* int32_out_buffer[_OUTPUT_NUM] = {0};
|
||||
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
|
||||
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
|
||||
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
|
||||
size_t out_elements[_OUTPUT_NUM] = {0};
|
||||
size_t out_bytes[_OUTPUT_NUM] = {0};
|
||||
uint32_t i = 0;
|
||||
float score_threshold = 0;
|
||||
int32_t max_num_detections = 0;
|
||||
int32_t nms_kernel_method = 0;
|
||||
float iou_threshold = 0;
|
||||
float sigma = 0;
|
||||
float nms_score_threshold = 0;
|
||||
uint32_t j = 0, n = 0, b = 0, c = 0;
|
||||
const uint32_t kRoiDim = 4;
|
||||
uint32_t numRois = 0;
|
||||
uint32_t numClasses = 0;
|
||||
int32_t ind = 0;
|
||||
uint32_t * batch_data = NULL;
|
||||
int32_t numBatch = 0;
|
||||
uint32_t * select = NULL;
|
||||
uint32_t select_size = 0;
|
||||
uint32_t scores_index = 0;
|
||||
uint32_t roi_index = 0;
|
||||
uint32_t roi_out_index = 0;
|
||||
|
||||
/* prepare data */
|
||||
for (i = 0; i < _INPUT_NUM; i ++)
|
||||
{
|
||||
input[i] = (vsi_nn_kernel_tensor_t)param[i];
|
||||
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
|
||||
if (i == 2)
|
||||
{
|
||||
int32_in_buffer[i] = (int32_t*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( int32_in_buffer[i], "Create input buffer fail.", final );
|
||||
}
|
||||
else
|
||||
{
|
||||
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < _OUTPUT_NUM; i ++)
|
||||
{
|
||||
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
|
||||
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
|
||||
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
|
||||
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
|
||||
out_bytes[i] = out_elements[i] * sizeof(float);
|
||||
if (i < 2)
|
||||
{
|
||||
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
|
||||
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
|
||||
memset( f32_out_buffer[i], 0, out_bytes[i] );
|
||||
}
|
||||
else
|
||||
{
|
||||
int32_out_buffer[i] = (int32_t *)malloc( out_bytes[i] );
|
||||
CHECK_PTR_FAIL_GOTO( int32_out_buffer[i], "Create output buffer fail.", final );
|
||||
memset( int32_out_buffer[i], 0, out_bytes[i] );
|
||||
}
|
||||
}
|
||||
|
||||
#define VSI_NN_KERNEL_READ_SCALAR(type, idx, pointer) \
|
||||
vsi_nn_kernel_scalar_read_##type((vsi_nn_kernel_scalar_t)param[idx], pointer)
|
||||
|
||||
status = VSI_NN_KERNEL_READ_SCALAR(float32, SCORE_THRESHOLD, &score_threshold);
|
||||
status |= VSI_NN_KERNEL_READ_SCALAR(int32, MAX_NUM_DETECTIONS, &max_num_detections);
|
||||
status |= VSI_NN_KERNEL_READ_SCALAR(int32, NMS_KERNEL_METHOD, &nms_kernel_method);
|
||||
status |= VSI_NN_KERNEL_READ_SCALAR(float32, IOU_THRESHOLD, &iou_threshold);
|
||||
status |= VSI_NN_KERNEL_READ_SCALAR(float32, SIGMA, &sigma);
|
||||
status |= VSI_NN_KERNEL_READ_SCALAR(float32, NMS_SCORE_THRESHOLD, &nms_score_threshold);
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
#undef VSI_NN_KERNEL_READ_SCALAR
|
||||
|
||||
numRois = in_attr[0]->shape->data[1];
|
||||
numClasses = in_attr[0]->shape->data[0];
|
||||
|
||||
batch_data = (uint32_t*)malloc(numRois * sizeof(uint32_t));
|
||||
CHECK_PTR_FAIL_GOTO( batch_data, "Create batch_data fail.", final );
|
||||
memset(batch_data, 0, numRois * sizeof(uint32_t));
|
||||
|
||||
for (i = 0, ind = -1; i < numRois; i++)
|
||||
{
|
||||
if (int32_in_buffer[2][i] != ind)
|
||||
{
|
||||
ind = int32_in_buffer[2][i];
|
||||
numBatch++;
|
||||
}
|
||||
batch_data[numBatch - 1]++;
|
||||
}
|
||||
select = (uint32_t*)malloc(numBatch * numRois
|
||||
* numClasses * sizeof(uint32_t));
|
||||
CHECK_PTR_FAIL_GOTO( select, "Create select fail.", final );
|
||||
memset(select, 0, numBatch * numRois * numClasses * sizeof(uint32_t));
|
||||
for (n = 0; n < (uint32_t)numBatch; n++)
|
||||
{
|
||||
int32_t numDetections_batch = 0;
|
||||
uint32_t select_start_batch = select_size;
|
||||
uint32_t select_len = 0;
|
||||
// Exclude class 0 (background)
|
||||
for (c = 1; c < numClasses; c++)
|
||||
{
|
||||
uint32_t select_start = select_size;
|
||||
int32_t maxNumDetections0 = max_num_detections;
|
||||
uint32_t numDetections = 0;
|
||||
for (b = 0; b < batch_data[n]; b++)
|
||||
{
|
||||
uint32_t index = b * numClasses + c;
|
||||
float score = f32_in_buffer[0][scores_index + index];
|
||||
if (score > score_threshold) {
|
||||
select[select_size] = index;
|
||||
select_size++;
|
||||
}
|
||||
}
|
||||
select_len = select_size - select_start;
|
||||
|
||||
if (maxNumDetections0 < 0)
|
||||
{
|
||||
maxNumDetections0 = select_len;
|
||||
}
|
||||
|
||||
for (j = 0; (j < select_len && numDetections < (uint32_t)maxNumDetections0); j++)
|
||||
{
|
||||
// find max score and swap to the front.
|
||||
int32_t max_index = max_element(&(f32_in_buffer[0][scores_index]),
|
||||
&(select[select_start + j]), select_len - j) + j;
|
||||
|
||||
swap_element(&(select[select_start]), max_index, j);
|
||||
|
||||
// Calculate IoU of the rest, swap to the end (disgard) if needed.
|
||||
for (i = j + 1; i < select_len; i++)
|
||||
{
|
||||
int32_t roiBase0 = roi_index + select[select_start + i] * kRoiDim;
|
||||
int32_t roiBase1 = roi_index + select[select_start + j] * kRoiDim;
|
||||
float iou = getIoUAxisAligned(&(f32_in_buffer[1][roiBase0]),
|
||||
&(f32_in_buffer[1][roiBase1]));
|
||||
float kernel_iou;
|
||||
if (nms_kernel_method == 0)
|
||||
{
|
||||
kernel_iou = hard_nms_kernel(iou, iou_threshold);
|
||||
}
|
||||
else if (nms_kernel_method == 1)
|
||||
{
|
||||
kernel_iou = linear_nms_kernel(iou, iou_threshold);
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel_iou = gaussian_nms_kernel(iou, sigma);
|
||||
}
|
||||
f32_in_buffer[0][scores_index + select[select_start + i]] *= kernel_iou;
|
||||
if (f32_in_buffer[0][scores_index + select[select_start + i]] < nms_score_threshold)
|
||||
{
|
||||
swap_element(&(select[select_start]), i, select_len - 1);
|
||||
i--;
|
||||
select_len--;
|
||||
}
|
||||
}
|
||||
numDetections++;
|
||||
}
|
||||
select_size = select_start + select_len;
|
||||
numDetections_batch += numDetections;
|
||||
}
|
||||
|
||||
// Take top max_num_detections.
|
||||
sort_element_by_score(&(f32_in_buffer[0][scores_index]), &(select[select_start_batch]),
|
||||
numDetections_batch);
|
||||
|
||||
if (numDetections_batch > max_num_detections && max_num_detections >= 0)
|
||||
{
|
||||
select_size = select_start_batch + max_num_detections;
|
||||
}
|
||||
select_len = select_size - select_start_batch;
|
||||
// Sort again by class.
|
||||
sort_element_by_class(&(f32_in_buffer[0][scores_index]), &(select[select_start_batch]),
|
||||
select_len, numClasses);
|
||||
|
||||
for (i = 0; i < select_len; i++)
|
||||
{
|
||||
int32_t in_index0 = scores_index + select[select_start_batch + i];
|
||||
int32_t in_index1 = roi_index + select[select_start_batch + i] * kRoiDim;
|
||||
f32_out_buffer[0][roi_out_index] = f32_in_buffer[0][in_index0];
|
||||
memcpy(&(f32_out_buffer[1][roi_out_index * kRoiDim]),
|
||||
&f32_in_buffer[1][in_index1], kRoiDim * sizeof(float));
|
||||
int32_out_buffer[2][roi_out_index] = select[select_start_batch + i] % numClasses;
|
||||
int32_out_buffer[3][roi_out_index] = n;
|
||||
roi_out_index++;
|
||||
}
|
||||
|
||||
scores_index += batch_data[n] * numClasses;
|
||||
roi_index += batch_data[n] * numClasses * kRoiDim;
|
||||
}
|
||||
|
||||
/* save data */
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
if (i < 2)
|
||||
{
|
||||
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
|
||||
f32_out_buffer[i], out_elements[i] );
|
||||
}
|
||||
else
|
||||
{
|
||||
status = vsi_nn_kernel_tensor_write( output[i], out_attr[i],
|
||||
int32_out_buffer[i], out_bytes[i] );
|
||||
}
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
final:
|
||||
vsi_nn_safe_free(batch_data);
|
||||
vsi_nn_safe_free(select);
|
||||
for (i = 0; i < _INPUT_NUM; i++)
|
||||
{
|
||||
vsi_nn_safe_free(f32_in_buffer[i]);
|
||||
vsi_nn_safe_free(int32_in_buffer[i]);
|
||||
|
||||
if (in_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
|
||||
}
|
||||
}
|
||||
for (i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
vsi_nn_safe_free(f32_out_buffer[i]);
|
||||
vsi_nn_safe_free(int32_out_buffer[i]);
|
||||
|
||||
if (out_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _compute() */
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
/* Add extra params */
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_SUCCESS;
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
|
||||
kernel->info.function = _compute;
|
||||
kernel->info.parameters = _box_with_nms_limit_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _box_with_nms_limit_kernel_param_def );
|
||||
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_BOX_WITH_NMS_LIMIT_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
float score_threshold = vsi_nn_kernel_param_get_float32( params, "score_threshold" );
|
||||
int32_t max_num_detections = vsi_nn_kernel_param_get_int32( params, "max_num_detections" );
|
||||
int32_t nms_kernel_method = vsi_nn_kernel_param_get_int32( params, "nms_kernel_method" );
|
||||
float iou_threshold = vsi_nn_kernel_param_get_float32( params, "iou_threshold" );
|
||||
float sigma = vsi_nn_kernel_param_get_float32( params, "sigma" );
|
||||
float nms_score_threshold = vsi_nn_kernel_param_get_float32( params, "nms_score_threshold" );
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status )
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _BOX_WITH_NMS_LIMIT_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
node_params[SCORE_THRESHOLD] = vsi_nn_kernel_scalar_create( graph, F32, &score_threshold );
|
||||
node_params[MAX_NUM_DETECTIONS] = vsi_nn_kernel_scalar_create( graph, I32, &max_num_detections );
|
||||
node_params[NMS_KERNEL_METHOD] = vsi_nn_kernel_scalar_create( graph, I32, &nms_kernel_method );
|
||||
node_params[IOU_THRESHOLD] = vsi_nn_kernel_scalar_create( graph, F32, &iou_threshold );
|
||||
node_params[SIGMA] = vsi_nn_kernel_scalar_create( graph, F32, &sigma );
|
||||
node_params[NMS_SCORE_THRESHOLD] = vsi_nn_kernel_scalar_create( graph, F32, &nms_score_threshold );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _BOX_WITH_NMS_LIMIT_PARAM_NUM );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SCORE_THRESHOLD] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[MAX_NUM_DETECTIONS] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[NMS_KERNEL_METHOD] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[IOU_THRESHOLD] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[SIGMA] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[NMS_SCORE_THRESHOLD] );
|
||||
}
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( box_with_nms_limit, _setup )
@@ -47,6 +47,8 @@ typedef enum
|
|||
UNARY_HSIGMOID,
|
||||
UNARY_MISH,
|
||||
UNARY_ROUND,
|
||||
UNARY_GELU,
|
||||
UNARY_HGELU,
|
||||
} unary_type_e;
@@ -109,6 +111,58 @@ static float round_eval(float data)
|
|||
return data;
|
||||
}
|
||||
|
||||
static float erf_eval(float x)
|
||||
{
|
||||
float res = 0;
|
||||
float tmp = x;
|
||||
float factorial = 1; /*n!*/
|
||||
float x_pow = x;
|
||||
int32_t one = 1;
|
||||
int32_t n = 1;
|
||||
|
||||
if (x <= -3)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
else if (x >= 3)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
while (vsi_abs(tmp) > 1e-5)
|
||||
{
|
||||
res += tmp;
|
||||
|
||||
factorial *= n;
|
||||
one *= -1;
|
||||
x_pow *= x * x;
|
||||
tmp = one / factorial * x_pow / ( 2 * n + 1);
|
||||
|
||||
n ++;
|
||||
}
|
||||
#define VSI_MUL2_RSQRTPI (1.1283791670955126f)
|
||||
|
||||
res *= VSI_MUL2_RSQRTPI;
|
||||
|
||||
return res;
|
||||
}
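/* erf_eval sums the Maclaurin series erf(x) = 2/sqrt(pi) * sum_n (-1)^n * x^(2n+1) / (n! * (2n+1))
 * until the next term drops below 1e-5; inputs outside [-3, 3] are saturated to +/-1. */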
|
||||
|
||||
static float gelu_eval(float data)
|
||||
{
|
||||
data = (float)(0.5f * data * (1 + erf_eval(data / (float)sqrt(2.0f))));
|
||||
|
||||
return data;
|
||||
}
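/* gelu_eval is the exact (erf-based) GELU: x * Phi(x) = 0.5 * x * (1 + erf(x / sqrt(2))). */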
|
||||
|
||||
#define VSI_SQRT_2_RCP_PI 0.7978845834732056f
|
||||
static float hgelu_eval(float data)
|
||||
{
|
||||
float cdf = (float)(0.5f * (1.0f + tanh((VSI_SQRT_2_RCP_PI *
|
||||
(data + 0.044715f * data * data * data)))));
|
||||
|
||||
return data * cdf;
|
||||
}
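/* hgelu_eval is the tanh-based GELU approximation:
 * 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))), exposed as UNARY_HGELU / "hard_gelu". */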
|
||||
|
||||
DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
@@ -176,6 +230,12 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
|
|||
case UNARY_ROUND:
|
||||
data = round_eval(data);
|
||||
break;
|
||||
case UNARY_GELU:
|
||||
data = gelu_eval(data);
|
||||
break;
|
||||
case UNARY_HGELU:
|
||||
data = hgelu_eval(data);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
@@ -309,4 +369,6 @@ REGISTER_ELTWISE_UNARY_BACKEND_CPU( elu, UNARY_ELU )
|
|||
REGISTER_ELTWISE_UNARY_BACKEND_CPU( neg, UNARY_NEG )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_sigmoid, UNARY_HSIGMOID )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_CPU( mish, UNARY_MISH )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_CPU( round, UNARY_ROUND )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_CPU( gelu, UNARY_GELU )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_gelu, UNARY_HGELU )
@@ -101,11 +101,11 @@ DEF_KERNEL_EXECUTOR(_compute)
|
|||
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
|
||||
memset( f32_out_buffer[i], 0, out_bytes[i] );
|
||||
}
|
||||
#define ERF_PI 3.141592653589793
|
||||
#define VSI_ERF_PI 3.141592653589793
|
||||
for (i = 0; i < out_elements[0]; i ++)
|
||||
{
|
||||
/* erf(x) = 2 / sqrt(pi) * sum_n[ (-1)^n * x^(2n + 1) / (n! * (2n + 1)) ] */
|
||||
float x = f32_in_buffer[0][i];
|
||||
float x = vsi_clamp(f32_in_buffer[0][i], -2, 2);
|
||||
float res = 0;
|
||||
float tmp = x;
|
||||
float factorial = 1; /*n!*/
@@ -126,7 +126,7 @@ DEF_KERNEL_EXECUTOR(_compute)
|
|||
}
|
||||
|
||||
|
||||
res *= 2.0f / (float)sqrt(ERF_PI);
|
||||
res *= 2.0f / (float)sqrt(VSI_ERF_PI);
|
||||
|
||||
f32_out_buffer[0][i] = res;
|
||||
}
@@ -0,0 +1,188 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _INPUT_NUM (2)
|
||||
#define _OUTPUT_NUM (1)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.extra_ending")
|
||||
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _extra_ending_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
// Add kernel parameters here
|
||||
};
|
||||
#define _EXTRA_ENDING_PARAM_NUM _cnt_of_array( _extra_ending_kernel_param_def )
|
||||
|
||||
/*
|
||||
* Kernel function
|
||||
*/
|
||||
DEF_KERNEL_EXECUTOR(_compute)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
|
||||
uint8_t *u8_in_buffer[_INPUT_NUM] = {NULL};
|
||||
uint8_t *u8_out_buffer[_OUTPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
|
||||
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
|
||||
size_t out_elements[_OUTPUT_NUM] = {0};
|
||||
size_t out_bytes[_OUTPUT_NUM] = {0};
|
||||
int32_t i = 0;
|
||||
|
||||
/* prepare data */
|
||||
input[1] = (vsi_nn_kernel_tensor_t)param[1];
|
||||
in_attr[1] = vsi_nn_kernel_tensor_attr_create( input[1] );
|
||||
u8_in_buffer[1] = (uint8_t*)vsi_nn_kernel_tensor_create_buffer( input[1], in_attr[1], FALSE );
|
||||
CHECK_PTR_FAIL_GOTO( u8_in_buffer[1], "Create input buffer fail.", final );
|
||||
|
||||
for (i = 0; i < _OUTPUT_NUM; i ++)
|
||||
{
|
||||
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
|
||||
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
|
||||
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
|
||||
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
|
||||
out_bytes[i] = out_elements[i] * sizeof(uint8_t);
|
||||
u8_out_buffer[i] = (uint8_t *)malloc( out_bytes[i] );
|
||||
CHECK_PTR_FAIL_GOTO( u8_out_buffer[i], "Create output buffer fail.", final );
|
||||
memset( u8_out_buffer[i], 0, out_bytes[i] );
|
||||
}
|
||||
|
||||
memcpy(u8_out_buffer[0], u8_in_buffer[1], out_bytes[0]);
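// Only input 1 is read: the output is a byte-for-byte copy of it. Input 0 is not
// touched here and presumably just sequences this op after its producer in the graph.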
|
||||
|
||||
for (i = 0; i < _OUTPUT_NUM; i ++)
|
||||
{
|
||||
status = vsi_nn_kernel_tensor_write( output[i], out_attr[i],
|
||||
u8_out_buffer[i], out_bytes[i] );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
final:
|
||||
for (i = 0; i < _INPUT_NUM; i++)
|
||||
{
|
||||
vsi_nn_safe_free(u8_in_buffer[i]);
|
||||
|
||||
if (in_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
|
||||
}
|
||||
}
|
||||
for (i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
vsi_nn_safe_free(u8_out_buffer[i]);
|
||||
|
||||
if (out_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _compute() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_SUCCESS;
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
|
||||
kernel->info.function = _compute;
|
||||
kernel->info.parameters = _extra_ending_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _extra_ending_kernel_param_def );
|
||||
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_EXTRA_ENDING_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _EXTRA_ENDING_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _EXTRA_ENDING_PARAM_NUM );
|
||||
}
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( extra_ending, _setup )
@@ -0,0 +1,323 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <float.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _INPUT_NUM (2)
|
||||
#define _OUTPUT_NUM (2)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.heatmap_max_keypoint")
|
||||
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _heatmap_max_keypoint_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _HEATMAP_MAX_KEYPOINT_PARAM_NUM _cnt_of_array( _heatmap_max_keypoint_kernel_param_def )
|
||||
|
||||
// This function uses Taylor expansion up to the quadratic term to approximate bicubic
|
||||
// upscaling result.
|
||||
// 2nd order Taylor expansion: D(x) = D - b'x + 1/2 * x'Ax
|
||||
// where D = grid[1][1], Taylor expansion center, the original score,
|
||||
// x = delta, the correction on max keypoint position,
|
||||
// D(x) = deltaScore, the accuracy score after correction
|
||||
static void _solve_for_delta
|
||||
(
|
||||
const float grid[3][3],
|
||||
float* delta,
|
||||
float* deltaScore,
|
||||
float fpAtol,
|
||||
float fpRtol
|
||||
)
|
||||
{
|
||||
// b: negative 1st order derivative at center
|
||||
// A: Hessian matrix at center (2nd order derivative)
|
||||
float A[2][2], b[2];
|
||||
float crossProd1, crossProd2;
|
||||
float detA;
|
||||
b[0] = -(grid[1][2] - grid[1][0]) / 2.0f;
|
||||
b[1] = -(grid[2][1] - grid[0][1]) / 2.0f;
|
||||
A[0][0] = grid[1][0] - 2.0f * grid[1][1] + grid[1][2];
|
||||
A[0][1] = (grid[2][2] - grid[2][0] - grid[0][2] + grid[0][0]) / 4.0f;
|
||||
A[1][0] = A[0][1];
|
||||
A[1][1] = grid[0][1] - 2.0f * grid[1][1] + grid[2][1];
|
||||
|
||||
// solve Ax=b, where x=delta -> delta = inv(A) * b
|
||||
crossProd1 = A[0][0] * A[1][1];
|
||||
crossProd2 = A[0][1] * A[1][0];
|
||||
detA = crossProd1 - crossProd2;
|
||||
// check if A is invertible
|
||||
if (fabs(detA) < (fpAtol + fpRtol * crossProd1)) return;
|
||||
delta[0] = (A[1][1] * b[0] - A[0][1] * b[1]) / detA;
|
||||
delta[1] = (A[0][0] * b[1] - A[1][0] * b[0]) / detA;
|
||||
|
||||
// clip out of range delta, i.e. delta > 3/2
|
||||
if (fabs(delta[0]) > 1.5f || fabs(delta[1]) > 1.5f)
|
||||
{
|
||||
float scale = (float)(1.5f / vsi_nn_max(fabs(delta[0]), fabs(delta[1])));
|
||||
delta[0] *= scale;
|
||||
delta[1] *= scale;
|
||||
}
|
||||
|
||||
*deltaScore = grid[1][1] - b[0] * delta[0] - b[1] * delta[1] +
|
||||
((A[0][0] * delta[0] + A[0][1] * delta[1]) * delta[0] +
|
||||
(A[1][0] * delta[0] + A[1][1] * delta[1]) * delta[1]) /
|
||||
2.0f;
|
||||
}
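/* Note: delta = inv(A) * b is computed via the 2x2 adjugate (detA = A00*A11 - A01*A10),
 * clamped to +/-1.5, and deltaScore evaluates the quadratic model D - b'x + 1/2 x'Ax at x = delta. */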
|
||||
/*
|
||||
* Kernel function
|
||||
*/
|
||||
DEF_KERNEL_EXECUTOR(_compute)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
|
||||
float *f32_in_buffer[_INPUT_NUM] = {NULL};
|
||||
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
|
||||
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
|
||||
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
|
||||
size_t out_elements[_OUTPUT_NUM] = {0};
|
||||
size_t out_bytes[_OUTPUT_NUM] = {0};
|
||||
uint32_t i = 0;
|
||||
uint32_t j = 0;
|
||||
uint32_t k = 0;
|
||||
uint32_t numBoxes = 0;
|
||||
uint32_t heatmapSize = 0;
|
||||
uint32_t numKeypoints = 0;
|
||||
uint32_t boxInfoLength = 4;
|
||||
uint32_t output_score_index = 0;
|
||||
uint32_t output_keypoint_index = 0;
|
||||
|
||||
/* prepare data */
|
||||
for (i = 0; i < _INPUT_NUM; i ++)
|
||||
{
|
||||
input[i] = (vsi_nn_kernel_tensor_t)param[i];
|
||||
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
|
||||
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
|
||||
}
|
||||
|
||||
for (i = 0; i < _OUTPUT_NUM; i ++)
|
||||
{
|
||||
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
|
||||
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
|
||||
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
|
||||
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
|
||||
out_bytes[i] = out_elements[i] * sizeof(float);
|
||||
|
||||
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
|
||||
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
|
||||
memset( f32_out_buffer[i], 0, out_bytes[i] );
|
||||
}
|
||||
|
||||
numBoxes = in_attr[0]->shape->data[3];
|
||||
heatmapSize = in_attr[0]->shape->data[2];
|
||||
numKeypoints = in_attr[0]->shape->data[0];
|
||||
|
||||
for(i = 0; i < numBoxes; i++)
|
||||
{
|
||||
for (j = 0; j < numKeypoints; j++)
|
||||
{
|
||||
uint32_t maxIndex = 0;
|
||||
float maxScore = -FLT_MAX;
|
||||
uint32_t maxIndexWidth;
|
||||
uint32_t maxIndexHeight;
|
||||
float localGrid[3][3] = {{0}};
|
||||
int32_t dh, dw;
|
||||
float delta[2] = {0.0f, 0.0f}, deltaScore;
|
||||
float wRoiStart = f32_in_buffer[1][i * boxInfoLength];
|
||||
float hRoiStart = f32_in_buffer[1][i * boxInfoLength + 1];
|
||||
float wRoiEnd = f32_in_buffer[1][i * boxInfoLength + 2];
|
||||
float hRoiEnd = f32_in_buffer[1][i * boxInfoLength + 3];
|
||||
float roiWidth = wRoiEnd - wRoiStart;
|
||||
float roiHeight = hRoiEnd - hRoiStart;
|
||||
float wRelativePos;
|
||||
float hRelativePos;
|
||||
for (k = 0; k < heatmapSize * heatmapSize; k++)
|
||||
{
|
||||
uint32_t index = i * heatmapSize * heatmapSize * numKeypoints
|
||||
+ k * numKeypoints + j;
|
||||
float val = f32_in_buffer[0][index];
|
||||
if (maxScore < val)
|
||||
{
|
||||
maxScore = val;
|
||||
maxIndex = k;
|
||||
}
|
||||
}
|
||||
maxIndexWidth = maxIndex % heatmapSize;
|
||||
maxIndexHeight = maxIndex / heatmapSize;
|
||||
|
||||
// get local 3x3 grid
|
||||
for (dh = -1; dh <= 1; dh++)
|
||||
{
|
||||
for (dw = -1; dw <= 1; dw++)
|
||||
{
|
||||
// cast uint32_t to int32_t
|
||||
int32_t h = (int32_t)(maxIndexHeight) + dh;
|
||||
int32_t w = (int32_t)(maxIndexWidth) + dw;
|
||||
uint32_t heatmapIndex;
|
||||
|
||||
// use mirroring for out of bound indexing
|
||||
// need to ensure heatmapSize >= 2
|
||||
h = h < 0 ? 1 : (h >= (int32_t)heatmapSize ? heatmapSize - 2 : h);
|
||||
w = w < 0 ? 1 : (w >= (int32_t)heatmapSize ? heatmapSize - 2 : w);
|
||||
|
||||
heatmapIndex = i * heatmapSize * heatmapSize * numKeypoints +
|
||||
(uint32_t)(h) * heatmapSize * numKeypoints +
|
||||
(uint32_t)(w) * numKeypoints + j;
|
||||
localGrid[dh + 1][dw + 1] = f32_in_buffer[0][heatmapIndex];
|
||||
}
|
||||
}
|
||||
deltaScore = maxScore;
|
||||
_solve_for_delta((const float (*)[3])localGrid, delta, &deltaScore, 1e-3f, 1e-3f);
|
||||
|
||||
wRelativePos = ((float)(maxIndexWidth) + delta[0] + 0.5f) /
|
||||
(float)(heatmapSize);
|
||||
hRelativePos = ((float)(maxIndexHeight) + delta[1] + 0.5f) /
|
||||
(float)(heatmapSize);
|
||||
f32_out_buffer[0][output_score_index] = deltaScore;
|
||||
f32_out_buffer[1][output_keypoint_index] = wRelativePos * roiWidth + wRoiStart;
|
||||
f32_out_buffer[1][output_keypoint_index + 1] = hRelativePos * roiHeight + hRoiStart;
|
||||
output_score_index++;
|
||||
output_keypoint_index += 2;
|
||||
}
|
||||
}
|
||||
|
||||
/* save data */
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
|
||||
f32_out_buffer[i], out_elements[i] );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
final:
|
||||
for (i = 0; i < _INPUT_NUM; i++)
|
||||
{
|
||||
vsi_nn_safe_free(f32_in_buffer[i]);
|
||||
|
||||
if (in_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
|
||||
}
|
||||
}
|
||||
for (i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
vsi_nn_safe_free(f32_out_buffer[i]);
|
||||
|
||||
if (out_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _compute() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
/* Add extra params */
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_SUCCESS;
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
|
||||
kernel->info.function = _compute;
|
||||
kernel->info.parameters = _heatmap_max_keypoint_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _heatmap_max_keypoint_kernel_param_def );
|
||||
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_HEATMAP_MAX_KEYPOINT_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _HEATMAP_MAX_KEYPOINT_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _HEATMAP_MAX_KEYPOINT_PARAM_NUM );
|
||||
}
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( heatmap_max_keypoint, _setup )
@@ -0,0 +1,285 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2019 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vsi_nn_vxkernel.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _CPU_ARG_NUM (3)
|
||||
#define _CPU_INPUT_NUM (3)
|
||||
#define _CPU_OUTPUT_NUM (1)
|
||||
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
|
||||
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.scatter_nd_update")
|
||||
|
||||
DEF_KERNEL_EXECUTOR(_scatter_nd_update_exec)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
|
||||
uint32_t * para_buffer[1] = { NULL };
|
||||
uint32_t * mask = NULL;
|
||||
float * buffer[3] = { NULL };
|
||||
size_t out_elements = 0;
|
||||
vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL };
|
||||
int32_t i = 0, j = 0;
|
||||
int32_t block_size = 1, indices_num = 1;
|
||||
int32_t coord_dim = 1;
|
||||
int32_t mask_len = 0;
|
||||
|
||||
tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; // ref
|
||||
tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; // idx int
|
||||
tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; // update
|
||||
tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; // output
|
||||
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
|
||||
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
|
||||
attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
|
||||
attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final );
|
||||
|
||||
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] );
|
||||
|
||||
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
|
||||
|
||||
para_buffer[0] = (uint32_t*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE );
|
||||
CHECK_PTR_FAIL_GOTO( para_buffer[0], "Create input1 buffer fail.", final );
|
||||
|
||||
buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( buffer[1], "Create input2 buffer fail.", final );
|
||||
|
||||
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &(block_size));
|
||||
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &(coord_dim));
|
||||
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &(indices_num));
|
||||
|
||||
buffer[2] = (float *)malloc( out_elements * sizeof(float) );
|
||||
CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final );
|
||||
memcpy( buffer[2], buffer[0], out_elements * sizeof(float) );
|
||||
|
||||
mask_len = (int32_t)out_elements / block_size;
|
||||
mask = (uint32_t *)malloc( mask_len * sizeof(uint32_t) );
CHECK_PTR_FAIL_GOTO( mask, "Create mask buffer fail.", final );
|
||||
memset(mask, 0, mask_len * sizeof(uint32_t));
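/* mask marks output blocks that an update has already touched: the first update to a
 * block zeroes it, and later updates to the same block accumulate into it. */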
|
||||
|
||||
if (coord_dim <= 5)
|
||||
{
|
||||
int32_t stride[5] = {0, 0, 0, 0, 0};
|
||||
int32_t new_shape[5] = {1, 1, 1, 1, 1};
|
||||
int32_t merge_dim = (int32_t)attr[3]->shape->size - coord_dim + 1;
|
||||
|
||||
for(i = 0; i < merge_dim; ++i)
|
||||
{
|
||||
new_shape[0] *= attr[3]->shape->data[i];
|
||||
}
|
||||
stride[0] = new_shape[0] / block_size;
|
||||
|
||||
for(i = 1; i < coord_dim; ++i)
|
||||
{
|
||||
new_shape[i] = attr[3]->shape->data[merge_dim + i - 1];
|
||||
|
||||
stride[i] = stride[i - 1] * new_shape[i];
|
||||
}
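/* The leading (rank - coord_dim + 1) output dims are collapsed into new_shape[0];
 * stride[0] counts blocks in that collapsed dim and stride[j] = stride[j-1] * new_shape[j],
 * so coordinate j (stored innermost-first) contributes coord[j] * stride[j-1] to the flat block index. */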
|
||||
|
||||
for(i = 0; i < indices_num; i++)
|
||||
{
|
||||
uint32_t in_index = i * block_size;
|
||||
uint32_t out_index = 0;
|
||||
uint32_t coord[5] = {0};
|
||||
int32_t byd_flg = 0;
|
||||
int32_t mask_idx = 0;
|
||||
|
||||
for(j = 0; j < coord_dim; j++)
|
||||
{
|
||||
coord[j] = para_buffer[0][i * coord_dim + coord_dim - j - 1];
|
||||
if (coord[j] >= (uint32_t)new_shape[j])
|
||||
{
|
||||
byd_flg = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (byd_flg)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
mask_idx = coord[4] * stride[3] + coord[3] * stride[2] +
|
||||
coord[2] * stride[1] + coord[1] * stride[0] + coord[0];
|
||||
out_index = mask_idx * block_size;
|
||||
if (mask[mask_idx] == 0)
|
||||
{
|
||||
memset(buffer[2] + out_index, 0, block_size * sizeof(float));
|
||||
mask[mask_idx] = 1;
|
||||
}
|
||||
for(j = 0; j < block_size; j++)
|
||||
{
|
||||
buffer[2][out_index + j] += buffer[1][in_index + j];
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
status = VSI_FAILURE;
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
|
||||
status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3],
|
||||
buffer[2], out_elements );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
final:
|
||||
if ( para_buffer[0] )
|
||||
{
|
||||
free( para_buffer[0] );
|
||||
}
|
||||
|
||||
if (mask)
|
||||
{
|
||||
free(mask);
|
||||
}
|
||||
for( i = 0; i < 3; i ++ )
|
||||
{
|
||||
if ( buffer[i] )
|
||||
{
|
||||
free( buffer[i] );
|
||||
}
|
||||
}
|
||||
for( i = 0; i < 4; i ++ )
|
||||
{
|
||||
if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
|
||||
}
|
||||
return status;
|
||||
} /* _scatter_nd_update_exec() */
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _scatter_nd_update_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
// Add kernel parameters here
|
||||
};
|
||||
|
||||
static const vx_kernel_description_t _kernel_info =
|
||||
{
|
||||
KERNEL_ID_PLACEHOLDER,
|
||||
_KERNEL_NAME,
|
||||
_scatter_nd_update_exec,
|
||||
_scatter_nd_update_kernel_param_def,
|
||||
_cnt_of_array( _scatter_nd_update_kernel_param_def ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vsi_nn_KernelInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_tensor_t* const* const inputs,
|
||||
vsi_nn_tensor_t* const* const outputs,
|
||||
vsi_nn_kernel_t* kernel
|
||||
)
|
||||
{
|
||||
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
|
||||
return VSI_SUCCESS;
|
||||
} /* _query_kernel() */
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
|
||||
int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" );
|
||||
int32_t idx_num = vsi_nn_kernel_param_get_int32( params, "idx_num" );
|
||||
|
||||
status = _query_kernel( inputs, outputs, kernel );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
uint32_t index = 4;
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
|
||||
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
|
||||
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size );
|
||||
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim );
|
||||
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &idx_num );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
|
||||
CHECK_STATUS( status );
|
||||
vsi_nn_kernel_scalar_release( &backend_params[4] );
|
||||
vsi_nn_kernel_scalar_release( &backend_params[5] );
|
||||
vsi_nn_kernel_scalar_release( &backend_params[6] );
|
||||
}
|
||||
else
|
||||
{
|
||||
status = VSI_FAILURE;
|
||||
}
|
||||
}
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( scatter_nd_update, _setup )
@@ -0,0 +1,289 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _INPUT_NUM (1)
|
||||
#define _OUTPUT_NUM (1)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.signal_frame")
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _signal_frame_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
// Add kernel parameters here
|
||||
};
|
||||
#define _SIGNAL_FRAME_PARAM_NUM _cnt_of_array( _signal_frame_kernel_param_def )
|
||||
#define FRAME_LENGHT (2)
|
||||
#define FRAME_STEP (3)
|
||||
#define AXIS (4)
|
||||
#define PAD_END (5)
|
||||
#define PAD_VAL (6)
|
||||
|
||||
/*
|
||||
* Kernel function
|
||||
*/
|
||||
DEF_KERNEL_EXECUTOR(_compute)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
|
||||
float *f32_in_buffer[_INPUT_NUM] = {NULL};
|
||||
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
|
||||
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
|
||||
size_t out_elements[_OUTPUT_NUM] = {0};
|
||||
size_t out_bytes[_OUTPUT_NUM] = {0};
|
||||
int32_t i = 0;
|
||||
int32_t j = 0;
|
||||
int32_t k = 0;
|
||||
int32_t frame_length = 0;
|
||||
int32_t frame_step = 0;
|
||||
int32_t axis = 0;
|
||||
int32_t pad_end = 0;
|
||||
int32_t length_samples = 0;
|
||||
int32_t num_frames = 0;
|
||||
int32_t inner_dim = 1;
|
||||
int32_t outer_dim = 1;
|
||||
int32_t inner_size = 1;
|
||||
float pad_val = 0;
|
||||
|
||||
/* prepare data */
|
||||
for (i = 0; i < _INPUT_NUM; i ++)
|
||||
{
|
||||
input[i] = (vsi_nn_kernel_tensor_t)param[i];
|
||||
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
|
||||
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
|
||||
}
|
||||
for (i = 0; i < _OUTPUT_NUM; i ++)
|
||||
{
|
||||
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
|
||||
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
|
||||
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
|
||||
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
|
||||
out_bytes[i] = out_elements[i] * sizeof(float);
|
||||
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
|
||||
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
|
||||
memset( f32_out_buffer[i], 0, out_bytes[i] );
|
||||
}
|
||||
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[FRAME_LENGHT], &frame_length);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[FRAME_STEP], &frame_step);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[AXIS], &axis);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[PAD_END], &pad_end);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[PAD_VAL], &pad_val);
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
for (i = 0; i < axis; i++)
|
||||
{
|
||||
inner_dim *= in_attr[0]->shape->data[i];
|
||||
}
|
||||
length_samples = in_attr[0]->shape->data[axis];
|
||||
for (i = axis + 1; i < (int32_t)in_attr[0]->shape->size; i++)
|
||||
{
|
||||
outer_dim *= in_attr[0]->shape->data[i];
|
||||
}
|
||||
|
||||
for (i = 0; i < axis + 1; i++)
|
||||
{
|
||||
inner_size *= out_attr[0]->shape->data[i];
|
||||
}
|
||||
|
||||
num_frames = (length_samples + frame_step - 1) / frame_step;
|
||||
num_frames = pad_end ? num_frames : (length_samples - frame_length) / frame_step + 1;
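/* With pad_end the signal yields ceil(length_samples / frame_step) frames and samples past
 * the end are filled with pad_val; otherwise only frames that fit entirely inside the signal are emitted. */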
|
||||
|
||||
for (i = 0; i < outer_dim; i++)
|
||||
{
|
||||
float * src_ptr = f32_in_buffer[0] + i * length_samples * inner_dim;
|
||||
float * dst_ptr = f32_out_buffer[0] + i * num_frames * frame_length * inner_dim;
|
||||
|
||||
for (j = 0; j < num_frames; j++)
|
||||
{
|
||||
for (k = 0; k < frame_length; k++)
|
||||
{
|
||||
int32_t m = j * frame_step + k;
|
||||
|
||||
if (pad_end)
|
||||
{
|
||||
if (m >= length_samples)
|
||||
{
|
||||
int32_t l = 0;
|
||||
for (l = 0; l < inner_dim; l++)
|
||||
{
|
||||
(dst_ptr + (j * frame_length + k) * inner_dim)[l] = pad_val;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy(dst_ptr + (j * frame_length + k) * inner_dim, src_ptr + m * inner_dim,
|
||||
inner_dim * sizeof(float));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy(dst_ptr + (j * frame_length + k) * inner_dim, src_ptr + m * inner_dim,
|
||||
inner_dim * sizeof(float));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* save data */
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
|
||||
f32_out_buffer[i], out_elements[i] );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
|
||||
final:
|
||||
for (i = 0; i < _INPUT_NUM; i++)
|
||||
{
|
||||
if (f32_in_buffer[i])
|
||||
{
|
||||
free(f32_in_buffer[i]);
|
||||
f32_in_buffer[i] = NULL;
|
||||
}
|
||||
if (in_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
|
||||
}
|
||||
}
|
||||
for (i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
if (f32_out_buffer[i])
|
||||
{
|
||||
free(f32_out_buffer[i]);
|
||||
f32_out_buffer[i] = NULL;
|
||||
}
|
||||
if (out_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _compute() */
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
/* Add extra params */
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
|
||||
kernel->info.function = _compute;
|
||||
kernel->info.parameters = _signal_frame_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _signal_frame_kernel_param_def );
|
||||
status = VSI_SUCCESS;
|
||||
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_SIGNAL_FRAME_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
int32_t frame_length = vsi_nn_kernel_param_get_int32( params, "frame_length" );
|
||||
int32_t frame_step = vsi_nn_kernel_param_get_int32( params, "frame_step" );
|
||||
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
|
||||
int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" );
|
||||
float pad_val = vsi_nn_kernel_param_get_float32( params, "pad_val" );
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _SIGNAL_FRAME_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
/* Pass parameters to node. */
|
||||
node_params[FRAME_LENGHT] = vsi_nn_kernel_scalar_create( graph, I32, &frame_length );
|
||||
node_params[FRAME_STEP] = vsi_nn_kernel_scalar_create( graph, I32, &frame_step );
|
||||
node_params[AXIS] = vsi_nn_kernel_scalar_create( graph, I32, &axis );
|
||||
node_params[PAD_END] = vsi_nn_kernel_scalar_create( graph, I32, &pad_end );
|
||||
node_params[PAD_VAL] = vsi_nn_kernel_scalar_create( graph, F32, &pad_val );
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _SIGNAL_FRAME_PARAM_NUM );
|
||||
vsi_nn_kernel_scalar_release( &node_params[FRAME_LENGHT] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[FRAME_STEP] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[AXIS] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[PAD_END] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[PAD_VAL] );
|
||||
}
|
||||
}
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( signal_frame, _setup )
@@ -0,0 +1,389 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _INPUT_NUM (2)
|
||||
#define _OUTPUT_NUM (1)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.spatial_transformer")
|
||||
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _spatial_transformer_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _SPATIAL_TRANSFORMER_PARAM_NUM _cnt_of_array( _spatial_transformer_kernel_param_def )
|
||||
#define HAS_THETA_1_1 (3)
|
||||
#define HAS_THETA_1_2 (4)
|
||||
#define HAS_THETA_1_3 (5)
|
||||
#define HAS_THETA_2_1 (6)
|
||||
#define HAS_THETA_2_2 (7)
|
||||
#define HAS_THETA_2_3 (8)
|
||||
#define THETA_1_1 (9)
|
||||
#define THETA_1_2 (10)
|
||||
#define THETA_1_3 (11)
|
||||
#define THETA_2_1 (12)
|
||||
#define THETA_2_2 (13)
|
||||
#define THETA_2_3 (14)
|
||||
#define ALIGN_CORNERS (15)
|
||||
|
||||
static void _transform_affine(int32_t dst_x, int32_t dst_y, const float m[], float *src_x, float *src_y)
|
||||
{
|
||||
*src_x = dst_x * m[0] + dst_y * m[2] + m[4];
|
||||
*src_y = dst_x * m[1] + dst_y * m[3] + m[5];
|
||||
}
|
||||
|
||||
static float _read_pixel(float *base, vsi_nn_kernel_tensor_attr_t *attr,
|
||||
float x, float y, int32_t z, int32_t b)
|
||||
{
|
||||
vsi_bool out_of_bounds = (x < 0 || y < 0 || x >= attr->shape->data[0] || y >= attr->shape->data[1]);
|
||||
int32_t bx, by;
|
||||
int32_t offset = (b * attr->shape->data[2] + z) * attr->shape->data[0] * attr->shape->data[1];
|
||||
float pixel = 0;
|
||||
|
||||
if (out_of_bounds)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
// bounded x/y
|
||||
bx = (int32_t)x;
|
||||
by = (int32_t)y;
|
||||
|
||||
pixel = base[attr->shape->data[0] * by + bx + offset];
|
||||
|
||||
return pixel;
|
||||
}
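/* Out-of-bounds reads return 0, so the bilinear interpolation in _compute effectively
 * zero-pads the input image border. */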
|
||||
|
||||
/*
|
||||
* Kernel function
|
||||
*/
|
||||
DEF_KERNEL_EXECUTOR(_compute)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
|
||||
float *f32_in_buffer[_INPUT_NUM] = {NULL};
|
||||
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
|
||||
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
|
||||
size_t out_elements[_OUTPUT_NUM] = {0};
|
||||
size_t out_bytes[_OUTPUT_NUM] = {0};
|
||||
int32_t i = 0;
|
||||
int32_t b = 0;
|
||||
int32_t c = 0;
|
||||
int32_t j = 0;
|
||||
int32_t x = 0;
|
||||
int32_t y = 0;
|
||||
int32_t has_theta[6] = {0};
|
||||
int32_t batch = 1;
|
||||
int32_t depth = 1;
|
||||
int32_t height = 1;
|
||||
int32_t width = 1;
|
||||
int32_t input_height = 1;
|
||||
int32_t input_width = 1;
|
||||
int32_t rank = 0;
|
||||
int32_t index = 0;
|
||||
int32_t align_corners = 0;
|
||||
float theta[6] = {0};
|
||||
|
||||
/* prepare data */
|
||||
for (i = 0; i < _INPUT_NUM; i ++)
|
||||
{
|
||||
input[i] = (vsi_nn_kernel_tensor_t)param[i];
|
||||
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
|
||||
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
|
||||
}
|
||||
for (i = 0; i < _OUTPUT_NUM; i ++)
|
||||
{
|
||||
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
|
||||
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
|
||||
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
|
||||
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
|
||||
out_bytes[i] = out_elements[i] * sizeof(float);
|
||||
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
|
||||
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
|
||||
memset( f32_out_buffer[i], 0, out_bytes[i] );
|
||||
}
|
||||
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_1_1], &has_theta[0]);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_1_2], &has_theta[1]);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_1_3], &has_theta[2]);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_2_1], &has_theta[3]);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_2_2], &has_theta[4]);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[HAS_THETA_2_3], &has_theta[5]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_1], &theta[0]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_2], &theta[1]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_3], &theta[2]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_1], &theta[3]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_2], &theta[4]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_3], &theta[5]);
|
||||
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[ALIGN_CORNERS], &align_corners);
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
rank = (int32_t)out_attr[0]->shape->size;
|
||||
width = out_attr[0]->shape->data[0];
|
||||
height = out_attr[0]->shape->data[1];
|
||||
depth = rank > 2 ? out_attr[0]->shape->data[2] : 1;
|
||||
batch = rank > 3 ? out_attr[0]->shape->data[3] : 1;
|
||||
|
||||
input_width = in_attr[0]->shape->data[0];
|
||||
input_height = in_attr[0]->shape->data[1];
|
||||
|
||||
for (b = 0; b < batch; b++)
|
||||
{
|
||||
float _w = (float)input_width;
|
||||
float _h = (float)input_height;
|
||||
float w = (float)width;
|
||||
float h = (float)height;
|
||||
float matrix_m[6] = {0};
|
||||
j = 0;
|
||||
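/* Theta values not fixed as scalar params (has_theta[i] == 0) are read, in order,
 * from the second input tensor for the current batch. */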
for (i = 0; i < 6; i++)
|
||||
{
|
||||
if (has_theta[i] == 0)
|
||||
{
|
||||
theta[i] = f32_in_buffer[1][b * in_attr[1]->shape->data[0] + j];
|
||||
j ++;
|
||||
}
|
||||
}
|
||||
|
||||
if (align_corners && w > 1)
|
||||
{
|
||||
w = w - 1;
|
||||
}
|
||||
|
||||
if (align_corners && h > 1)
|
||||
{
|
||||
h = h - 1;
|
||||
}
|
||||
|
||||
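/* Map the 2x3 affine parameters from normalized coordinates to the pixel-space
 * matrix consumed by _transform_affine: the scale/shear terms are adjusted by the
 * input/output size ratio, and matrix_m[4]/matrix_m[5] appear to carry the
 * translation term. */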
matrix_m[0] = theta[4] * _w / w;
|
||||
matrix_m[2] = theta[3] * _w / h;
|
||||
matrix_m[4] = (theta[5] - theta[4] - theta[3] + 1) * _w * 0.5f;
|
||||
matrix_m[1] = theta[1] * _h / w;
|
||||
matrix_m[3] = theta[0] * _h / h;
|
||||
matrix_m[5] = (theta[2] - theta[1] - theta[0] + 1) * _h * 0.5f;
|
||||
for (c = 0; c < depth; c++)
|
||||
{
|
||||
for (y = 0; y < height; y++)
|
||||
{
|
||||
for (x = 0; x < width; x++)
|
||||
{
|
||||
float xf = 0;
|
||||
float yf = 0;
|
||||
float tl = 0, tr = 0, bl = 0, br = 0;
|
||||
float ar = 0, ab = 0, al = 0, at = 0;
|
||||
|
||||
_transform_affine(x, y, matrix_m, &xf, &yf);
|
||||
|
||||
xf = xf < 0 ? xf - 1 : xf;
|
||||
yf = yf < 0 ? yf - 1 : yf;
|
||||
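/* Bilinear interpolation: the fractional parts of the sampling point give the
 * right/bottom weights (ar, ab), their complements the left/top weights, and the
 * four neighbouring pixels fetched by _read_pixel are blended with them below. */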
ar = xf - floorf(xf);
|
||||
ab = yf - floorf(yf);
|
||||
al = 1.0f - ar;
|
||||
at = 1.0f - ab;
|
||||
|
||||
tl = _read_pixel(f32_in_buffer[0], in_attr[0], floorf(xf), floorf(yf), c, b);
|
||||
tr = _read_pixel(f32_in_buffer[0], in_attr[0], floorf(xf) + 1, floorf(yf), c, b);
|
||||
bl = _read_pixel(f32_in_buffer[0], in_attr[0], floorf(xf), floorf(yf) + 1, c, b);
|
||||
br = _read_pixel(f32_in_buffer[0], in_attr[0], floorf(xf) + 1, floorf(yf) + 1, c, b);
|
||||
|
||||
f32_out_buffer[0][index ++] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* save data */
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
|
||||
f32_out_buffer[i], out_elements[i] );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
final:
|
||||
for (i = 0; i < _INPUT_NUM; i++)
|
||||
{
|
||||
if (f32_in_buffer[i])
|
||||
{
|
||||
free(f32_in_buffer[i]);
|
||||
f32_in_buffer[i] = NULL;
|
||||
}
|
||||
if (in_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
|
||||
}
|
||||
}
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
if (f32_out_buffer[i])
|
||||
{
|
||||
free(f32_out_buffer[i]);
|
||||
f32_out_buffer[i] = NULL;
|
||||
}
|
||||
if (out_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _compute() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
/* Add extra params */
|
||||
)
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
|
||||
kernel->info.function = _compute;
|
||||
kernel->info.parameters = _spatial_transformer_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _spatial_transformer_kernel_param_def );
|
||||
|
||||
return VSI_SUCCESS;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_SPATIAL_TRANSFORMER_PARAM_NUM];
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
int32_t has_theta_1_1 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_1" );
|
||||
int32_t has_theta_1_2 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_2" );
|
||||
int32_t has_theta_1_3 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_3" );
|
||||
int32_t has_theta_2_1 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_1" );
|
||||
int32_t has_theta_2_2 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_2" );
|
||||
int32_t has_theta_2_3 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_3" );
|
||||
float theta_1_1 = vsi_nn_kernel_param_get_float32( params, "theta_1_1" );
|
||||
float theta_1_2 = vsi_nn_kernel_param_get_float32( params, "theta_1_2" );
|
||||
float theta_1_3 = vsi_nn_kernel_param_get_float32( params, "theta_1_3" );
|
||||
float theta_2_1 = vsi_nn_kernel_param_get_float32( params, "theta_2_1" );
|
||||
float theta_2_2 = vsi_nn_kernel_param_get_float32( params, "theta_2_2" );
|
||||
float theta_2_3 = vsi_nn_kernel_param_get_float32( params, "theta_2_3" );
|
||||
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _SPATIAL_TRANSFORMER_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
node_params[HAS_THETA_1_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_1 );
|
||||
node_params[HAS_THETA_1_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_2 );
|
||||
node_params[HAS_THETA_1_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_3 );
|
||||
node_params[HAS_THETA_2_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_1 );
|
||||
node_params[HAS_THETA_2_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_2 );
|
||||
node_params[HAS_THETA_2_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_3 );
|
||||
node_params[THETA_1_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_1 );
|
||||
node_params[THETA_1_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_2 );
|
||||
node_params[THETA_1_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_3 );
|
||||
node_params[THETA_2_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_1 );
|
||||
node_params[THETA_2_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_2 );
|
||||
node_params[THETA_2_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_3 );
|
||||
node_params[ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _SPATIAL_TRANSFORMER_PARAM_NUM );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_1] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_2] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_1] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_2] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_1_1] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_1_2] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_1_3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_2_1] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_2_2] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_2_3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[ALIGN_CORNERS] );
|
||||
}
|
||||
}
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( spatial_transformer, _setup )
|
||||
|
|
@ -0,0 +1,185 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _INPUT_NUM (1)
|
||||
#define _OUTPUT_NUM (1)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.sync_host")
|
||||
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _sync_host_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _SYNC_HOST_PARAM_NUM _cnt_of_array( _sync_host_kernel_param_def )
|
||||
|
||||
|
||||
/*
|
||||
* Kernel function
|
||||
*/
|
||||
DEF_KERNEL_EXECUTOR(_compute)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
|
||||
void *in_buffer[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
|
||||
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
|
||||
size_t out_bytes[_OUTPUT_NUM] = {0};
|
||||
uint32_t i = 0;
|
||||
|
||||
/* prepare data */
|
||||
for (i = 0; i < _INPUT_NUM; i ++)
|
||||
{
|
||||
input[i] = (vsi_nn_kernel_tensor_t)param[i];
|
||||
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
|
||||
in_buffer[i] = vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], FALSE );
|
||||
CHECK_PTR_FAIL_GOTO( in_buffer[i], "Create input buffer fail.", final );
|
||||
}
|
||||
|
||||
for(i = 0; i < _OUTPUT_NUM; i ++)
|
||||
{
|
||||
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
|
||||
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
|
||||
out_bytes[i] = vsi_nn_kernel_tensor_attr_get_bytes( out_attr[i] );
|
||||
}
|
||||
|
||||
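/* sync_host is a pass-through: the input bytes read above (as raw data, without
 * float conversion) are written directly to the output tensor. */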
/* save data */
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
status = vsi_nn_kernel_tensor_write( output[i], out_attr[i],
|
||||
in_buffer[i], out_bytes[i] );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
|
||||
final:
|
||||
for (i = 0; i < _INPUT_NUM; i++)
|
||||
{
|
||||
if (in_buffer[i])
|
||||
{
|
||||
free(in_buffer[i]);
|
||||
in_buffer[i] = NULL;
|
||||
}
|
||||
if (in_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
|
||||
}
|
||||
}
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
if (out_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _compute() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
/* Add extra params */
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
|
||||
kernel->info.function = _compute;
|
||||
kernel->info.parameters = _sync_host_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _sync_host_kernel_param_def );
|
||||
status = VSI_SUCCESS;
|
||||
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_SYNC_HOST_PARAM_NUM];
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _SYNC_HOST_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _SYNC_HOST_PARAM_NUM );
|
||||
}
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( sync_host, _setup )
|
||||
|
|
@ -0,0 +1,219 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define _INPUT_NUM (2)
|
||||
#define _OUTPUT_NUM (1)
|
||||
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.tensorstackconcat")
|
||||
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _tensorstackconcat_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _TENSORSTACKCONCAT_PARAM_NUM _cnt_of_array( _tensorstackconcat_kernel_param_def )
|
||||
|
||||
|
||||
/*
|
||||
* Kernel function
|
||||
*/
|
||||
DEF_KERNEL_EXECUTOR(_compute)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
|
||||
float *f32_in_buffer[_INPUT_NUM] = {NULL};
|
||||
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
|
||||
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
|
||||
size_t out_elements[_OUTPUT_NUM] = {0};
|
||||
uint32_t i = 0;
|
||||
uint32_t depth = 0;
|
||||
uint32_t height = 1;
|
||||
uint32_t width = 0;
|
||||
uint32_t index = 0;
|
||||
uint32_t c = 0, y = 0, x = 0;
|
||||
|
||||
/* prepare data */
|
||||
for (i = 0; i < _INPUT_NUM; i ++)
|
||||
{
|
||||
input[i] = (vsi_nn_kernel_tensor_t)param[i];
|
||||
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
|
||||
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
|
||||
}
|
||||
|
||||
for (i = 0; i < _OUTPUT_NUM; i ++)
|
||||
{
|
||||
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
|
||||
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
|
||||
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
|
||||
f32_out_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( output[i], out_attr[i], TRUE );
|
||||
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
|
||||
}
|
||||
|
||||
depth = in_attr[0]->shape->data[2];
|
||||
height = in_attr[0]->shape->data[1];
|
||||
width = in_attr[0]->shape->data[0];
|
||||
index = (int32_t)f32_in_buffer[1][0];
|
||||
|
||||
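/* The scalar held in the second input selects the destination row: every element
 * of the first input is copied into row `index` of the matching channel of the
 * output. */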
for (c = 0; c < depth; c++)
|
||||
{
|
||||
for (y = 0; y < height; y++)
|
||||
{
|
||||
for (x = 0; x < width; x++)
|
||||
{
|
||||
int32_t i_idx = c * width * height + y * width + x;
|
||||
int32_t o_idx = (c * out_attr[0]->shape->data[1] + index ) * out_attr[0]->shape->data[0] + x;
|
||||
float value = f32_in_buffer[0][i_idx];
|
||||
|
||||
f32_out_buffer[0][o_idx] = value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* save data */
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
|
||||
f32_out_buffer[i], out_elements[i] );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
|
||||
final:
|
||||
for (i = 0; i < _INPUT_NUM; i++)
|
||||
{
|
||||
if (f32_in_buffer[i])
|
||||
{
|
||||
free(f32_in_buffer[i]);
|
||||
f32_in_buffer[i] = NULL;
|
||||
}
|
||||
if (in_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
|
||||
}
|
||||
}
|
||||
for(i = 0; i < _OUTPUT_NUM; i++)
|
||||
{
|
||||
if (f32_out_buffer[i])
|
||||
{
|
||||
free(f32_out_buffer[i]);
|
||||
f32_out_buffer[i] = NULL;
|
||||
}
|
||||
if (out_attr[i])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _compute() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
/* Add extra params */
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
|
||||
kernel->info.function = _compute;
|
||||
kernel->info.parameters = _tensorstackconcat_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _tensorstackconcat_kernel_param_def );
|
||||
|
||||
status = VSI_SUCCESS;
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_TENSORSTACKCONCAT_PARAM_NUM];
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _TENSORSTACKCONCAT_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _TENSORSTACKCONCAT_PARAM_NUM );
|
||||
}
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_CPU( tensorstackconcat, _setup )
|
||||
|
|
@ -79,8 +79,8 @@ DEF_KERNEL_EXECUTOR(_compute)
|
|||
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
|
||||
float *f32_in_buffer[_INPUT_NUM] = {NULL};
|
||||
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
|
||||
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
|
||||
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
|
||||
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
|
||||
size_t out_elements[_OUTPUT_NUM] = {0};
|
||||
size_t out_bytes[_OUTPUT_NUM] = {0};
|
||||
|
|
|
|||
|
|
@ -114,7 +114,7 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer)
|
|||
{0, 0, 0}
|
||||
};
|
||||
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
|
||||
vsi_nn_kernel_tensor_attr_t * input1_attr = NULL;
|
||||
vsi_int_array_t * in_shape = NULL;
|
||||
float logE = (float)(log10(exp(1.0f)) / log10(2.0f));
|
||||
float scaleIn0 = 1.0f;
|
||||
|
|
@ -224,6 +224,7 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer)
|
|||
final:
|
||||
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
|
||||
SAFE_FREE_TENSOR_ATTR(input_attr);
|
||||
SAFE_FREE_TENSOR_ATTR(input1_attr);
|
||||
|
||||
return status;
|
||||
} /* _detect_post_box_initializer() */
|
||||
|
|
|
|||
|
|
@ -49,6 +49,8 @@ typedef enum
|
|||
UNARY_HSIGMOID,
|
||||
UNARY_MISH,
|
||||
UNARY_ROUND,
|
||||
UNARY_GELU,
|
||||
UNARY_HGELU,
|
||||
} unary_type_e;
|
||||
|
||||
/*
|
||||
|
|
@ -84,6 +86,8 @@ typedef enum
|
|||
#define HSIGMOID_OPERATION hard_sigmoid
|
||||
#define MISH_OPERATION mish
|
||||
#define ROUND_OPERATION round
|
||||
#define GELU_OPERATION gelu
|
||||
#define HGELU_OPERATION hard_gelu
|
||||
|
||||
static const struct {
|
||||
uint32_t key;
|
||||
|
|
@ -274,6 +278,42 @@ static const struct {
|
|||
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, I8 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, F16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16 , KERNEL_SOURCE_2D)
|
||||
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, F16, F16 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, F16, I16 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, F16, U8 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, F16, I8 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, I16, I16 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, I16, F16 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, U8, U8 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, U8, F16 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, I8, I8 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, I8, F16 , KERNEL_SOURCE_3D)
|
||||
TENSOR_UNARY_KERNELS(GELU_OPERATION, UNARY_GELU, BF16, BF16 , KERNEL_SOURCE_3D)
|
||||
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, F16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, U8 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, F16, I8 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, I16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I16, F16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, U8 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, U8, F16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, I8 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, I8, F16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(GELU_OPERATION, UNARY_GELU, BF16, BF16 , KERNEL_SOURCE_2D)
|
||||
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, F16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, U8 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, F16, I8 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, I16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I16, F16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, U8 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, U8, F16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, I8 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, I8, F16 , KERNEL_SOURCE_2D)
|
||||
TENSOR_UNARY_KERNELS_2D(HGELU_OPERATION, UNARY_HGELU, BF16, BF16 , KERNEL_SOURCE_2D)
|
||||
};
|
||||
|
||||
#undef SIN_OPERATION
|
||||
|
|
@ -284,6 +324,8 @@ static const struct {
|
|||
#undef HSIGMOID_OPERATION
|
||||
#undef MISH_OPERATION
|
||||
#undef ROUND_OPERATION
|
||||
#undef GELU_OPERATION
|
||||
#undef HGELU_OPERATION
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
|
|
@ -403,6 +445,8 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
|
|||
case _PACK_SELECT_KEY( UNARY_HSIGMOID, BF16, BF16 ):
|
||||
case _PACK_SELECT_KEY( UNARY_MISH, BF16, BF16 ):
|
||||
case _PACK_SELECT_KEY( UNARY_ROUND, BF16, BF16 ):
|
||||
case _PACK_SELECT_KEY( UNARY_GELU, BF16, BF16 ):
|
||||
case _PACK_SELECT_KEY( UNARY_HGELU, BF16, BF16 ):
|
||||
{
|
||||
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
|
||||
0x11111111, // TCfg
|
||||
|
|
@ -682,6 +726,7 @@ REGISTER_ELTWISE_UNARY_BACKEND_EVIS( neg, UNARY_NEG )
|
|||
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_sigmoid, UNARY_HSIGMOID )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( mish, UNARY_MISH )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( round, UNARY_ROUND )
|
||||
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( gelu, UNARY_GELU )
|
||||
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_gelu, UNARY_HGELU )
|
||||
|
||||
__END_DECLS
|
||||
|
|
|
|||
|
|
@ -0,0 +1,243 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
|
||||
// Add kernel hashtable here
|
||||
#define EXTRA_ENDING_HASH_KEY( OUT_DTYPE ) \
|
||||
( ( OUT_DTYPE ) )
|
||||
#define EXTRA_ENDING_KERNEL_MAP( OUT_DTYPE ) \
|
||||
{ EXTRA_ENDING_HASH_KEY( OUT_DTYPE ), \
|
||||
CVIVANTE_NAMESPACE("evis.extra_ending_"#OUT_DTYPE), \
|
||||
"extra_ending" }
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint32_t key;
|
||||
char * function_name;
|
||||
const char * source_name;
|
||||
} _kernel_map_type;
|
||||
|
||||
static const _kernel_map_type _extra_ending_kernel_map[] =
|
||||
{
|
||||
// Register kernel here
|
||||
EXTRA_ENDING_KERNEL_MAP( F16 ),
|
||||
EXTRA_ENDING_KERNEL_MAP( I16 ),
|
||||
EXTRA_ENDING_KERNEL_MAP( U8 ),
|
||||
EXTRA_ENDING_KERNEL_MAP( I8 ),
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _extra_ending_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _EXTRA_ENDING_PARAM_NUM _cnt_of_array( _extra_ending_kernel_param_def )
|
||||
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_extra_ending_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t gpu_param = {
|
||||
3,
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0}
|
||||
};
|
||||
vsi_nn_kernel_tensor_attr_t * attr = NULL;
|
||||
vsi_int_array_t * out_shape = NULL;
|
||||
|
||||
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
|
||||
out_shape = attr->shape;
|
||||
|
||||
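/* Work distribution: each work-item handles 8 elements along dimension 0; the
 * global size below is the shape taken from param[1], rounded up accordingly. */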
gpu_param.global_scale[0] = 8;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
gpu_param.global_size[0] = (out_shape->data[0] + gpu_param.global_scale[0] - 1)
|
||||
/ gpu_param.global_scale[0];
|
||||
gpu_param.global_size[1] = out_shape->data[1];
|
||||
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
final:
|
||||
if (attr)
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr );
|
||||
attr = NULL;
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _extra_ending_initializer() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_dtype_e out_dtype;
|
||||
uint32_t key = 0;
|
||||
uint32_t i = 0;
|
||||
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
key = EXTRA_ENDING_HASH_KEY( out_dtype );
|
||||
|
||||
for ( i = 0; i < _cnt_of_array(_extra_ending_kernel_map); i ++ )
|
||||
{
|
||||
if ( _extra_ending_kernel_map[i].key == key )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( i < _cnt_of_array(_extra_ending_kernel_map) )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _extra_ending_kernel_map[i].function_name );
|
||||
kernel->info.parameters = _extra_ending_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _extra_ending_kernel_param_def );
|
||||
kernel->info.initialize = _extra_ending_initializer;
|
||||
// Register code source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
|
||||
"vsi_nn_kernel_header",
|
||||
_extra_ending_kernel_map[i].source_name );
|
||||
// Register binary source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
_extra_ending_kernel_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_EXTRA_ENDING_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
uint32_t rank[3] = {0};
|
||||
int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }};
|
||||
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
|
||||
int32_t i = 0;
|
||||
|
||||
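/* Collapse both inputs and the output to optimized 1-D shapes before building the
 * node; the reshaped tensor views are released in the final block. */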
vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num,
|
||||
shapes[0], &rank[0]);
|
||||
vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num,
|
||||
shapes[1], &rank[1]);
|
||||
vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num,
|
||||
shapes[2], &rank[2]);
|
||||
|
||||
for (i = 0; i < 2; i++)
|
||||
{
|
||||
reshape_tensors[i] = vsi_nn_reshape_tensor( graph,
|
||||
inputs[i], (uint32_t*)shapes[i], rank[i] );
|
||||
}
|
||||
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
|
||||
outputs[0], (uint32_t*)shapes[2], rank[2] );
|
||||
|
||||
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size,
|
||||
inputs[0]->attr.dim_num ) )
|
||||
{
|
||||
goto final;
|
||||
}
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
vx_border_t border;
|
||||
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = 0;
|
||||
status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border));
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _EXTRA_ENDING_PARAM_NUM,
|
||||
reshape_tensors, input_num, &reshape_tensors[2], output_num );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _EXTRA_ENDING_PARAM_NUM );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
}
|
||||
|
||||
final:
|
||||
for (i = 0; i < 3; i++)
|
||||
{
|
||||
vsi_safe_release_tensor(reshape_tensors[i]);
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_EVIS( extra_ending, _setup )
|
||||
|
|
@ -991,8 +991,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
|
|||
int32_t _is_ln = 0;
|
||||
int32_t _is_cifg = 0;
|
||||
int32_t _is_hybrid = 0;
|
||||
vsi_nn_kernel_tensor_attr_t* input_attr[9];
|
||||
vsi_nn_kernel_tensor_attr_t* attr[2];
|
||||
vsi_nn_kernel_tensor_attr_t* input_attr[9] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL};
|
||||
|
||||
status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[param_size - 5], &_is_ln );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
|
|
|||
|
|
@ -44,6 +44,8 @@ __BEGIN_DECLS
|
|||
#define KERNEL_SOURCE_3 "moments_axis2"
|
||||
#define KERNEL_SOURCE_4 "moments_axis01"
|
||||
#define KERNEL_SOURCE_5 "moments_axis012"
|
||||
#define KERNEL_SOURCE_6 "moments_u8"
|
||||
#define KERNEL_SOURCE_7 "moments_u8_axis012"
|
||||
|
||||
// Add kernel hashtable here
|
||||
#define HASH_MOMENTS_KEY(_input0_type, _output_type, _axis_num, _axis0, _axis1, _axis2, _image_2d) \
|
||||
|
|
@ -107,14 +109,19 @@ static const struct {
|
|||
TENSOR_MOMENTS_KERNELS(I8, F16, 2, KERNEL_SOURCE_3)
|
||||
TENSOR_MOMENTS_KERNELS(I16, F16, 2, KERNEL_SOURCE_3)
|
||||
TENSOR_MOMENTS_KERNELS(F16, F16, 2, KERNEL_SOURCE_3)
|
||||
TENSOR_MOMENTS_KERNELS(U8, U8, 0, KERNEL_SOURCE_6)
|
||||
TENSOR_MOMENTS_KERNELS(U8, U8, 1, KERNEL_SOURCE_6)
|
||||
TENSOR_MOMENTS_KERNELS(U8, U8, 2, KERNEL_SOURCE_6)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, F16, 0, 1, KERNEL_SOURCE_4)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS(I8, F16, 0, 1, KERNEL_SOURCE_4)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS(I16, F16, 0, 1, KERNEL_SOURCE_4)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS(F16, F16, 0, 1, KERNEL_SOURCE_4)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, U8, 0, 1, KERNEL_SOURCE_6)
|
||||
TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, F16, 0, 1, 2, KERNEL_SOURCE_5)
|
||||
TENSOR_MOMENTS_THREE_AXIS_KERNELS(I8, F16, 0, 1, 2, KERNEL_SOURCE_5)
|
||||
TENSOR_MOMENTS_THREE_AXIS_KERNELS(I16, F16, 0, 1, 2, KERNEL_SOURCE_5)
|
||||
TENSOR_MOMENTS_THREE_AXIS_KERNELS(F16, F16, 0, 1, 2, KERNEL_SOURCE_5)
|
||||
TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, U8, 0, 1, 2, KERNEL_SOURCE_7)
|
||||
TENSOR_MOMENTS_KERNELS_2D(U8, F16, 0, KERNEL_SOURCE_1)
|
||||
TENSOR_MOMENTS_KERNELS_2D(I8, F16, 0, KERNEL_SOURCE_1)
|
||||
TENSOR_MOMENTS_KERNELS_2D(I16, F16, 0, KERNEL_SOURCE_1)
|
||||
|
|
@ -123,10 +130,13 @@ static const struct {
|
|||
TENSOR_MOMENTS_KERNELS_2D(I8, F16, 1, KERNEL_SOURCE_2)
|
||||
TENSOR_MOMENTS_KERNELS_2D(I16, F16, 1, KERNEL_SOURCE_2)
|
||||
TENSOR_MOMENTS_KERNELS_2D(F16, F16, 1, KERNEL_SOURCE_2)
|
||||
TENSOR_MOMENTS_KERNELS_2D(U8, U8, 0, KERNEL_SOURCE_6)
|
||||
TENSOR_MOMENTS_KERNELS_2D(U8, U8, 1, KERNEL_SOURCE_6)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(U8, F16, 0, 1, KERNEL_SOURCE_4)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(I8, F16, 0, 1, KERNEL_SOURCE_4)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(I16, F16, 0, 1, KERNEL_SOURCE_4)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(F16, F16, 0, 1, KERNEL_SOURCE_4)
|
||||
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(U8, U8, 0, 1, KERNEL_SOURCE_6)
|
||||
};
|
||||
|
||||
/*
|
||||
|
|
@ -179,31 +189,41 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
vsi_nn_kernel_tensor_attr_t* attr[1] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL, NULL};
|
||||
vsi_int_array_t * input_shape = NULL;
|
||||
float scaleIn = 0;
|
||||
int32_t input_zp = 0;
|
||||
vx_uint32 iter = 0;
|
||||
int32_t sumInZp = 0;
|
||||
int32_t tmpZp1 = 0;
|
||||
float tmpZp2 = 0;
|
||||
float e2InScale = 0;
|
||||
float rowSumScale = 0;
|
||||
int32_t axis = 0;
|
||||
int32_t axis_num = 0;
|
||||
int32_t width = 0;
|
||||
int32_t height = 0;
|
||||
int32_t chn = 0;
|
||||
float dimRatio = 1.0;
|
||||
int32_t iterSize = 16;
|
||||
float zpScaleSqr_i16 = 0.0f;
|
||||
float zpScale2_i16 = 0.0f;
|
||||
float sumScale_i16 = 0.0f;
|
||||
float scaleIn = 0;
|
||||
int32_t input_zp = 0;
|
||||
vx_uint32 iter = 0;
|
||||
int32_t sumInZp = 0;
|
||||
int32_t tmpZp1 = 0;
|
||||
float tmpZp2 = 0;
|
||||
float e2InScale = 0;
|
||||
float rowSumScale = 0;
|
||||
int32_t axis = 0;
|
||||
int32_t axis_num = 0;
|
||||
int32_t width = 0;
|
||||
int32_t height = 0;
|
||||
int32_t chn = 0;
|
||||
float dimRatio = 1.0;
|
||||
int32_t iterSize = 16;
|
||||
float zpScaleSqr_i16 = 0.0f;
|
||||
float zpScale2_i16 = 0.0f;
|
||||
float sumScale_i16 = 0.0f;
|
||||
float output_ZP[4] = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||
float outputScale[4] = {1.0f, 1.0f, 1.0f, 1.0f};
|
||||
float output_ZP0 = 0.0f;
|
||||
float outputScale0 = 1;
|
||||
float output_ZP1 = 0.0f;
|
||||
float outputScale1 = 1.0f;
|
||||
|
||||
uint32_t pack_key = 0;
|
||||
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
|
||||
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
|
||||
attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
|
||||
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );
|
||||
|
||||
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
|
|
@ -212,10 +232,13 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
|
||||
input_shape = attr[0]->shape;
|
||||
input_zp = attr[0]->asymm.zero_point;
|
||||
scaleIn = attr[0]->asymm.scale;
|
||||
|
||||
if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
|
||||
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
|
||||
{
|
||||
input_zp = attr[0]->asymm.zero_point;
|
||||
scaleIn = attr[0]->asymm.scale;
|
||||
}
|
||||
else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
|
||||
{
|
||||
if (attr[0]->dfp.fl > 0)
|
||||
{
|
||||
|
|
@ -234,6 +257,57 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
scaleIn = 1;
|
||||
}
|
||||
|
||||
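/* Output scale/zero-point selection for the two outputs: asymmetric quantization
 * uses the zero point and 1/scale, DFP uses 2^fl as the scale (e.g. fl = 7 gives
 * 128, fl = -2 gives 0.25) with zero point 0, and non-quantized outputs keep the
 * identity values of 1 and 0. */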
if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
|
||||
{
|
||||
output_ZP0 = (float)attr[1]->asymm.zero_point;
|
||||
outputScale0 = 1.0f / attr[1]->asymm.scale;
|
||||
}
|
||||
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
|
||||
{
|
||||
if (attr[1]->dfp.fl > 0)
|
||||
{
|
||||
outputScale0 = (float)((int64_t)1 << attr[1]->dfp.fl);
|
||||
}
|
||||
else
|
||||
{
|
||||
outputScale0 = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl));
|
||||
}
|
||||
output_ZP0 = 0.0f;
|
||||
}
|
||||
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE )
|
||||
{
|
||||
outputScale0 = 1.0f;
|
||||
output_ZP0 = 0.0f;
|
||||
}
|
||||
|
||||
if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
|
||||
{
|
||||
output_ZP1 = (float)attr[2]->asymm.zero_point;
|
||||
outputScale1 = 1.0f / attr[2]->asymm.scale;
|
||||
}
|
||||
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP )
|
||||
{
|
||||
if (attr[2]->dfp.fl > 0)
|
||||
{
|
||||
outputScale1 = (float)((int64_t)1 << attr[2]->dfp.fl);
|
||||
}
|
||||
else
|
||||
{
|
||||
outputScale1 = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
|
||||
}
|
||||
output_ZP1 = 0.0f;
|
||||
}
|
||||
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE )
|
||||
{
|
||||
outputScale1 = 1.0f;
|
||||
output_ZP1 = 0.0f;
|
||||
}
|
||||
|
||||
output_ZP[0] = output_ZP0;
|
||||
output_ZP[1] = output_ZP1;
|
||||
outputScale[0] = outputScale0;
|
||||
outputScale[1] = outputScale1;
|
||||
|
||||
if(attr[0]->dtype == I16)
|
||||
{
|
||||
iterSize = 8;
|
||||
|
|
@ -316,10 +390,10 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
zpScale2_i16 = tmpZp1 * e2InScale;
|
||||
sumScale_i16 = sumInZp * scaleIn;
|
||||
|
||||
#define _PACK_SELECT_KEY( IN0_TYPE, AXIS_NUM, FIRST_AXIS ) \
|
||||
(IN0_TYPE | (AXIS_NUM << 8) | (FIRST_AXIS << 16))
|
||||
#define _PACK_SELECT_KEY( IN0_TYPE, OUT0_TYPE, AXIS_NUM, FIRST_AXIS ) \
|
||||
(IN0_TYPE | (OUT0_TYPE << 8) | (AXIS_NUM << 16) | (FIRST_AXIS << 24))
|
||||
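/* The pack key encodes input dtype, output dtype, axis count and first axis, so
 * that U8-output variants are dispatched separately from the F16-output ones in
 * the switch below. */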
|
||||
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, axis_num, axis);
|
||||
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, axis_num, axis);
|
||||
|
||||
{
|
||||
gpu_dp_inst_t uniSumU8_16x1 = {{
|
||||
|
|
@ -377,11 +451,22 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
|
||||
gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{
|
||||
0x33333333, // TCfg
|
||||
0x11110000, // ASelt
|
||||
0x03020100, 0x03020100, // ABin
|
||||
0x00000000, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00002400, // AccumType, ConstantType, and PostShift
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
|
||||
switch( pack_key )
|
||||
{
|
||||
case _PACK_SELECT_KEY( U8, 1, 0):
|
||||
case _PACK_SELECT_KEY( I8, 1, 0):
|
||||
case _PACK_SELECT_KEY( I16, 1, 0):
|
||||
case _PACK_SELECT_KEY( U8, F16, 1, 0):
|
||||
case _PACK_SELECT_KEY( I8, F16, 1, 0):
|
||||
case _PACK_SELECT_KEY( I16, F16, 1, 0):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1);
|
||||
|
|
@ -395,22 +480,28 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
status |= vsi_nn_kernel_gpu_add_param(node, "zpScale2_i16", &zpScale2_i16);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "sumScale_i16", &sumScale_i16);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( F16, 1, 0):
|
||||
case _PACK_SELECT_KEY( F16, F16, 1, 0):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( U8, 1, 1):
|
||||
case _PACK_SELECT_KEY( I8, 1, 1):
|
||||
case _PACK_SELECT_KEY( I16, 1, 1):
|
||||
case _PACK_SELECT_KEY( U8, F16, 1, 1):
|
||||
case _PACK_SELECT_KEY( I8, F16, 1, 1):
|
||||
case _PACK_SELECT_KEY( I16, F16, 1, 1):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4",
|
||||
&uniConvert1stUint8SubZpToFp32_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
|
||||
|
|
@ -418,19 +509,23 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( F16, 1, 1):
|
||||
case _PACK_SELECT_KEY( F16, F16, 1, 1):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( U8, 1, 2):
|
||||
case _PACK_SELECT_KEY( I8, 1, 2):
|
||||
case _PACK_SELECT_KEY( I16, 1, 2):
|
||||
case _PACK_SELECT_KEY( U8, F16, 1, 2):
|
||||
case _PACK_SELECT_KEY( I8, F16, 1, 2):
|
||||
case _PACK_SELECT_KEY( I16, F16, 1, 2):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4",
|
||||
&uniConvert1stUint8SubZpToFp32_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
|
||||
|
|
@ -438,16 +533,18 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( F16, 1, 2):
|
||||
case _PACK_SELECT_KEY( F16, F16, 1, 2):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( U8, 2, 0):
|
||||
case _PACK_SELECT_KEY( I8, 2, 0):
|
||||
case _PACK_SELECT_KEY( I16, 2, 0):
|
||||
case _PACK_SELECT_KEY( U8, F16, 2, 0):
|
||||
case _PACK_SELECT_KEY( I8, F16, 2, 0):
|
||||
case _PACK_SELECT_KEY( I16, F16, 2, 0):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1);
|
||||
|
|
@ -462,12 +559,14 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
status |= vsi_nn_kernel_gpu_add_param(node, "zpScale2_i16", &zpScale2_i16);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "sumScale_i16", &sumScale_i16);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( U8, 3, 0):
|
||||
case _PACK_SELECT_KEY( I8, 3, 0):
|
||||
case _PACK_SELECT_KEY( I16, 3, 0):
|
||||
case _PACK_SELECT_KEY( U8, F16, 3, 0):
|
||||
case _PACK_SELECT_KEY( I8, F16, 3, 0):
|
||||
case _PACK_SELECT_KEY( I16, F16, 3, 0):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1);
|
||||
|
|
@ -483,32 +582,85 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
|
|||
status |= vsi_nn_kernel_gpu_add_param(node, "zpScale2_i16", &zpScale2_i16);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "sumScale_i16", &sumScale_i16);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( F16, 2, 0):
|
||||
case _PACK_SELECT_KEY( F16, F16, 2, 0):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( F16, 3, 0):
|
||||
case _PACK_SELECT_KEY( F16, F16, 3, 0):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8",
|
||||
&uniConvertHalftoFp16_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( U8, U8, 1, 0):
|
||||
case _PACK_SELECT_KEY( U8, U8, 1, 1):
|
||||
case _PACK_SELECT_KEY( U8, U8, 1, 2):
|
||||
case _PACK_SELECT_KEY( U8, U8, 2, 0):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8",
|
||||
&uniConvertInt32toUint8_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4",
|
||||
&uniConvert1stUint8SubZpToFp32_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_ZP);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP0", &output_ZP0);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale0", &outputScale0);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP1", &output_ZP1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale1", &outputScale1);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
case _PACK_SELECT_KEY( U8, U8, 3, 0):
|
||||
{
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8",
|
||||
&uniConvertInt32toUint8_2x8);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_ZP);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &outputScale);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
break;
|
||||
default:
|
||||
VSI_ASSERT( FALSE );
|
||||
break;
|
||||
}
|
||||
status = vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", &uniConvertHalftoFp16_2x8);
|
||||
CHECK_STATUS_FAIL_GOTO(status, OnError );
|
||||
}
|
||||
#undef _PACK_SELECT_KEY
|
||||
|
|
@ -519,6 +671,16 @@ OnError:
|
|||
vsi_nn_kernel_tensor_attr_release( &attr[0] );
|
||||
attr[0] = NULL;
|
||||
}
|
||||
if (attr[1])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[1] );
|
||||
attr[1] = NULL;
|
||||
}
|
||||
if (attr[2])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[2] );
|
||||
attr[2] = NULL;
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
|
|
@ -0,0 +1,292 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
#define SIGNAL_FRAME_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
|
||||
( ( IN_DTYPE << 8 ) | ( OUT_DTYPE ) )
|
||||
#define SIGNAL_FRAME_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
|
||||
{ SIGNAL_FRAME_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
|
||||
CVIVANTE_NAMESPACE("evis.signal_frame_"#IN_DTYPE"to"#OUT_DTYPE), \
|
||||
"signal_frame" }
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint32_t key;
|
||||
char * function_name;
|
||||
const char * source_name;
|
||||
} _kernel_map_type;
|
||||
|
||||
static const _kernel_map_type _signal_frame_kernel_map[] =
|
||||
{
|
||||
// Register kernel here
|
||||
SIGNAL_FRAME_KERNEL_MAP( I16, I16 ),
|
||||
SIGNAL_FRAME_KERNEL_MAP( F16, F16 ),
|
||||
SIGNAL_FRAME_KERNEL_MAP( BF16, BF16 ),
|
||||
SIGNAL_FRAME_KERNEL_MAP( U8, U8 ),
|
||||
SIGNAL_FRAME_KERNEL_MAP( I8, I8 ),
|
||||
};
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _signal_frame_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
// Add kernel parameters here
|
||||
};
|
||||
#define _SIGNAL_FRAME_PARAM_NUM _cnt_of_array( _signal_frame_kernel_param_def )
|
||||
#define FRAME_STEP (2)
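/* FRAME_STEP is the index of the frame_step scalar within _signal_frame_kernel_param_def. */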
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_signal_frame_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t gpu_param = {
|
||||
3,
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0}
|
||||
};
|
||||
vsi_nn_kernel_tensor_attr_t * attr = NULL;
|
||||
vsi_int_array_t * out_shape = NULL;
|
||||
|
||||
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
|
||||
out_shape = attr->shape;
|
||||
|
||||
gpu_param.global_scale[0] = 16;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
if ( attr->dtype == F16 || attr->dtype == I16 || attr->dtype == U16 || attr->dtype == BF16)
|
||||
{
|
||||
gpu_param.global_scale[0] = 8;
|
||||
}
|
||||
gpu_param.global_size[0] = (out_shape->data[0] + gpu_param.global_scale[0] - 1)
|
||||
/ gpu_param.global_scale[0];
|
||||
gpu_param.global_size[1] = out_shape->data[1];
|
||||
gpu_param.global_size[2] = out_shape->data[2];
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
final:
|
||||
if (attr)
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr );
|
||||
attr = NULL;
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _signal_frame_initializer() */
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_dtype_e in_dtype;
|
||||
vsi_nn_kernel_dtype_e out_dtype;
|
||||
uint32_t key = 0;
|
||||
uint32_t i = 0;
|
||||
|
||||
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
key = SIGNAL_FRAME_HASH_KEY( in_dtype, out_dtype );
|
||||
|
||||
for ( i = 0; i < _cnt_of_array(_signal_frame_kernel_map); i ++ )
|
||||
{
|
||||
if ( _signal_frame_kernel_map[i].key == key )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( i < _cnt_of_array(_signal_frame_kernel_map) )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _signal_frame_kernel_map[i].function_name );
|
||||
kernel->info.parameters = _signal_frame_kernel_param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _signal_frame_kernel_param_def );
|
||||
kernel->info.initialize = _signal_frame_initializer;
|
||||
// Register code source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
|
||||
"vsi_nn_kernel_header",
|
||||
_signal_frame_kernel_map[i].source_name );
|
||||
// Register binary source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
_signal_frame_kernel_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_SIGNAL_FRAME_PARAM_NUM] = {NULL};
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
int32_t frame_length = vsi_nn_kernel_param_get_int32( params, "frame_length" );
|
||||
int32_t frame_step = vsi_nn_kernel_param_get_int32( params, "frame_step" );
|
||||
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
|
||||
int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" );
|
||||
float pad_value = vsi_nn_kernel_param_get_float32( params, "pad_val" );
|
||||
int32_t num_frames = outputs[0]->attr.size[axis + 1];
|
||||
int32_t rank = inputs[0]->attr.dim_num;
|
||||
int32_t inner = 1;
|
||||
int32_t outer = 1;
|
||||
int32_t length_samples = inputs[0]->attr.size[axis];
|
||||
int32_t i = 0;
|
||||
vsi_nn_tensor_t* rs_tensors[2] = { NULL };
|
||||
int32_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }};
|
||||
|
||||
for (i = 0; i < axis; i++)
|
||||
{
|
||||
inner *= inputs[0]->attr.size[i];
|
||||
}
|
||||
|
||||
for (i = axis + 1; i < rank; i++)
|
||||
{
|
||||
outer *= inputs[0]->attr.size[i];
|
||||
}
|
||||
|
||||
shape[0][0] = inner;
|
||||
shape[0][1] = length_samples;
|
||||
shape[0][2] = 1;
|
||||
shape[0][3] = outer;
|
||||
|
||||
shape[1][0] = inner;
|
||||
shape[1][1] = frame_length;
|
||||
shape[1][2] = num_frames;
|
||||
shape[1][3] = outer;
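/* The input is flattened to [inner, length_samples, 1, outer] and the output to
 * [inner, frame_length, num_frames, outer], where inner/outer are the products of
 * the dimensions below/above the frame axis. */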
|
||||
|
||||
rs_tensors[0] = vsi_nn_reshape_tensor( graph,
|
||||
inputs[0], (uint32_t*)shape[0], 4 );
|
||||
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
|
||||
outputs[0], (uint32_t*)shape[1], 4 );
|
||||
|
||||
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[1]->attr.size,
|
||||
rs_tensors[1]->attr.dim_num ) )
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
if ( pad_end )
|
||||
{
|
||||
// Set default border mode.
|
||||
vx_border_t border;
|
||||
uint32_t data = 0;
|
||||
uint32_t dsize = 1;
|
||||
|
||||
vsi_nn_Float32ToDtype(pad_value, (uint8_t*)&data, &outputs[0]->attr.dtype);
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
dsize = vsi_nn_GetTypeBytes( inputs[0]->attr.dtype.vx_type );
|
||||
if ( dsize == 1 )
|
||||
{
|
||||
border.constant_value.U8 = (uint8_t)data;
|
||||
}
|
||||
else if ( dsize == 4 )
|
||||
{
|
||||
border.constant_value.U32 = data;
|
||||
}
|
||||
else
|
||||
{
|
||||
border.constant_value.U16 = (uint16_t)data;
|
||||
}
|
||||
|
||||
status |= vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
|
||||
}
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _SIGNAL_FRAME_PARAM_NUM,
|
||||
&rs_tensors[0], input_num, &rs_tensors[1], output_num );
|
||||
node_params[FRAME_STEP] = vsi_nn_kernel_scalar_create(
|
||||
graph, I32, &frame_step );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _SIGNAL_FRAME_PARAM_NUM );
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
}
|
||||
}
|
||||
final:
|
||||
if (rs_tensors[0])
|
||||
{
|
||||
vsi_nn_ReleaseTensor( &rs_tensors[0] );
|
||||
}
|
||||
|
||||
if (rs_tensors[1])
|
||||
{
|
||||
vsi_nn_ReleaseTensor( &rs_tensors[1] );
|
||||
}
|
||||
|
||||
if (node_params[FRAME_STEP])
|
||||
{
|
||||
vsi_nn_kernel_scalar_release( &node_params[FRAME_STEP] );
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_EVIS( signal_frame, _setup )
|
||||
|
|
@ -22,7 +22,6 @@
|
|||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
|
@ -40,7 +39,6 @@
|
|||
|
||||
__BEGIN_DECLS
|
||||
|
||||
|
||||
#define _SLICE_KERNEL_SOURCE "slice"
|
||||
#define _SLICE_KERNEL_NAME CVIVANTE_NAMESPACE("evis.slice")
|
||||
|
||||
|
|
@ -379,7 +377,6 @@ static vsi_status _query_kernel
|
|||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
|
|
@ -421,7 +418,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size,
|
||||
reshape_tensors[0]->attr.dim_num ) || input_batch != output_batch )
|
||||
{
|
||||
return NULL;
|
||||
goto final;
|
||||
}
|
||||
|
||||
image_2d = (rank[0] < 3 || shapes[0][2] == 1);
|
||||
|
|
@ -443,6 +440,12 @@ static vsi_nn_kernel_node_t _setup
|
|||
}
|
||||
}
|
||||
|
||||
final:
|
||||
for (i = 0; i < _IO_NUM; i++)
|
||||
{
|
||||
vsi_safe_release_tensor(reshape_tensors[i]);
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,641 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
typedef enum
|
||||
{
|
||||
INTERNAL_KERNEL_GET_MATRIX,
|
||||
INTERNAL_KERNEL_WARP_AFFINE,
|
||||
} _internal_kernel_e;
|
||||
|
||||
#define _GET_MATRIX_SOURCE "get_matrix"
|
||||
#define _WARP_AFFINE_SOURCE "warp_affine"
|
||||
|
||||
// Add kernel hashtable here
|
||||
#define GET_MATRIX_HASH_KEY( IN1_DTYPE, OUT_DTYPE ) \
|
||||
(( IN1_DTYPE << 8 ) | ( OUT_DTYPE ))
|
||||
#define GET_MATRIX_KERNEL_MAP( IN1_DTYPE, OUT_DTYPE ) \
|
||||
{ GET_MATRIX_HASH_KEY( IN1_DTYPE, OUT_DTYPE ), \
|
||||
CVIVANTE_NAMESPACE("evis.get_matrix_"#IN1_DTYPE"toF32"), \
|
||||
_GET_MATRIX_SOURCE }
|
||||
|
||||
#define WARP_AFFINE_HASH_KEY( IN0_DTYPE, OUT_DTYPE ) \
|
||||
(( IN0_DTYPE << 8 ) | ( OUT_DTYPE ))
|
||||
#define WARP_AFFINE_KERNEL_MAP( IN0_DTYPE, OUT_DTYPE ) \
|
||||
{ WARP_AFFINE_HASH_KEY( IN0_DTYPE, OUT_DTYPE ), \
|
||||
CVIVANTE_NAMESPACE("evis.warp_affine_"#IN0_DTYPE"to"#OUT_DTYPE), \
|
||||
_WARP_AFFINE_SOURCE }
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint32_t key;
|
||||
char * function_name;
|
||||
const char * source_name;
|
||||
} _kernel_map_type;
|
||||
|
||||
static const _kernel_map_type _get_matrix_kernel_map[] =
|
||||
{
|
||||
// Register kernel here
|
||||
GET_MATRIX_KERNEL_MAP( F16, F32 ),
|
||||
GET_MATRIX_KERNEL_MAP( I16, F32 ),
|
||||
GET_MATRIX_KERNEL_MAP( U8, F32 ),
|
||||
GET_MATRIX_KERNEL_MAP( I8, F32 ),
|
||||
};
|
||||
|
||||
static const _kernel_map_type _warp_affine_kernel_map[] =
|
||||
{
|
||||
// Register kernel here
|
||||
WARP_AFFINE_KERNEL_MAP( F16, F16 ),
|
||||
WARP_AFFINE_KERNEL_MAP( U8, U8 ),
|
||||
};
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _get_matrix_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
// Add kernel parameters here
|
||||
};
|
||||
#define _GET_MATRIX_PARAM_NUM _cnt_of_array( _get_matrix_kernel_param_def )
|
||||
#define HAS_THETA_1_1 (2)
|
||||
#define HAS_THETA_1_2 (3)
|
||||
#define HAS_THETA_1_3 (4)
|
||||
#define HAS_THETA_2_1 (5)
|
||||
#define HAS_THETA_2_2 (6)
|
||||
#define HAS_THETA_2_3 (7)
|
||||
#define THETA_1_1 (8)
|
||||
#define THETA_1_2 (9)
|
||||
#define THETA_1_3 (10)
|
||||
#define THETA_2_1 (11)
|
||||
#define THETA_2_2 (12)
|
||||
#define THETA_2_3 (13)
|
||||
#define I_WIDTH (14)
|
||||
#define I_HEIGHT (15)
|
||||
#define O_WIDTH (16)
|
||||
#define O_HEIGHT (17)
|
||||
|
||||
static vx_param_description_t _warp_affine_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
// Add kernel parameters here
|
||||
};
|
||||
#define _WARP_AFFINE_PARAM_NUM _cnt_of_array( _warp_affine_kernel_param_def )
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_get_matrix_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t gpu_param = {
|
||||
2,
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0}
|
||||
};
|
||||
vsi_nn_kernel_tensor_attr_t * attr = NULL;
|
||||
vsi_int_array_t * in_shape = NULL;
|
||||
float theta[8] = {0};
|
||||
float input_scale = 1.0f;
|
||||
float input_tail = 0;
|
||||
float input_w = 1.0f;
|
||||
float input_h = 1.0f;
|
||||
float output_w = 1.0f;
|
||||
float output_h = 1.0f;
|
||||
float scale[4] = {0};
|
||||
|
||||
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
|
||||
|
||||
if ( attr->quant == VSI_NN_KERNEL_QUANT_DFP )
|
||||
{
|
||||
int32_t fl = attr->dfp.fl;
|
||||
if (fl > 0)
|
||||
{
|
||||
input_scale = 1.0f / (float) ((int64_t)1 << fl);
|
||||
}
|
||||
else
|
||||
{
|
||||
input_scale = (float)((int64_t)1 << -fl);
|
||||
}
|
||||
}
|
||||
else if ( attr->quant == VSI_NN_KERNEL_QUANT_ASYMM )
|
||||
{
|
||||
input_scale = attr->asymm.scale;
|
||||
input_tail = 0 - attr->asymm.zero_point * input_scale;
|
||||
}
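/* At this point input_scale/input_tail describe the dequantization: DFP tensors use
 * scale = 2^-fl, asymmetric tensors use real = q * input_scale + input_tail with
 * input_tail = -zero_point * input_scale. */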
|
||||
|
||||
in_shape = attr->shape;
|
||||
|
||||
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_1], &theta[0]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_2], &theta[1]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_1_3], &theta[2]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_1], &theta[4]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_2], &theta[5]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[THETA_2_3], &theta[6]);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[I_WIDTH], &input_w);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[I_HEIGHT], &input_h);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[O_WIDTH], &output_w);
|
||||
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[O_HEIGHT], &output_h);
|
||||
CHECK_STATUS_FAIL_GOTO( status, final );
|
||||
|
||||
scale[0] = input_w / output_w;
|
||||
scale[1] = input_h / output_h;
|
||||
scale[2] = input_w / output_h;
|
||||
scale[3] = input_h / output_w;
|
||||
|
||||
gpu_param.global_scale[0] = 1;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_size[0] = 1;
|
||||
gpu_param.global_size[1] = in_shape->data[1];
|
||||
|
||||
status = vsi_nn_kernel_gpu_add_param( node,
|
||||
"theta_1", &theta[0] );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"theta_2", &theta[4] );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"scale", &scale );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"input_scale", &input_scale );
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"input_tail", &input_tail );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
final:
|
||||
if (attr)
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr );
|
||||
attr = NULL;
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _get_matrix_initializer() */
|
||||
|
||||
DEF_KERNEL_INITIALIZER(_warp_affine_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t gpu_param = {
|
||||
3,
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0}
|
||||
};
|
||||
vsi_nn_kernel_tensor_attr_t * attr[2] = {NULL};
|
||||
vsi_int_array_t * out_shape = NULL;
|
||||
float input_scale = 1.0f;
|
||||
float input_tail = 0;
|
||||
float output_scale = 1.0f;
|
||||
float output_zp = 0;
|
||||
|
||||
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
|
||||
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
|
||||
|
||||
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
|
||||
{
|
||||
int32_t fl = attr[0]->dfp.fl;
|
||||
if (fl > 0)
|
||||
{
|
||||
input_scale = 1.0f / (float) ((int64_t)1 << fl);
|
||||
}
|
||||
else
|
||||
{
|
||||
input_scale = (float)((int64_t)1 << -fl);
|
||||
}
|
||||
}
|
||||
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
|
||||
{
|
||||
input_scale = attr[0]->asymm.scale;
|
||||
input_tail = 0 - attr[0]->asymm.zero_point * input_scale;
|
||||
}
|
||||
|
||||
if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP)
|
||||
{
|
||||
int32_t fl = attr[1]->dfp.fl;
|
||||
|
||||
if (fl >= 0)
|
||||
{
|
||||
output_scale = (vx_float32) ((vx_int64)1 << fl);
|
||||
}
|
||||
else if (fl < 0)
|
||||
{
|
||||
output_scale = 1.0f / (vx_float32) ((vx_int64)1 << -fl);
|
||||
}
|
||||
}
|
||||
else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
|
||||
{
|
||||
output_scale = 1.0f / attr[1]->asymm.scale;
|
||||
output_zp = (float)attr[1]->asymm.zero_point;
|
||||
}
|
||||
|
||||
out_shape = attr[1]->shape;
|
||||
|
||||
gpu_param.global_scale[0] = 2;
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
gpu_param.global_size[0] = gpu_align_p2(
|
||||
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
|
||||
/ gpu_param.global_scale[0], 4);
|
||||
gpu_param.global_size[1] = out_shape->data[1];
|
||||
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
|
||||
|
||||
do
|
||||
{
|
||||
gpu_dp_inst_t uniConvertDatatoF32_0_4x4 = {{
|
||||
0x01010101, // TCfg
|
||||
0x01010000, // ASelt
|
||||
0x00010000, 0x00010000, // ABin
|
||||
0x02020202, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000100, // AccumType, ConstantType, and PostShift
|
||||
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
|
||||
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
gpu_dp_inst_t uniConvertDatatoF32_1_4x4 = {{
|
||||
0x01010101, // TCfg
|
||||
0x01010000, // ASelt
|
||||
0x00030002, 0x00030002, // ABin
|
||||
0x02020202, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000100, // AccumType, ConstantType, and PostShift
|
||||
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
|
||||
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16};
|
||||
gpu_dp_inst_t uniExtractHalf8_2x8 = {{
|
||||
0x11111111, // TCfg
|
||||
0x11110000, // ASelt
|
||||
0x06040200, 0x06040200, // ABin
|
||||
0x22222222, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000100, // AccumType, ConstantType, and PostShift
|
||||
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
|
||||
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
gpu_dp_inst_t uniExtractInteger_2x8 = {{
|
||||
0x33333333, // TCfg
|
||||
0x11110000, // ASelt
|
||||
0x03020100, 0x03020100, // ABin
|
||||
0x00000000, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00002400, // AccumType, ConstantType, and PostShift
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
|
||||
}, GPU_DP_TYPE_16 };
|
||||
|
||||
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertDatatoF32_0_4x4", &uniConvertDatatoF32_0_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertDatatoF32_1_4x4", &uniConvertDatatoF32_1_4x4);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &input_scale);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "input_tail", &input_tail);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale);
|
||||
status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp);
|
||||
if (attr[1]->dtype == F16)
|
||||
{
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"uniExtract8Data_2x8", &uniExtractHalf8_2x8 );
|
||||
}
|
||||
else
|
||||
{
|
||||
status |= vsi_nn_kernel_gpu_add_param( node,
|
||||
"uniExtract8Data_2x8", &uniExtractInteger_2x8 );
|
||||
}
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
}while(0);
|
||||
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
final:
|
||||
if (attr[0])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[0] );
|
||||
attr[0] = NULL;
|
||||
}
|
||||
|
||||
if (attr[1])
|
||||
{
|
||||
vsi_nn_kernel_tensor_attr_release( &attr[1] );
|
||||
attr[1] = NULL;
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _warp_affine_initializer() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
const uint32_t hashkey,
|
||||
_internal_kernel_e kernel_id
|
||||
)
|
||||
{
|
||||
vx_kernel_initialize_f initializer = NULL;
|
||||
vx_param_description_t * param_def;
|
||||
vsi_status status = VSI_FAILURE;
|
||||
const _kernel_map_type* kernel_map;
|
||||
size_t kernel_map_size;
|
||||
size_t param_size;
|
||||
uint32_t i;
|
||||
|
||||
switch( kernel_id )
|
||||
{
|
||||
case INTERNAL_KERNEL_GET_MATRIX:
|
||||
initializer = _get_matrix_initializer;
|
||||
kernel_map = _get_matrix_kernel_map;
|
||||
kernel_map_size = _cnt_of_array( _get_matrix_kernel_map );
|
||||
param_def = _get_matrix_kernel_param_def;
|
||||
param_size = _GET_MATRIX_PARAM_NUM;
|
||||
break;
|
||||
case INTERNAL_KERNEL_WARP_AFFINE:
|
||||
initializer = _warp_affine_initializer;
|
||||
kernel_map = _warp_affine_kernel_map;
|
||||
kernel_map_size = _cnt_of_array( _warp_affine_kernel_map );
|
||||
param_def = _warp_affine_kernel_param_def;
|
||||
param_size = _WARP_AFFINE_PARAM_NUM;
|
||||
break;
|
||||
default:
|
||||
VSI_ASSERT( FALSE );
|
||||
return VSI_FAILURE;
|
||||
}
|
||||
|
||||
for( i = 0; i < kernel_map_size; i ++ )
|
||||
{
|
||||
if( kernel_map[i].key == hashkey )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if( i < kernel_map_size )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
|
||||
kernel->info.parameters = param_def;
|
||||
kernel->info.numParams = (uint32_t)param_size;
|
||||
kernel->info.initialize = initializer;
|
||||
// Register code source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
|
||||
"vsi_nn_kernel_header",
|
||||
kernel_map[i].source_name );
|
||||
// Register binary source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
kernel_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
#define INTERNAL_KERNEL_SIZE (2)
|
||||
#define MATRIX_INDEX (0)
|
||||
#define WARP_AFFINE_INDEX (1)
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_GET_MATRIX_PARAM_NUM] = { NULL };
|
||||
vsi_nn_kernel_node_param_t warp_affine_node_params[_WARP_AFFINE_PARAM_NUM] = { NULL };
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
vsi_nn_kernel_dtype_e in0_dtype;
|
||||
vsi_nn_kernel_dtype_e in1_dtype;
|
||||
vsi_nn_kernel_dtype_e out_dtype;
|
||||
vsi_nn_tensor_attr_t attr;
|
||||
vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL };
|
||||
vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL };
|
||||
vsi_nn_tensor_t * warp_affine_tensors[2] = {NULL};
|
||||
uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 };
|
||||
int32_t has_theta_1_1 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_1" );
|
||||
int32_t has_theta_1_2 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_2" );
|
||||
int32_t has_theta_1_3 = vsi_nn_kernel_param_get_int32( params, "has_theta_1_3" );
|
||||
int32_t has_theta_2_1 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_1" );
|
||||
int32_t has_theta_2_2 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_2" );
|
||||
int32_t has_theta_2_3 = vsi_nn_kernel_param_get_int32( params, "has_theta_2_3" );
|
||||
float theta_1_1 = vsi_nn_kernel_param_get_float32( params, "theta_1_1" );
|
||||
float theta_1_2 = vsi_nn_kernel_param_get_float32( params, "theta_1_2" );
|
||||
float theta_1_3 = vsi_nn_kernel_param_get_float32( params, "theta_1_3" );
|
||||
float theta_2_1 = vsi_nn_kernel_param_get_float32( params, "theta_2_1" );
|
||||
float theta_2_2 = vsi_nn_kernel_param_get_float32( params, "theta_2_2" );
|
||||
float theta_2_3 = vsi_nn_kernel_param_get_float32( params, "theta_2_3" );
|
||||
int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" );
|
||||
float input_w = (float)inputs[0]->attr.size[0];
|
||||
float input_h = (float)inputs[0]->attr.size[1];
|
||||
float output_w = (float)outputs[0]->attr.size[0];
|
||||
float output_h = (float)outputs[0]->attr.size[1];
|
||||
int32_t i = 0;
|
||||
|
||||
if (align_corners && output_w > 1)
|
||||
{
|
||||
output_w = output_w - 1;
|
||||
}
|
||||
|
||||
if (align_corners && output_h > 1)
|
||||
{
|
||||
output_h = output_h - 1;
|
||||
}
|
||||
|
||||
// Check if gpu can support the size
|
||||
if( !vsi_nn_kernel_gpu_check_shape(
|
||||
(int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) )
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )
|
||||
{
|
||||
ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
|
||||
// Assign unique_id
|
||||
ikernels[i]->unique_id = kernel->unique_id;
|
||||
}
|
||||
|
||||
memcpy( &attr, &(inputs[1]->attr), sizeof(vsi_nn_tensor_attr_t) );
|
||||
attr.size[0] = 16;
|
||||
attr.dim_num = 2;
|
||||
attr.dtype.vx_type = VSI_NN_TYPE_UINT16;
|
||||
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
|
||||
attr.is_const = FALSE;
|
||||
attr.vtl = TRUE;
|
||||
tensors[0] = vsi_nn_CreateTensor( graph, &attr );
|
||||
|
||||
attr.size[3] = attr.size[1];
|
||||
attr.size[2] = attr.size[1] = 1;
|
||||
attr.dim_num = inputs[0]->attr.dim_num;
|
||||
tensors[1] = vsi_nn_reshape_tensor( graph,
|
||||
tensors[0], (uint32_t*)attr.size, attr.dim_num );
|
||||
|
||||
warp_affine_tensors[0] = inputs[0];
|
||||
warp_affine_tensors[1] = tensors[1];
|
||||
|
||||
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
hashkeys[MATRIX_INDEX]= GET_MATRIX_HASH_KEY( in1_dtype, F32 );
|
||||
hashkeys[WARP_AFFINE_INDEX] = WARP_AFFINE_HASH_KEY( in0_dtype, out_dtype );
|
||||
|
||||
status = _query_kernel( ikernels[MATRIX_INDEX], hashkeys[MATRIX_INDEX], INTERNAL_KERNEL_GET_MATRIX );
|
||||
if( VSI_SUCCESS != status )
|
||||
{
|
||||
goto final;
|
||||
}
|
||||
|
||||
status = _query_kernel( ikernels[WARP_AFFINE_INDEX], hashkeys[WARP_AFFINE_INDEX], INTERNAL_KERNEL_WARP_AFFINE );
|
||||
if( VSI_SUCCESS != status )
|
||||
{
|
||||
goto final;
|
||||
}
|
||||
|
||||
// Get Matrix
|
||||
node = vsi_nn_kernel_create_node( graph, ikernels[MATRIX_INDEX] );
|
||||
vsi_nn_kernel_node_pack_io( node_params, _GET_MATRIX_PARAM_NUM,
|
||||
&inputs[1], 1, &tensors[0], 1 );
|
||||
node_params[HAS_THETA_1_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_1 );
|
||||
node_params[HAS_THETA_1_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_2 );
|
||||
node_params[HAS_THETA_1_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_1_3 );
|
||||
node_params[HAS_THETA_2_1] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_1 );
|
||||
node_params[HAS_THETA_2_2] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_2 );
|
||||
node_params[HAS_THETA_2_3] = vsi_nn_kernel_scalar_create( graph, I32, &has_theta_2_3 );
|
||||
node_params[THETA_1_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_1 );
|
||||
node_params[THETA_1_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_2 );
|
||||
node_params[THETA_1_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_1_3 );
|
||||
node_params[THETA_2_1] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_1 );
|
||||
node_params[THETA_2_2] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_2 );
|
||||
node_params[THETA_2_3] = vsi_nn_kernel_scalar_create( graph, F32, &theta_2_3 );
|
||||
node_params[I_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &input_w );
|
||||
node_params[I_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &input_h );
|
||||
node_params[O_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &output_w );
|
||||
node_params[O_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &output_h );
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _GET_MATRIX_PARAM_NUM );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_1] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_2] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_1_3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_1] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_2] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[HAS_THETA_2_3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_1_1] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_1_2] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_1_3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_2_1] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_2_2] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[THETA_2_3] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[I_WIDTH] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[I_HEIGHT] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[O_WIDTH] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[O_HEIGHT] );
|
||||
vsi_nn_kernel_node_release( &node );
|
||||
|
||||
// Warp Affine
|
||||
node = vsi_nn_kernel_create_node( graph, ikernels[WARP_AFFINE_INDEX] );
|
||||
if (node)
|
||||
{
|
||||
vx_border_t border;
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = 0;
|
||||
border.constant_value.S16 = 0;
|
||||
border.constant_value.U8 = 0;
|
||||
if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 &&
|
||||
inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
|
||||
{
|
||||
border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point;
|
||||
}
|
||||
status = vsi_nn_kernel_node_set_border( node, &border );
|
||||
VSI_ASSERT( status == VSI_SUCCESS );
|
||||
}
|
||||
vsi_nn_kernel_node_pack_io( warp_affine_node_params, _WARP_AFFINE_PARAM_NUM,
|
||||
warp_affine_tensors, 2, outputs, 1 );
|
||||
status = vsi_nn_kernel_node_pass_param( node, warp_affine_node_params, _WARP_AFFINE_PARAM_NUM );
|
||||
final:
|
||||
for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )
|
||||
{
|
||||
if( ikernels[i] )
|
||||
{
|
||||
vsi_nn_kernel_release( &ikernels[i] );
|
||||
}
|
||||
if( tensors[i] )
|
||||
{
|
||||
vsi_nn_ReleaseTensor( &tensors[i] );
|
||||
}
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_EVIS( spatial_transformer, _setup )
|
||||
|
|
@ -0,0 +1,248 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_graph.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
/*
|
||||
* Define kernel meta.
|
||||
*/
|
||||
#define KERNEL_SOURCE "tensorstackconcat",
|
||||
|
||||
#define HASH_SH_KEY( IN_DTYPE, OUT_DTYPE, _image_2d ) \
|
||||
(( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) | (_image_2d))
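// The key packs the input dtype into bits 20 and up, the output dtype into bits 8..19,
// and the 2D flag into the low bits.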
|
||||
|
||||
#define PACK_KERNEL_8BITS_MAP(SRC_TYPE, OUT_TYPE) \
|
||||
{ HASH_SH_KEY(SRC_TYPE, OUT_TYPE, 0), \
|
||||
CVIVANTE_NAMESPACE("evis.tensorstackconcat_8bits"), \
|
||||
KERNEL_SOURCE },
|
||||
|
||||
#define PACK_KERNEL_8BITS_MAP_2D(SRC_TYPE, OUT_TYPE) \
|
||||
{ HASH_SH_KEY(SRC_TYPE, OUT_TYPE, 1), \
|
||||
CVIVANTE_NAMESPACE("evis.tensorstackconcat_8bits_2D"), \
|
||||
KERNEL_SOURCE },
|
||||
|
||||
#define PACK_KERNEL_16BITS_MAP(SRC_TYPE, OUT_TYPE) \
|
||||
{ HASH_SH_KEY(SRC_TYPE, OUT_TYPE, 0), \
|
||||
CVIVANTE_NAMESPACE("evis.tensorstackconcat_16bits"), \
|
||||
KERNEL_SOURCE },
|
||||
|
||||
#define PACK_KERNEL_16BITS_MAP_2D(SRC_TYPE, OUT_TYPE) \
|
||||
{ HASH_SH_KEY(SRC_TYPE, OUT_TYPE, 1), \
|
||||
CVIVANTE_NAMESPACE("evis.tensorstackconcat_16bits_2D"), \
|
||||
KERNEL_SOURCE },
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint32_t key;
|
||||
char * function_name;
|
||||
const char * source_name;
|
||||
} _kernel_map_type;
|
||||
|
||||
static const _kernel_map_type _tensorstackconcat_kernel_map[] =
|
||||
{
|
||||
// Register kernel here
|
||||
PACK_KERNEL_8BITS_MAP( I8, I8 )
|
||||
PACK_KERNEL_8BITS_MAP( U8, U8 )
|
||||
PACK_KERNEL_8BITS_MAP_2D( I8, I8 )
|
||||
PACK_KERNEL_8BITS_MAP_2D( U8, U8 )
|
||||
|
||||
PACK_KERNEL_16BITS_MAP( F16, F16 )
|
||||
PACK_KERNEL_16BITS_MAP( BF16, BF16 )
|
||||
PACK_KERNEL_16BITS_MAP( I16, I16 )
|
||||
PACK_KERNEL_16BITS_MAP_2D( F16, F16 )
|
||||
PACK_KERNEL_16BITS_MAP_2D( BF16, BF16 )
|
||||
PACK_KERNEL_16BITS_MAP_2D( I16, I16 )
|
||||
};
|
||||
|
||||
/*
|
||||
* Kernel params
|
||||
*/
|
||||
static vx_param_description_t _tensorstackconcat_kernel_param_def[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _TENSORSTACKCONCAT_PARAM_NUM _cnt_of_array( _tensorstackconcat_kernel_param_def )
|
||||
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_tensorstackconcat_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
gpu_param_t gpu_param = {
|
||||
3,
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0},
|
||||
{0, 0, 0}
|
||||
};
|
||||
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
|
||||
vsi_int_array_t * in_shape = NULL;
|
||||
// Add initializer
|
||||
|
||||
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
|
||||
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
|
||||
|
||||
in_shape = input_attr->shape;
|
||||
|
||||
if (input_attr->dtype == I16 || input_attr->dtype == F16)
|
||||
{
|
||||
gpu_param.global_scale[0] = 8;
|
||||
}
|
||||
else
|
||||
{
|
||||
gpu_param.global_scale[0] = 16;
|
||||
}
|
||||
gpu_param.global_scale[1] = 1;
|
||||
gpu_param.global_scale[2] = 1;
|
||||
|
||||
gpu_param.global_size[0] = gpu_align_p2(
|
||||
(in_shape->data[0] + gpu_param.global_scale[0] - 1)
|
||||
/ gpu_param.global_scale[0], 4);
|
||||
gpu_param.global_size[1] = 1;
|
||||
gpu_param.global_size[2] = in_shape->size > 2 ? in_shape->data[2] : 1;
|
||||
|
||||
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
|
||||
|
||||
final:
|
||||
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
|
||||
SAFE_FREE_TENSOR_ATTR(input_attr);
|
||||
#undef SAFE_FREE_TENSOR_ATTR
|
||||
|
||||
return status;
|
||||
} /* _tensorstackconcat_initializer() */
|
||||
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
static vsi_status _query_kernel
|
||||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs,
|
||||
vsi_bool image_2d
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_dtype_e in_dtype;
|
||||
vsi_nn_kernel_dtype_e out_dtype;
|
||||
const _kernel_map_type * kernel_map = _tensorstackconcat_kernel_map;
|
||||
size_t kernel_map_size = _cnt_of_array( _tensorstackconcat_kernel_map );
|
||||
vx_param_description_t * param_def = _tensorstackconcat_kernel_param_def;
|
||||
vx_kernel_initialize_f initializer = _tensorstackconcat_initializer;
|
||||
|
||||
uint32_t key;
|
||||
uint32_t i;
|
||||
|
||||
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
key = HASH_SH_KEY( in_dtype, out_dtype, image_2d );
|
||||
|
||||
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
|
||||
{
|
||||
if ( kernel_map[i].key == key )
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if ( i < (uint32_t)kernel_map_size )
|
||||
{
|
||||
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
|
||||
kernel->info.parameters = param_def;
|
||||
kernel->info.numParams = _cnt_of_array( _tensorstackconcat_kernel_param_def );
|
||||
kernel->info.initialize = initializer;
|
||||
// Register code source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
|
||||
"vsi_nn_kernel_header",
|
||||
kernel_map[i].source_name );
|
||||
// Register binary source
|
||||
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
|
||||
kernel_map[i].source_name );
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
return status;
|
||||
} /* _query_kernel() */
|
||||
|
||||
|
||||
static vsi_nn_kernel_node_t _setup
|
||||
(
|
||||
vsi_nn_graph_t * graph,
|
||||
vsi_nn_tensor_t ** inputs,
|
||||
size_t input_num,
|
||||
vsi_nn_tensor_t ** outputs,
|
||||
size_t output_num,
|
||||
const vsi_nn_kernel_param_t * params,
|
||||
vsi_nn_kernel_t * kernel
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_TENSORSTACKCONCAT_PARAM_NUM];
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
vsi_bool image_2d = FALSE;
|
||||
|
||||
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
|
||||
status = _query_kernel( kernel, inputs, outputs, image_2d );
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
if ( node )
|
||||
{
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _TENSORSTACKCONCAT_PARAM_NUM,
|
||||
inputs, input_num, outputs, output_num );
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _TENSORSTACKCONCAT_PARAM_NUM );
|
||||
}
|
||||
}
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
REGISTER_BACKEND_EVIS( tensorstackconcat, _setup )
|
||||
|
|
@ -444,14 +444,15 @@ static vsi_status _gpu_register
|
|||
if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type )
|
||||
{
|
||||
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
|
||||
"-cl-viv-vx-extension -D VX_VERSION=2" );
|
||||
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d",
|
||||
context->config.use_40bits_va );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
|
||||
"-cl-viv-vx-extension -D VX_VERSION=%d",
|
||||
context->config.evis.ver );
|
||||
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d",
|
||||
context->config.evis.ver, context->config.use_40bits_va );
|
||||
}
|
||||
// Pack build option
|
||||
if( kernel->gpu.sources[active_fmt].build_option.data )
|
||||
|
|
@ -812,7 +813,6 @@ void vsi_nn_kernel_add_build_option
|
|||
}
|
||||
snprintf( &buf[org_size], item_size + 2, " %s", option );
|
||||
build_option->data = buf;
|
||||
|
||||
} /* vsi_nn_kernel_add_build_option() */
|
||||
|
||||
void vsi_nn_kernel_release
|
||||
|
|
@ -1224,18 +1224,7 @@ vsi_status vsi_nn_kernel_pirority_set
|
|||
|
||||
static vsi_bool _check_shader_support(vsi_nn_graph_t* graph)
|
||||
{
|
||||
char *envctrl;
|
||||
static int32_t enableShader = -1;
|
||||
|
||||
if (enableShader == -1)
|
||||
{
|
||||
enableShader = 1;
|
||||
envctrl = getenv("VIV_VX_ENABLE_SHADER");
|
||||
if (envctrl)
|
||||
{
|
||||
enableShader = atoi(envctrl);
|
||||
}
|
||||
}
|
||||
int32_t enableShader = graph->ctx->options.enable_shader;
|
||||
|
||||
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
|
||||
if ( graph->ctx->config.subGroupSize == 0 )
|
||||
|
|
@ -1251,4 +1240,3 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph)
|
|||
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -127,5 +127,7 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(hard_sigmoid)
|
|||
REGISTER_VX_FIRST_KERNEL_SELECTOR(clip)
|
||||
REGISTER_VX_FIRST_KERNEL_SELECTOR(relu_keras)
|
||||
REGISTER_VX_FIRST_KERNEL_SELECTOR(erf)
|
||||
REGISTER_VX_FIRST_KERNEL_SELECTOR(gelu)
|
||||
REGISTER_VX_FIRST_KERNEL_SELECTOR(hard_gelu)
|
||||
|
||||
__END_DECLS
|
||||
|
|
|
|||
|
|
@ -78,6 +78,59 @@ static float mish_eval(float data, float alpha)
|
|||
return data;
|
||||
}
|
||||
|
||||
static float erf_eval(float x)
|
||||
{
|
||||
float res = 0;
|
||||
float tmp = x;
|
||||
float factorial = 1; /*n!*/
|
||||
float x_pow = x;
|
||||
int32_t one = 1;
|
||||
int32_t n = 1;
|
||||
|
||||
if (x <= -3)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
else if (x >= 3)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
while (vsi_abs(tmp) > 1e-5)
|
||||
{
|
||||
res += tmp;
|
||||
|
||||
factorial *= n;
|
||||
one *= -1;
|
||||
x_pow *= x * x;
|
||||
tmp = one / factorial * x_pow / ( 2 * n + 1);
|
||||
|
||||
n ++;
|
||||
}
|
||||
#define VSI_MUL2_RSQRTPI (1.1283791670955126f)
|
||||
|
||||
res *= VSI_MUL2_RSQRTPI;
|
||||
|
||||
return res;
|
||||
}
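/* erf_eval accumulates the Maclaurin series erf(x) = (2/sqrt(pi)) * sum_n (-1)^n * x^(2n+1) / (n! * (2n+1))
 * until the next term drops below 1e-5, and saturates to +/-1 for |x| >= 3. */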
|
||||
|
||||
static float gelu_eval(float data, float alpha)
|
||||
{
|
||||
data = (float)(0.5f * data * (1 + erf_eval(data / (float)sqrt(2.0f))));
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
|
||||
#define VSI_SQRT_2_RCP_PI 0.7978845834732056f
|
||||
static float hgelu_eval(float data, float alpha)
|
||||
{
|
||||
float cdf = (float)(0.5f * (1.0f + tanh((VSI_SQRT_2_RCP_PI *
|
||||
(data + 0.044715f * data * data * data)))));
|
||||
|
||||
return data * cdf;
|
||||
}
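/* gelu_eval implements the exact GELU, 0.5 * x * (1 + erf(x / sqrt(2))); hgelu_eval uses the
 * tanh approximation 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))). */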
|
||||
|
||||
#ifdef VX_USER_LOOKUP_TABLE_SUPPORT
|
||||
static int32_t _lut_comparator(const void *pa, const void *pb)
|
||||
{
|
||||
|
|
@ -232,6 +285,8 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( log, log_eval )
|
|||
REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( elu, elu_eval )
|
||||
REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( neg, neg_eval )
|
||||
REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( hard_sigmoid, hsigmoid_eval )
|
||||
REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( gelu, gelu_eval )
|
||||
REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( hard_gelu, hgelu_eval )
|
||||
|
||||
#undef REGISTER_ELTWISE_UNARY_OPENVX_KERNEL
|
||||
|
||||
|
|
|
|||
|
|
@ -38,8 +38,9 @@ typedef struct _sort_lut_s
|
|||
float val;
|
||||
} sort_lut;
|
||||
|
||||
static float erf_eval(float x)
|
||||
static float erf_eval(float _x)
|
||||
{
|
||||
float x = vsi_clamp(_x, -2, 2);
|
||||
float res = 0;
|
||||
float tmp = x;
|
||||
float factorial = 1; /*n!*/
|
||||
|
|
|
|||
|
|
@ -1,4 +1,10 @@
|
|||
|
||||
#define READ_IMAGEF_ARRAY2D(dest, tensor, coord) \
|
||||
do { \
|
||||
int depth = get_image_array_size(tensor); \
|
||||
_viv_asm(CLAMP0MAX, (coord).z, (coord).z, depth - 1); \
|
||||
dest = read_imagef(tensor, coord); \
|
||||
} while(0)
|
||||
__kernel void batch_norm_F32toF32
|
||||
(
|
||||
__read_only image2d_array_t input,
|
||||
|
|
@ -17,11 +23,11 @@ __kernel void batch_norm_F32toF32
|
|||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
|
||||
|
||||
float4 src, mean, var, gamma, beta;
|
||||
readImage2DArray(src, input, coord);
|
||||
readImage2DArray(mean, Mean, coord);
|
||||
readImage2DArray(var, Variance, coord);
|
||||
readImage2DArray(gamma, Gamma, coord);
|
||||
readImage2DArray(beta, Beta, coord);
|
||||
READ_IMAGEF_2DARRAY(src, input, coord);
|
||||
READ_IMAGEF_2DARRAY(mean, Mean, coord);
|
||||
READ_IMAGEF_2DARRAY(var, Variance, coord);
|
||||
READ_IMAGEF_2DARRAY(gamma, Gamma, coord);
|
||||
READ_IMAGEF_2DARRAY(beta, Beta, coord);
|
||||
|
||||
float4 dst;
|
||||
src.x = src.x - mean.x;
|
||||
|
|
@ -81,11 +87,11 @@ __kernel void batch_norm_U8toU8
|
|||
|
||||
uint4 data;
|
||||
float4 src, mean, var, gamma, beta;
|
||||
readImage2DArray(data, input, coord);
|
||||
readImage2DArray(mean, Mean, coord);
|
||||
readImage2DArray(var, Variance, coord);
|
||||
readImage2DArray(gamma, Gamma, coord);
|
||||
readImage2DArray(beta, Beta, coord);
|
||||
READ_IMAGEF_2DARRAY(data, input, coord);
|
||||
READ_IMAGEF_2DARRAY(mean, Mean, coord);
|
||||
READ_IMAGEF_2DARRAY(var, Variance, coord);
|
||||
READ_IMAGEF_2DARRAY(gamma, Gamma, coord);
|
||||
READ_IMAGEF_2DARRAY(beta, Beta, coord);
|
||||
|
||||
src = convert_float4(data) * input_scale - input_tail;
|
||||
src.x = src.x - mean.x;
|
||||
|
|
|
|||
|
|
@ -18,11 +18,19 @@ inline Image create_image_from_image2d(image2d_t input, int stride_x)
|
|||
int8 desc;
|
||||
_viv_asm(COPY, desc, input, sizeof(desc));
|
||||
|
||||
#if (USE_40BITS_VA==0)
|
||||
uint address = as_uint(desc.s0);
|
||||
int stride_y = desc.s1;
|
||||
#else
|
||||
ulong address = as_ulong(desc.s05);
|
||||
int stride_y = desc.s6;
|
||||
#endif
|
||||
|
||||
Image img =
|
||||
{
|
||||
.ptr = (uchar*)desc.s0,
|
||||
.ptr = (uchar*)address,
|
||||
.stride_x = stride_x,
|
||||
.stride_y = desc.s1
|
||||
.stride_y = stride_y
|
||||
};
|
||||
|
||||
return img;
|
||||
|
|
@ -36,53 +44,60 @@ typedef struct Tensor
|
|||
int stride_z;
|
||||
} Tensor;
|
||||
|
||||
inline uchar* create_tensor_ptr_from_coord(Tensor t, int4 coord)
|
||||
inline uchar* get_tensor_ptr_from_coord(Tensor t, int4 coord)
|
||||
{
|
||||
return t.ptr + coord.x * t.stride_x + coord.y * t.stride_y + coord.z * t.stride_z;
|
||||
}
|
||||
|
||||
inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride_x)
|
||||
{
|
||||
#if (USE_40BITS_VA==0)
|
||||
int8 desc;
|
||||
_viv_asm(COPY, desc, input, sizeof(desc));
|
||||
|
||||
uint address = as_uint(desc.s0);
|
||||
int stride_y = desc.s1;
|
||||
int stride_z = desc.s4;
|
||||
#else
|
||||
int16 desc;
|
||||
_viv_asm(COPY, desc, input, sizeof(desc));
|
||||
|
||||
ulong address = as_ulong(desc.s05);
|
||||
int stride_y = desc.s6;
|
||||
int stride_z = desc.sa;
|
||||
#endif
|
||||
|
||||
Tensor t =
|
||||
{
|
||||
.ptr = (uchar*)desc.s0,
|
||||
.ptr = (uchar*)address,
|
||||
.stride_x = stride_x,
|
||||
.stride_y = desc.s1,
|
||||
.stride_z = desc.s4
|
||||
.stride_y = stride_y,
|
||||
.stride_z = stride_z
|
||||
};
|
||||
|
||||
return t;
|
||||
}
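/* With USE_40BITS_VA the descriptor carries a 64-bit base address (desc.s0/s5) and the
 * y/z strides sit in higher descriptor slots, so the pointer and strides are taken from
 * different lanes than in the 32-bit layout above. */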
|
||||
|
||||
#define readImage2DArray(Dest, Image, Coord) \
|
||||
do { \
|
||||
int8 desc; \
|
||||
_viv_asm(COPY, desc, Image, sizeof(desc)); \
|
||||
_viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \
|
||||
int baseAddr = (int)(Coord).w * desc.s4 + desc.s0; \
|
||||
_viv_asm(MOV, (Coord).w, baseAddr); \
|
||||
_viv_asm(IMAGE_READ_3D, Dest, Image, (Coord).xyww); \
|
||||
} while (0)
|
||||
#define READ_IMAGEF_2DARRAY(dest, tensor, coord) \
|
||||
do { \
|
||||
int depth = get_image_array_size(tensor); \
|
||||
int4 coord_in = coord; \
|
||||
_viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \
|
||||
dest = read_imagef(tensor, coord_in); \
|
||||
} while(0)
|
||||
|
||||
#define writeImage2DArray(Image, Coord, Color) \
|
||||
do { \
|
||||
int8 desc; \
|
||||
_viv_asm(COPY, desc, Image, sizeof(desc)); \
|
||||
_viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \
|
||||
int baseAddr = (int)(Coord).w * desc.s4 + desc.s0; \
|
||||
_viv_asm(MOV, (Coord).w, baseAddr); \
|
||||
_viv_asm(IMAGE_WRITE_3D, Color, Image, (Coord).xyww); \
|
||||
} while (0)
|
||||
#define READ_IMAGEI_2DARRAY(dest, tensor, coord) \
|
||||
do { \
|
||||
int depth = get_image_array_size(tensor); \
|
||||
int4 coord_in = coord; \
|
||||
_viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \
|
||||
dest = read_imagei(tensor, coord_in); \
|
||||
} while(0)
|
||||
|
||||
#define readImage(Dest, Image, Coord) \
|
||||
do { \
|
||||
_viv_asm(IMAGE_READ, Dest, Image, Coord); \
|
||||
} while (0)
|
||||
|
||||
#define writeImage(Image, Coord, Color) \
|
||||
do { \
|
||||
_viv_asm(IMAGE_WRITE, Color, Image, Coord); \
|
||||
} while (0)
|
||||
#define READ_IMAGEUI_2DARRAY(dest, tensor, coord) \
|
||||
do { \
|
||||
int depth = get_image_array_size(tensor); \
|
||||
int4 coord_in = coord; \
|
||||
_viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \
|
||||
dest = read_imageui(tensor, coord_in); \
|
||||
} while(0)
|
||||
|
|
|
|||
|
|
@ -1,12 +1,12 @@
|
|||
|
||||
float4 eltwise_unary_sin(float4 x, float alpha)
|
||||
float eltwise_unary_sin(float x, float alpha)
|
||||
{
|
||||
return native_sin(x);
|
||||
}
|
||||
|
||||
#define logE (1.44269502f)
|
||||
#define twoLogE (logE * 2.0f)
|
||||
float4 eltwise_unary_exp(float4 x, float alpha)
|
||||
float eltwise_unary_exp(float x, float alpha)
|
||||
{
|
||||
x *= logE;
|
||||
x = exp2(x);
|
||||
|
|
@ -14,33 +14,33 @@ float4 eltwise_unary_exp(float4 x, float alpha)
|
|||
}
|
||||
|
||||
#define rlogE (0.693147182f)
|
||||
float4 eltwise_unary_log(float4 x, float alpha)
|
||||
float eltwise_unary_log(float x, float alpha)
|
||||
{
|
||||
x = log2(x);
|
||||
return x * rlogE;
|
||||
}
|
||||
|
||||
float4 eltwise_unary_elu(float4 val, float alpha)
|
||||
float eltwise_unary_elu(float val, float alpha)
|
||||
{
|
||||
float4 x = val * logE;
|
||||
float x = val * logE;
|
||||
x = exp2(x) * alpha - alpha;
|
||||
|
||||
return val < 0 ? x : val;
|
||||
}
|
||||
|
||||
float4 eltwise_unary_neg(float4 x, float alpha)
|
||||
float eltwise_unary_neg(float x, float alpha)
|
||||
{
|
||||
return x * -1;
|
||||
}
|
||||
|
||||
float4 eltwise_unary_hard_sigmoid(float4 x, float alpha)
|
||||
float eltwise_unary_hard_sigmoid(float x, float alpha)
|
||||
{
|
||||
x = 0.2 * x + 0.5;
|
||||
x = clamp(x, 0, 1);
|
||||
return x;
|
||||
}
|
||||
|
||||
float4 _softrelu(float4 x, float alpha)
|
||||
float _softrelu(float x, float alpha)
|
||||
{
|
||||
x *= logE;
|
||||
x = exp2(x);
|
||||
|
|
@ -49,7 +49,7 @@ float4 _softrelu(float4 x, float alpha)
|
|||
return x * rlogE;
|
||||
}
|
||||
|
||||
float4 _tanh(float4 x, float alpha)
|
||||
float _tanh(float x, float alpha)
|
||||
{
|
||||
x *= -twoLogE;
|
||||
x = 1 + exp2(x);
|
||||
|
|
@ -57,16 +57,60 @@ float4 _tanh(float4 x, float alpha)
|
|||
return (2 * x - 1);
|
||||
}
|
||||
|
||||
float4 eltwise_unary_mish(float4 x, float alpha)
|
||||
float eltwise_unary_mish(float x, float alpha)
|
||||
{
|
||||
float4 y = _softrelu(x, alpha);
|
||||
float y = _softrelu(x, alpha);
|
||||
x = x * _tanh(y, alpha);
|
||||
return x;
|
||||
}
|
||||
|
||||
float4 eltwise_unary_round(float4 x, float alpha)
|
||||
float eltwise_unary_round(float x, float alpha)
|
||||
{
|
||||
return convert_float4(convert_int4_rte(x));
|
||||
return convert_float(convert_int_rte(x));
|
||||
}
|
||||
|
||||
#define MUL2_RSQRTPI (1.1283791670955126f)
|
||||
float erf_eval(float x)
|
||||
{
|
||||
float res = 0;
|
||||
float tmp = x;
|
||||
float factorial = 1;
|
||||
float x_pow = x;
|
||||
float one = 1.0f;
|
||||
float n = 1;
|
||||
|
||||
if (x <= -3)
|
||||
return -1;
|
||||
else if (x >= 3)
|
||||
return 1;
|
||||
|
||||
while (fabs(tmp) > 1e-5)
|
||||
{
|
||||
res += tmp;
|
||||
|
||||
factorial *= n;
|
||||
one *= -1;
|
||||
x_pow *= x * x;
|
||||
tmp = one / factorial * x_pow / ( 2 * n + 1);
|
||||
|
||||
n += 1.0f;
|
||||
}
|
||||
return res * MUL2_RSQRTPI;
|
||||
}
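erf_eval above sums the Maclaurin series erf(x) = (2/sqrt(pi)) * sum_n (-1)^n x^(2n+1) / (n! (2n+1)), stopping once the next term drops below 1e-5 and saturating to +/-1 outside [-3, 3]. A self-contained C check of the same series against libm's erf() (the tolerance and cut-off mirror the kernel; the test harness itself is just scaffolding):

#include <math.h>
#include <stdio.h>

static float erf_series(float x)
{
    float res = 0.0f, term = x, factorial = 1.0f, x_pow = x, sign = 1.0f, n = 1.0f;
    if (x <= -3.0f) return -1.0f;
    if (x >=  3.0f) return  1.0f;
    while (fabsf(term) > 1e-5f)
    {
        res += term;
        factorial *= n;                 /* n!            */
        sign *= -1.0f;                  /* (-1)^n        */
        x_pow *= x * x;                 /* x^(2n+1)      */
        term = sign / factorial * x_pow / (2.0f * n + 1.0f);
        n += 1.0f;
    }
    return res * 1.1283791670955126f;   /* 2 / sqrt(pi)  */
}

int main(void)
{
    float x;
    for (x = -2.0f; x <= 2.0f; x += 0.5f)
        printf("x=% .1f  series=% .6f  libm=% .6f\n", x, erf_series(x), erf(x));
    return 0;
}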
|
||||
#define RSQRT2 (0.70710678118654752440084436210485f)
|
||||
float eltwise_unary_gelu(float x, float alpha)
|
||||
{
|
||||
x = 0.5f * x * (1 + erf_eval(x * RSQRT2));
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
#define SQRT_2_RCP_PI 0.7978845834732056f
|
||||
float eltwise_unary_hard_gelu(float x, float alpha)
|
||||
{
|
||||
float cdf = 0.5f + 0.5f * _tanh(SQRT_2_RCP_PI *
|
||||
(x + 0.044715f * x * x * x), 0);
|
||||
return x * cdf;
|
||||
}
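eltwise_unary_gelu implements the exact form 0.5*x*(1 + erf(x/sqrt(2))), while eltwise_unary_hard_gelu uses the common tanh approximation 0.5*x*(1 + tanh(sqrt(2/pi)*(x + 0.044715*x^3))). A small C comparison of the two, using libm rather than the kernel's series and exp2-based helpers:

#include <math.h>
#include <stdio.h>

static float gelu_exact(float x)
{
    return 0.5f * x * (1.0f + erff(x * 0.70710678f));            /* x / sqrt(2)  */
}

static float gelu_tanh(float x)
{
    float t = 0.7978845834732056f * (x + 0.044715f * x * x * x); /* sqrt(2/pi)   */
    return x * (0.5f + 0.5f * tanhf(t));
}

int main(void)
{
    float x;
    for (x = -3.0f; x <= 3.0f; x += 1.0f)
        printf("x=% .1f  exact=% .6f  tanh=% .6f\n", x, gelu_exact(x), gelu_tanh(x));
    return 0;
}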
|
||||
|
||||
#define ELTWISE_UNARY_F32(func_name) \
|
||||
|
|
@ -85,9 +129,10 @@ __kernel void func_name##_F32toF32 \
|
|||
\
|
||||
float4 src = read_imagef(input, coord); \
|
||||
\
|
||||
float4 dst = eltwise_unary_##func_name(src, alpha); \
|
||||
float4 dst = 0; \
|
||||
dst.x = eltwise_unary_##func_name(src.x, alpha); \
|
||||
\
|
||||
write_imagef(output, coord, dst); \
|
||||
write_imagef(output, coord, dst.xxxx); \
|
||||
}
|
||||
ELTWISE_UNARY_F32(sin)
|
||||
ELTWISE_UNARY_F32(exp)
|
||||
|
|
@ -97,6 +142,8 @@ ELTWISE_UNARY_F32(neg)
|
|||
ELTWISE_UNARY_F32(mish)
|
||||
ELTWISE_UNARY_F32(hard_sigmoid)
|
||||
ELTWISE_UNARY_F32(round)
|
||||
ELTWISE_UNARY_F32(gelu)
|
||||
ELTWISE_UNARY_F32(hard_gelu)
|
||||
|
||||
#define ELTWISE_UNARY_F32_2D(func_name) \
|
||||
__kernel void func_name##_F32toF32_2D \
|
||||
|
|
@ -114,9 +161,10 @@ __kernel void func_name##_F32toF32_2D \
|
|||
\
|
||||
float4 src = read_imagef(input, coord); \
|
||||
\
|
||||
float4 dst = eltwise_unary_##func_name(src, alpha); \
|
||||
float4 dst = 0; \
|
||||
dst.x = eltwise_unary_##func_name(src.x, alpha); \
|
||||
\
|
||||
write_imagef(output, coord, dst); \
|
||||
write_imagef(output, coord, dst.xxxx); \
|
||||
}
|
||||
ELTWISE_UNARY_F32_2D(sin)
|
||||
ELTWISE_UNARY_F32_2D(exp)
|
||||
|
|
@ -126,6 +174,8 @@ ELTWISE_UNARY_F32_2D(neg)
|
|||
ELTWISE_UNARY_F32_2D(mish)
|
||||
ELTWISE_UNARY_F32_2D(hard_sigmoid)
|
||||
ELTWISE_UNARY_F32_2D(round)
|
||||
ELTWISE_UNARY_F32_2D(gelu)
|
||||
ELTWISE_UNARY_F32_2D(hard_gelu)
|
||||
|
||||
#define ELTWISE_UNARY_U8(func_name) \
|
||||
__kernel void func_name##_U8toU8 \
|
||||
|
|
@ -144,7 +194,7 @@ __kernel void func_name##_U8toU8 \
|
|||
uint4 src = read_imageui(input, coord); \
|
||||
float4 data = convert_float4(src) * inputScale - inputTail; \
|
||||
\
|
||||
data = eltwise_unary_##func_name(data, alpha); \
|
||||
data.x = eltwise_unary_##func_name(data.x, alpha); \
|
||||
uint4 dst = convert_uint4(data * outputScale + outputZP); \
|
||||
\
|
||||
write_imageui(output, coord, dst); \
|
||||
|
|
@ -157,6 +207,8 @@ ELTWISE_UNARY_U8(neg)
|
|||
ELTWISE_UNARY_U8(mish)
|
||||
ELTWISE_UNARY_U8(hard_sigmoid)
|
||||
ELTWISE_UNARY_U8(round)
|
||||
ELTWISE_UNARY_U8(gelu)
|
||||
ELTWISE_UNARY_U8(hard_gelu)
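The U8 variants wrap every unary op in the same dequantize/requantize pattern: x = q*inputScale - inputTail before the op, q' = y*outputScale + outputZP after it. A hedged host-side sketch of that round trip (the scale/tail/zero-point values below are invented for illustration, and the explicit clamp-and-round replaces the kernel's convert_uint4 only to keep the C version well defined):

#include <math.h>
#include <stdio.h>

static unsigned char quantized_unary(unsigned char q,
                                     float inputScale, float inputTail,
                                     float outputScale, float outputZP,
                                     float (*op)(float))
{
    float x = (float)q * inputScale - inputTail;   /* dequantize                 */
    float y = op(x);                               /* e.g. gelu, mish, round ... */
    float r = y * outputScale + outputZP;          /* requantize                 */
    if (r < 0.0f) r = 0.0f;
    if (r > 255.0f) r = 255.0f;
    return (unsigned char)(r + 0.5f);
}

static float op_round(float x) { return rintf(x); }

int main(void)
{
    /* Example: input scale 0.1 / tail 12.8, output scale 10 / zero point 128. */
    unsigned char out = quantized_unary(200, 0.1f, 12.8f, 10.0f, 128.0f, op_round);
    printf("%u\n", out);   /* 200*0.1 - 12.8 = 7.2 -> 7 -> 7*10 + 128 = 198 */
    return 0;
}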
|
||||
|
||||
#define ELTWISE_UNARY_U8_2D(func_name) \
|
||||
__kernel void func_name##_U8toU8_2D \
|
||||
|
|
@ -175,7 +227,7 @@ __kernel void func_name##_U8toU8_2D \
|
|||
uint4 src = read_imageui(input, coord); \
|
||||
float4 data = convert_float4(src) * inputScale - inputTail; \
|
||||
\
|
||||
data = eltwise_unary_##func_name(data, alpha); \
|
||||
data.x = eltwise_unary_##func_name(data.x, alpha); \
|
||||
uint4 dst = convert_uint4(data * outputScale + outputZP); \
|
||||
\
|
||||
write_imageui(output, coord, dst); \
|
||||
|
|
@ -188,6 +240,8 @@ ELTWISE_UNARY_U8_2D(neg)
|
|||
ELTWISE_UNARY_U8_2D(mish)
|
||||
ELTWISE_UNARY_U8_2D(hard_sigmoid)
|
||||
ELTWISE_UNARY_U8_2D(round)
|
||||
ELTWISE_UNARY_U8_2D(gelu)
|
||||
ELTWISE_UNARY_U8_2D(hard_gelu)
|
||||
|
||||
__kernel void neg_I32toI32
|
||||
(
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#define MUL2_RSQRTPI (1.1283791670955126f)
|
||||
float eltwise_unary_erf(float x)
|
||||
float eltwise_unary_erf(float _x)
|
||||
{
|
||||
float x = clamp(_x, -2, 2);
|
||||
float res = 0;
|
||||
float tmp = x;
|
||||
float factorial = 1;
|
||||
|
|
|
|||
|
|
@ -6,8 +6,8 @@ __kernel void floordiv_F32F32toF32(
|
|||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
|
||||
float4 src0;
|
||||
float4 src1;
|
||||
readImage2DArray(src0, input, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEF_2DARRAY(src0, input, coord);
|
||||
READ_IMAGEF_2DARRAY(src1, input1, coord);
|
||||
float4 dst = floor(src0 / src1);
|
||||
write_imagef(output, coord, dst);
|
||||
}
|
||||
|
|
@ -32,8 +32,8 @@ __kernel void floordiv_I32I32toI32(
|
|||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
|
||||
int4 src0;
|
||||
int4 src1;
|
||||
readImage2DArray(src0, input, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEI_2DARRAY(src0, input, coord);
|
||||
READ_IMAGEI_2DARRAY(src1, input1, coord);
|
||||
int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1)));
|
||||
write_imagei(output, coord, dst);
|
||||
}
|
||||
|
|
@ -64,8 +64,8 @@ __kernel void floordiv_I32I32toU8(
|
|||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
|
||||
int4 src0;
|
||||
int4 src1;
|
||||
readImage2DArray(src0, input, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEI_2DARRAY(src0, input, coord);
|
||||
READ_IMAGEI_2DARRAY(src1, input1, coord);
|
||||
uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail);
|
||||
write_imageui(output, coord, dst);
|
||||
}
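All floordiv variants compute floor(a / b) in floating point rather than using C/OpenCL integer division, which truncates toward zero; the two disagree for negative operands. A short C check of the difference:

#include <math.h>
#include <stdio.h>

static int floordiv_i32(int a, int b)
{
    return (int)floorf((float)a / (float)b);
}

int main(void)
{
    printf("trunc: %d  floor: %d\n", -7 / 2, floordiv_i32(-7, 2));   /* -3 vs -4 */
    return 0;
}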
|
||||
|
|
@ -102,8 +102,8 @@ __kernel void floordiv_U8U8toU8(
|
|||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
|
||||
uint4 src0, src1;
|
||||
float4 in0, in1, out;
|
||||
readImage2DArray(src0, input, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEUI_2DARRAY(src0, input, coord);
|
||||
READ_IMAGEUI_2DARRAY(src1, input1, coord);
|
||||
in0 = convert_float4(src0) * input0Scale + input0Tail;
|
||||
in1 = convert_float4(src1) * input1Scale + input1Tail;
|
||||
out = floor(in0 / in1) * outputScale + outputTail;
|
||||
|
|
@ -148,8 +148,8 @@ __kernel void floordiv_U8I32toU8(
|
|||
uint4 src0;
|
||||
int4 src1;
|
||||
float4 in0, in1, out;
|
||||
readImage2DArray(src0, input, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEUI_2DARRAY(src0, input, coord);
|
||||
READ_IMAGEI_2DARRAY(src1, input1, coord);
|
||||
in0 = convert_float4(src0) * input0Scale + input0Tail;
|
||||
in1 = convert_float4(src1);
|
||||
out = floor(in0 / in1) * outputScale + outputTail;
|
||||
|
|
|
|||
|
|
@ -7,8 +7,8 @@ __kernel void logical_##name##_I8toI8( \
|
|||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
|
||||
int4 src0; \
|
||||
int4 src1; \
|
||||
readImage2DArray(src0, input, coord); \
|
||||
readImage2DArray(src1, input1, coord); \
|
||||
READ_IMAGEI_2DARRAY(src0, input, coord); \
|
||||
READ_IMAGEI_2DARRAY(src1, input1, coord); \
|
||||
int4 dst = (lgc_op2(src0))lgc_op(lgc_op2(src1)); \
|
||||
dst.x = dst.x & 1; \
|
||||
write_imagei(output, coord, dst); \
|
||||
|
|
|
|||
|
|
@ -15,8 +15,8 @@ __kernel void maximum_FP32FP32toFP32
|
|||
|
||||
float4 src0;
|
||||
float4 src1;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEF_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEF_2DARRAY(src1, input1, coord);
|
||||
|
||||
float4 dst = src0 > src1 ? src0 : src1;
|
||||
|
||||
|
|
@ -63,8 +63,8 @@ __kernel void maximum_U8U8toU8
|
|||
|
||||
uint4 src0;
|
||||
uint4 src1;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEUI_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEUI_2DARRAY(src1, input1, coord);
|
||||
|
||||
float4 data0 = convert_float4(src0) * input0Scale - input0Tail;
|
||||
float4 data1 = convert_float4(src1) * input1Scale - input1Tail;
|
||||
|
|
@ -118,8 +118,8 @@ __kernel void maximum_I32I32toI32
|
|||
|
||||
int4 src0;
|
||||
int4 src1;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEI_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEI_2DARRAY(src1, input1, coord);
|
||||
|
||||
int4 dst = src0 > src1 ? src0 : src1;
|
||||
|
||||
|
|
|
|||
|
|
@ -15,8 +15,8 @@ __kernel void minimum_FP32FP32toFP32
|
|||
|
||||
float4 src0;
|
||||
float4 src1;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEF_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEF_2DARRAY(src1, input1, coord);
|
||||
|
||||
float4 dst = src0 < src1 ? src0 : src1;
|
||||
|
||||
|
|
@ -63,8 +63,8 @@ __kernel void minimum_U8U8toU8
|
|||
|
||||
uint4 src0;
|
||||
uint4 src1;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEUI_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEUI_2DARRAY(src1, input1, coord);
|
||||
|
||||
float4 data0 = convert_float4(src0) * input0Scale - input0Tail;
|
||||
float4 data1 = convert_float4(src1) * input1Scale - input1Tail;
|
||||
|
|
@ -118,8 +118,8 @@ __kernel void minimum_I32I32toI32
|
|||
|
||||
int4 src0;
|
||||
int4 src1;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEI_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEI_2DARRAY(src1, input1, coord);
|
||||
|
||||
int4 dst = src0 < src1 ? src0 : src1;
|
||||
|
||||
|
|
|
|||
|
|
@ -9,8 +9,8 @@ __kernel void pow_FP32FP32toFP32
|
|||
|
||||
float4 src0, src1;
|
||||
float4 dst;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEF_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEF_2DARRAY(src1, input1, coord);
|
||||
|
||||
float4 s0 = sign(src0);
|
||||
int4 t0 = convert_int4(src1) & 1;
|
||||
|
|
|
|||
|
|
@ -15,8 +15,8 @@ __kernel void prelu_FP32FP32toFP32
|
|||
|
||||
float4 src0;
|
||||
float4 src1;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEF_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEF_2DARRAY(src1, input1, coord);
|
||||
|
||||
float4 maxData = src0 >= 0 ? src0 : 0;
|
||||
float4 minData = src0 < 0 ? src0 : 0;
|
||||
|
|
@ -67,8 +67,8 @@ __kernel void prelu_U8U8toU8
|
|||
|
||||
uint4 src0;
|
||||
uint4 src1;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEUI_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEUI_2DARRAY(src1, input1, coord);
|
||||
|
||||
float4 data0 = convert_float4(src0) * input0Scale - input0Tail;
|
||||
float4 data1 = convert_float4(src1) * input1Scale - input1Tail;
|
||||
|
|
@ -130,8 +130,8 @@ __kernel void prelu_I32I32toI32
|
|||
|
||||
int4 src0;
|
||||
int4 src1;
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEI_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEI_2DARRAY(src1, input1, coord);
|
||||
|
||||
float4 data0 = convert_float4(src0) * input0Scale - input0Tail;
|
||||
float4 data1 = convert_float4(src1) * input1Scale - input1Tail;
|
||||
|
|
|
|||
|
|
@ -1,14 +1,5 @@
|
|||
#pragma OPENCL EXTENSION CL_VIV_asm : enable
|
||||
|
||||
inline uchar* get_image2D_array_ptr(image2d_array_t input)
|
||||
{
|
||||
int8 desc;
|
||||
_viv_asm(COPY, desc, input, sizeof(desc));
|
||||
uchar *src_ptr = (uchar*)desc.s0;
|
||||
|
||||
return src_ptr;
|
||||
}
|
||||
|
||||
uint4 _philox4x32bumpkey(uint4 key)
|
||||
{
|
||||
uint4 mask = (uint4)((uint)0x9E3779B9, (uint)0xBB67AE85, 0, 0);
|
||||
|
|
@ -61,14 +52,16 @@ __kernel void random_seed(
|
|||
float re_rand_max
|
||||
)
|
||||
{
|
||||
__global uint* seeds_ptr = (__global uint*)get_image2D_array_ptr(seeds);
|
||||
Tensor s_tensor = create_tensor_from_image2d_array(seeds, 4);
|
||||
__global uint* seeds_ptr = (__global uint*)s_tensor.ptr;
|
||||
seeds_ptr = seeds_ptr;
|
||||
uint4 key = vload4(0, seeds_ptr);
|
||||
|
||||
uint4 ctr = (uint4)(0);
|
||||
float4 result = 0;
|
||||
|
||||
__global float* output_ptr = (__global float*)get_image2D_array_ptr(output);
|
||||
Tensor o_tensor = create_tensor_from_image2d_array(output, 4);
|
||||
__global float* output_ptr = (__global float*)o_tensor.ptr;
|
||||
|
||||
for(int i = 0; i < iter; i++)
|
||||
{
|
||||
|
|
@ -152,17 +145,20 @@ __kernel void random_multinomial
|
|||
int class_size = get_image_width(cdfs);
|
||||
|
||||
int offset = gidy * class_size;
|
||||
__global float* cdf_ptr = (__global float*)get_image2D_array_ptr(cdfs);
|
||||
Tensor cdf_tensor = create_tensor_from_image2d_array(cdfs, 4);
|
||||
__global float* cdf_ptr = (__global float*)cdf_tensor.ptr;
|
||||
__global float* cdfPtr = cdf_ptr + offset;
|
||||
|
||||
int width = get_image_width(randoms);
|
||||
offset = coord.x + coord.y * width;
|
||||
__global float* randoms_ptr = (__global float*)get_image2D_array_ptr(randoms);
|
||||
Tensor r_tensor = create_tensor_from_image2d_array(randoms, 4);
|
||||
__global float* randoms_ptr = (__global float*)r_tensor.ptr;
|
||||
randoms_ptr = randoms_ptr + offset;
|
||||
|
||||
width = get_image_width(output);
|
||||
offset = coord.x + coord.y * width;
|
||||
__global uint* output_ptr = (__global uint*)get_image2D_array_ptr(output);
|
||||
Tensor o_tensor = create_tensor_from_image2d_array(output, 4);
|
||||
__global uint* output_ptr = (__global uint*)o_tensor.ptr;
|
||||
output_ptr = output_ptr + offset;
|
||||
|
||||
float4 ran = vload4(0, randoms_ptr);
|
||||
|
|
|
|||
|
|
@ -15,8 +15,8 @@ __kernel void func_name##_F32F32toBOOL8 \
|
|||
\
|
||||
float4 src0; \
|
||||
float4 src1; \
|
||||
readImage2DArray(src0, input0, coord); \
|
||||
readImage2DArray(src1, input1, coord); \
|
||||
READ_IMAGEF_2DARRAY(src0, input0, coord); \
|
||||
READ_IMAGEF_2DARRAY(src1, input1, coord); \
|
||||
\
|
||||
int4 dst = (src0)comp_op(src1); \
|
||||
dst &= 1; \
|
||||
|
|
@ -75,8 +75,8 @@ __kernel void func_name##_U32U32toBOOL8 \
|
|||
\
|
||||
uint4 data0; \
|
||||
uint4 data1; \
|
||||
readImage2DArray(data0, input0, coord); \
|
||||
readImage2DArray(data1, input1, coord); \
|
||||
READ_IMAGEUI_2DARRAY(data0, input0, coord); \
|
||||
READ_IMAGEUI_2DARRAY(data1, input1, coord); \
|
||||
\
|
||||
float4 src0 = convert_float4(data0) * input0Scale - input0Tail; \
|
||||
float4 src1 = convert_float4(data1) * input1Scale - input1Tail; \
|
||||
|
|
@ -139,8 +139,8 @@ __kernel void func_name##_I32I32toBOOL8 \
|
|||
\
|
||||
int4 src0; \
|
||||
int4 src1; \
|
||||
readImage2DArray(src0, input0, coord); \
|
||||
readImage2DArray(src1, input1, coord); \
|
||||
READ_IMAGEI_2DARRAY(src0, input0, coord); \
|
||||
READ_IMAGEI_2DARRAY(src1, input1, coord); \
|
||||
\
|
||||
int4 dst = (src0)comp_op(src1); \
|
||||
dst &= 1; \
|
||||
|
|
|
|||
|
|
@ -0,0 +1,45 @@
|
|||
|
||||
#define SCATTER_ND_UPDATE(src0_type, data_type, read_func, write_func) \
|
||||
__kernel void scatter_nd_update_##src0_type##src0_type##to##src0_type( \
|
||||
__read_only image2d_t input0, \
|
||||
__read_only image2d_t input1, \
|
||||
__read_only image2d_t input2, \
|
||||
__write_only image2d_t output, \
|
||||
int offsetX, \
|
||||
int offsetY, \
|
||||
int offsetZ, \
|
||||
int offsetW, \
|
||||
int offset_idx, \
|
||||
int coord_dim, \
|
||||
int index_num \
|
||||
) \
|
||||
{ \
|
||||
int gidx = get_global_id(0); \
|
||||
int gidy = get_global_id(1); \
|
||||
int cnt = 0; \
|
||||
\
|
||||
data_type sum = (data_type)(0, 0, 0, 0); \
|
||||
Image img1 = create_image_from_image2d(input1, 4); \
|
||||
__global int* index_ptr = (__global int*)img1.ptr; \
|
||||
for(int i = 0; i < index_num; i++) \
|
||||
{ \
|
||||
int4 indice = vload4(0, index_ptr + offset_idx); \
|
||||
index_ptr += coord_dim; \
|
||||
int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; \
|
||||
if(gidy == idx) \
|
||||
{ \
|
||||
data_type data = read_func(input2, (int2)(gidx, i)); \
|
||||
cnt++; \
|
||||
sum += data; \
|
||||
} \
|
||||
} \
|
||||
int2 coord = (int2)(gidx, gidy); \
|
||||
if(cnt == 0) \
|
||||
{ \
|
||||
sum = read_func(input0, coord); \
|
||||
} \
|
||||
write_func(output, coord, sum); \
|
||||
}
|
||||
SCATTER_ND_UPDATE(U32, uint4, read_imageui, write_imageui)
|
||||
SCATTER_ND_UPDATE(I32, int4, read_imagei, write_imagei)
|
||||
SCATTER_ND_UPDATE(F32, float4, read_imagef, write_imagef)
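In words, scatter_nd_update_* rebuilds each output element: rows whose flattened index matches an entry of input1 receive the sum of the corresponding rows of input2 (duplicated indices accumulate), and unmatched rows are copied through from input0. A host-side C sketch of the same semantics for 1-D indices over the outer dimension (array layout and names are illustrative assumptions):

#include <stdio.h>

static void scatter_nd_update_rows(const float *input, const int *indices,
                                   const float *updates, float *output,
                                   int rows, int cols, int index_num)
{
    int r, c, i;
    for (r = 0; r < rows; r++)
    {
        for (c = 0; c < cols; c++)
        {
            float sum = 0.0f;
            int cnt = 0;
            for (i = 0; i < index_num; i++)
            {
                if (indices[i] == r)          /* same test as gidy == idx     */
                {
                    sum += updates[i * cols + c];
                    cnt++;                     /* duplicates accumulate        */
                }
            }
            output[r * cols + c] = (cnt == 0) ? input[r * cols + c] : sum;
        }
    }
}

int main(void)
{
    float in[3 * 2]  = {1, 1,  2, 2,  3, 3};
    int   idx[2]     = {0, 2};
    float upd[2 * 2] = {9, 9,  8, 8};
    float out[3 * 2];
    int r;
    scatter_nd_update_rows(in, idx, upd, out, 3, 2, 2);
    for (r = 0; r < 3; r++) printf("%g %g\n", out[r * 2], out[r * 2 + 1]);
    /* prints: 9 9 / 2 2 / 8 8 */
    return 0;
}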
|
||||
|
|
@ -12,9 +12,9 @@ __kernel void select_I8_U8_U8toU8(
|
|||
int4 value;
|
||||
uint4 src0, src1, src, dst;
|
||||
float inputScale, inputTail;
|
||||
readImage2DArray(value, condition, coord);
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEI_2DARRAY(value, condition, coord);
|
||||
READ_IMAGEF_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEF_2DARRAY(src1, input1, coord);
|
||||
src = (value != 0 ? src0 : src1);
|
||||
inputScale = (value.x != 0 ? input0Scale : input1Scale);
|
||||
inputTail = (value.x != 0 ? input0Tail : input1Tail);
|
||||
|
|
@ -56,9 +56,9 @@ __kernel void select_I8_I32_I32toI32(
|
|||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
|
||||
int4 value;
|
||||
int4 src0, src1, dst;
|
||||
readImage2DArray(value, condition, coord);
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEI_2DARRAY(value, condition, coord);
|
||||
READ_IMAGEI_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEI_2DARRAY(src1, input1, coord);
|
||||
dst = (value != 0 ? src0 : src1);
|
||||
write_imagei(output, coord, dst);
|
||||
}
|
||||
|
|
@ -94,9 +94,9 @@ __kernel void select_I8_F32_F32toF32(
|
|||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
|
||||
int4 value;
|
||||
float4 src0, src1, dst;
|
||||
readImage2DArray(value, condition, coord);
|
||||
readImage2DArray(src0, input0, coord);
|
||||
readImage2DArray(src1, input1, coord);
|
||||
READ_IMAGEI_2DARRAY(value, condition, coord);
|
||||
READ_IMAGEF_2DARRAY(src0, input0, coord);
|
||||
READ_IMAGEF_2DARRAY(src1, input1, coord);
|
||||
dst = (value != 0 ? src0 : src1);
|
||||
write_imagef(output, coord, dst);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,21 @@
|
|||
|
||||
#define SIGNAL_FRAME_SH_IMPL(type, data_type, read_imagefunc, write_imagefunc) \
|
||||
__kernel void signal_frame_##type##to##type \
|
||||
( \
|
||||
__read_only image2d_t input, \
|
||||
__write_only image2d_array_t output, \
|
||||
int frame_step \
|
||||
) \
|
||||
{ \
|
||||
int inner = get_global_id(0); \
|
||||
int length_k = get_global_id(1); \
|
||||
int frames_id = get_global_id(2); \
|
||||
\
|
||||
int4 coord = (int4)(inner, length_k, frames_id, frames_id); \
|
||||
int2 coord_in = (int2)(inner, frames_id * frame_step + length_k); \
|
||||
\
|
||||
data_type src = read_imagefunc(input, coord_in); \
|
||||
write_imagefunc(output, coord, src); \
|
||||
}
|
||||
SIGNAL_FRAME_SH_IMPL(F32, float4, read_imagef, write_imagef)
|
||||
SIGNAL_FRAME_SH_IMPL(U8, uint4, read_imageui, write_imageui)
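signal_frame_* simply gathers output[frame][k] = input[frame * frame_step + k], so consecutive frames overlap whenever frame_step is smaller than the frame length. A plain C sketch with illustrative sizes:

#include <stdio.h>

static void signal_frame(const float *input, float *output,
                         int frame_length, int frame_step, int num_frames)
{
    int f, k;
    for (f = 0; f < num_frames; f++)
        for (k = 0; k < frame_length; k++)
            output[f * frame_length + k] = input[f * frame_step + k];
}

int main(void)
{
    float x[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    float frames[3 * 4];
    int f;
    signal_frame(x, frames, 4, 2, 3);   /* 3 frames of length 4, hop 2 */
    for (f = 0; f < 3; f++)
        printf("%g %g %g %g\n", frames[f * 4], frames[f * 4 + 1],
               frames[f * 4 + 2], frames[f * 4 + 3]);
    return 0;
}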
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
#define TILE_3D(name0, name1, data_type, write_image_func) \
|
||||
#define TILE_3D(name0, name1, data_type, read_image_func, write_image_func) \
|
||||
__kernel void tile_##name0##to##name1 \
|
||||
( \
|
||||
__read_only image2d_array_t input, \
|
||||
|
|
@ -19,7 +19,7 @@ __kernel void tile_##name0##to##name1 \
|
|||
int height = get_image_height(input); \
|
||||
\
|
||||
data_type src; \
|
||||
readImage2DArray(src, input, coord); \
|
||||
read_image_func(src, input, coord); \
|
||||
\
|
||||
int batch_id = (short)coord.z / (short)depthIn; \
|
||||
coord.z = (short)coord.z % (short)depthIn; \
|
||||
|
|
@ -46,11 +46,11 @@ __kernel void tile_##name0##to##name1 \
|
|||
} \
|
||||
} \
|
||||
}
|
||||
TILE_3D(I32, I32, int4, write_imagei)
|
||||
TILE_3D(U32, U32, uint4, write_imageui)
|
||||
TILE_3D(F32, F32, float4, write_imagef)
|
||||
TILE_3D(I32, I32, int4, READ_IMAGEI_2DARRAY, write_imagei)
|
||||
TILE_3D(U32, U32, uint4, READ_IMAGEUI_2DARRAY, write_imageui)
|
||||
TILE_3D(F32, F32, float4, READ_IMAGEF_2DARRAY, write_imagef)
|
||||
|
||||
#define TILE_2D(name0, name1, data_type) \
|
||||
#define TILE_2D(name0, name1, data_type, read_image_func, write_image_func) \
|
||||
__kernel void tile_##name0##to##name1##_2D \
|
||||
( \
|
||||
__read_only image2d_t input, \
|
||||
|
|
@ -70,23 +70,22 @@ __kernel void tile_##name0##to##name1##_2D \
|
|||
int output_width = get_image_width(output); \
|
||||
int output_height = get_image_height(output); \
|
||||
\
|
||||
data_type src; \
|
||||
readImage(src, input, coord); \
|
||||
data_type src = read_image_func(input, coord); \
|
||||
\
|
||||
do \
|
||||
{ \
|
||||
do \
|
||||
{ \
|
||||
writeImage(output, coord, src); \
|
||||
write_image_func(output, coord, src); \
|
||||
coord.x += width; \
|
||||
} while (coord.x < output_width); \
|
||||
coord.x = get_global_id(0); \
|
||||
coord.y += height; \
|
||||
} while (coord.y < output_height); \
|
||||
}
|
||||
TILE_2D(I32, I32, int4)
|
||||
TILE_2D(U32, U32, uint4)
|
||||
TILE_2D(F32, F32, float4)
|
||||
TILE_2D(I32, I32, int4, read_imagei, write_imagei)
|
||||
TILE_2D(U32, U32, uint4, read_imageui, write_imageui)
|
||||
TILE_2D(F32, F32, float4, read_imagef, write_imagef)
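The tile kernels replicate the input across the output: each source element is written to every position offset by whole multiples of the input width/height, using the nested do/while loops above. A host-side C sketch of the 2-D case:

#include <stdio.h>

static void tile_2d(const float *in, int w, int h, float *out, int ow, int oh)
{
    int x, y;
    for (y = 0; y < h; y++)
        for (x = 0; x < w; x++)
        {
            int oy = y;
            do {
                int ox = x;
                do {
                    out[oy * ow + ox] = in[y * w + x];   /* copy to this tile */
                    ox += w;
                } while (ox < ow);
                oy += h;
            } while (oy < oh);
        }
}

int main(void)
{
    float in[2 * 2] = {1, 2, 3, 4};
    float out[4 * 4];
    int y;
    tile_2d(in, 2, 2, out, 4, 4);
    for (y = 0; y < 4; y++)
        printf("%g %g %g %g\n", out[y * 4], out[y * 4 + 1],
               out[y * 4 + 2], out[y * 4 + 3]);
    return 0;
}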
@ -1,511 +0,0 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "vsi_nn_platform.h"
|
||||
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_test.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
#include "utils/vsi_nn_math.h"
|
||||
#include "utils/vsi_nn_link_list.h"
|
||||
#include "libnnext/vsi_nn_vxkernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
#define _VX_KERNEL_VAR (vx_kernel_BOX_WITH_NMS_LIMIT)
|
||||
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_BOX_WITH_NMS_LIMIT)
|
||||
#define _VX_KERNEL_NAME (VX_KERNEL_NAME_BOX_WITH_NMS_LIMIT)
|
||||
#define _VX_KERNEL_FUNC_KERNEL (vxBox_with_nms_limitKernel)
|
||||
|
||||
static float hard_nms_kernel
|
||||
(
|
||||
float iou,
|
||||
float iouThreshold
|
||||
)
|
||||
{
|
||||
return iou < iouThreshold ? 1.0f : 0.0f;
|
||||
}
|
||||
|
||||
static float linear_nms_kernel
|
||||
(
|
||||
float iou,
|
||||
float iouThreshold
|
||||
)
|
||||
{
|
||||
return iou < iouThreshold ? 1.0f : 1.0f - iou;
|
||||
}
|
||||
|
||||
static float gaussian_nms_kernel
|
||||
(
|
||||
float iou,
|
||||
float sigma
|
||||
)
|
||||
{
|
||||
return (float)(exp(-1.0f * iou * iou / sigma));
|
||||
}
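The three kernels above are the usual NMS score-decay choices: hard NMS zeroes a score once IoU reaches the threshold, linear soft-NMS scales it by (1 - IoU), and gaussian soft-NMS by exp(-IoU^2 / sigma). A quick standalone C comparison of the weights they produce:

#include <math.h>
#include <stdio.h>

static float hard_k(float iou, float thr)    { return iou < thr ? 1.0f : 0.0f; }
static float linear_k(float iou, float thr)  { return iou < thr ? 1.0f : 1.0f - iou; }
static float gauss_k(float iou, float sigma) { return expf(-iou * iou / sigma); }

int main(void)
{
    float iou;
    for (iou = 0.0f; iou <= 0.9f; iou += 0.3f)
        printf("iou=%.1f  hard=%.2f  linear=%.2f  gaussian=%.2f\n",
               iou, hard_k(iou, 0.5f), linear_k(iou, 0.5f), gauss_k(iou, 0.5f));
    return 0;
}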
|
||||
|
||||
void swap_element
|
||||
(
|
||||
uint32_t* list,
|
||||
uint32_t first,
|
||||
uint32_t second
|
||||
)
|
||||
{
|
||||
uint32_t temp = list[first];
|
||||
list[first] = list[second];
|
||||
list[second] = temp;
|
||||
}
|
||||
|
||||
uint32_t max_element
|
||||
(
|
||||
float* data,
|
||||
uint32_t* index_list,
|
||||
uint32_t len
|
||||
)
|
||||
{
|
||||
uint32_t i;
|
||||
uint32_t max_index = 0;
|
||||
float max_val = data[index_list[0]];
|
||||
for(i = 1; i < len; i++)
|
||||
{
|
||||
float val = data[index_list[i]];
|
||||
if (max_val < val)
|
||||
{
|
||||
max_val = val;
|
||||
max_index = i;
|
||||
}
|
||||
}
|
||||
return max_index;
|
||||
}
|
||||
|
||||
static uint32_t max_comp_func
|
||||
(
|
||||
void* data,
|
||||
int32_t left,
|
||||
int32_t right
|
||||
)
|
||||
{
|
||||
float* fdata = (float*)data;
|
||||
return fdata[left] >= fdata[right];
|
||||
}
|
||||
|
||||
void sort_element_by_score
|
||||
(
|
||||
float* data,
|
||||
uint32_t* index_list,
|
||||
uint32_t len
|
||||
)
|
||||
{
|
||||
vsi_nn_partition(data, 0, len - 1, max_comp_func, TRUE, index_list);
|
||||
}
|
||||
|
||||
typedef struct
|
||||
{
|
||||
float* fdata;
|
||||
uint32_t numClasses;
|
||||
} class_comp_param;
|
||||
|
||||
static uint32_t class_comp_func
|
||||
(
|
||||
void* data,
|
||||
int32_t left,
|
||||
int32_t right
|
||||
)
|
||||
{
|
||||
class_comp_param *p = (class_comp_param*)data;
|
||||
float* fdata = p->fdata;
|
||||
uint32_t numClasses = p->numClasses;
|
||||
uint32_t lhsClass = left % numClasses, rhsClass = right % numClasses;
|
||||
return lhsClass == rhsClass ? fdata[left] > fdata[right]
|
||||
: lhsClass < rhsClass;
|
||||
}
|
||||
|
||||
static void sort_element_by_class
|
||||
(
|
||||
float* data,
|
||||
uint32_t* index_list,
|
||||
uint32_t len,
|
||||
uint32_t numClasses
|
||||
)
|
||||
{
|
||||
class_comp_param class_comp;
|
||||
class_comp.fdata = data;
|
||||
class_comp.numClasses = numClasses;
|
||||
vsi_nn_partition(&class_comp, 0, len - 1, class_comp_func, TRUE, index_list);
|
||||
}
|
||||
|
||||
// Taking two bounding boxes (each as [x1, y1, x2, y2]), return the intersection-over-union.
|
||||
float getIoUAxisAligned
|
||||
(
|
||||
const float* roi1,
|
||||
const float* roi2
|
||||
)
|
||||
{
|
||||
const float area1 = (roi1[2] - roi1[0]) * (roi1[3] - roi1[1]);
|
||||
const float area2 = (roi2[2] - roi2[0]) * (roi2[3] - roi2[1]);
|
||||
const float x1 = vsi_nn_max(roi1[0], roi2[0]);
|
||||
const float x2 = vsi_nn_min(roi1[2], roi2[2]);
|
||||
const float y1 = vsi_nn_max(roi1[1], roi2[1]);
|
||||
const float y2 = vsi_nn_min(roi1[3], roi2[3]);
|
||||
const float w = vsi_nn_max(x2 - x1, 0.0f);
|
||||
const float h = vsi_nn_max(y2 - y1, 0.0f);
|
||||
const float areaIntersect = w * h;
|
||||
const float areaUnion = area1 + area2 - areaIntersect;
|
||||
return areaIntersect / areaUnion;
|
||||
}
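Usage sketch for getIoUAxisAligned (it relies on the function defined just above; the wrapper name and values are illustrative): two unit squares overlapping by half have intersection 0.5 and union 1.5, so the IoU is 1/3.

static void iou_usage_example(void)
{
    float roi_a[4] = {0.0f, 0.0f, 1.0f, 1.0f};
    float roi_b[4] = {0.5f, 0.0f, 1.5f, 1.0f};
    float iou = getIoUAxisAligned(roi_a, roi_b);   /* 0.5f / 1.5f = 0.333... */
    (void)iou;
}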
|
||||
|
||||
static vsi_status VX_CALLBACK vxBox_with_nms_limitKernel
|
||||
(
|
||||
vx_node node,
|
||||
const vx_reference* paramObj,
|
||||
uint32_t paramNum
|
||||
)
|
||||
{
|
||||
#define ARG_NUM (5)
|
||||
#define TENSOR_NUM_INPUT (3)
|
||||
#define TENSOR_NUM_OUTPUT (4)
|
||||
#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT)
|
||||
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vx_context context = NULL;
|
||||
vx_tensor input[TENSOR_NUM_INPUT] = {0};
|
||||
vx_tensor output[TENSOR_NUM_OUTPUT] = {0};
|
||||
float *f32_in_buffer[TENSOR_NUM_INPUT] = {0};
|
||||
int32_t* int32_in_buffer[TENSOR_NUM_INPUT] = {0};
|
||||
float *f32_out_buffer[TENSOR_NUM_OUTPUT] = {0};
|
||||
int32_t* int32_out_buffer[TENSOR_NUM_OUTPUT] = {0};
|
||||
vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT];
|
||||
vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT];
|
||||
uint32_t in_elements[TENSOR_NUM_INPUT] = {0};
|
||||
uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0};
|
||||
|
||||
float scoreThreshold;
|
||||
int32_t maxNumDetections;
|
||||
int32_t nms_kernel_method;
|
||||
float iou_threshold;
|
||||
float sigma;
|
||||
float nms_score_threshold;
|
||||
|
||||
uint32_t i = 0;
|
||||
for(i = 0; i < TENSOR_NUM_INPUT; i++)
|
||||
{
|
||||
memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t));
|
||||
}
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
|
||||
{
|
||||
memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t));
|
||||
}
|
||||
/* prepare data */
|
||||
context = vxGetContext((vx_reference)node);
|
||||
|
||||
for(i = 0; i < TENSOR_NUM_INPUT; i ++)
|
||||
{
|
||||
input[i] = (vx_tensor)paramObj[i];
|
||||
status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]);
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]);
|
||||
if (i == 2)
|
||||
{
|
||||
int32_in_buffer[i] = (int32_t *)vsi_nn_vxCopyTensorToData(context,
|
||||
input[i], &in_attr[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float));
|
||||
status = vsi_nn_vxConvertTensorToFloat32Data(
|
||||
context, input[i], &in_attr[i], f32_in_buffer[i],
|
||||
in_elements[i] * sizeof(float));
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
}
|
||||
}
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i ++)
|
||||
{
|
||||
output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT];
|
||||
status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]);
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]);
|
||||
if (i < 2)
|
||||
{
|
||||
f32_out_buffer[i] = (float *)malloc(out_elements[i] * sizeof(float));
|
||||
memset(f32_out_buffer[i], 0, out_elements[i] * sizeof(float));
|
||||
}
|
||||
else
|
||||
{
|
||||
int32_out_buffer[i] = (int32_t *)malloc(out_elements[i] * sizeof(int32_t));
|
||||
memset(int32_out_buffer[i], 0, out_elements[i] * sizeof(int32_t));
|
||||
}
|
||||
}
|
||||
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(scoreThreshold),
|
||||
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(maxNumDetections),
|
||||
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 2], &(nms_kernel_method),
|
||||
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 3], &(iou_threshold),
|
||||
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 4], &(sigma),
|
||||
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 5], &(nms_score_threshold),
|
||||
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
/* TODO: Add CPU kernel implementation */
|
||||
{
|
||||
uint32_t j, n, b, c;
|
||||
const uint32_t kRoiDim = 4;
|
||||
uint32_t numRois = in_attr[0].size[1];
|
||||
uint32_t numClasses = in_attr[0].size[0];
|
||||
int32_t ind;
|
||||
|
||||
uint32_t * batch_data = (uint32_t*)malloc(numRois * sizeof(uint32_t));
|
||||
int32_t numBatch = 0;
|
||||
uint32_t * select = NULL;
|
||||
uint32_t select_size = 0;
|
||||
uint32_t scores_index = 0;
|
||||
uint32_t roi_index = 0;
|
||||
uint32_t roi_out_index = 0;
|
||||
|
||||
memset(batch_data, 0, numRois * sizeof(uint32_t));
|
||||
for (i = 0, ind = -1; i < numRois; i++)
|
||||
{
|
||||
if (int32_in_buffer[2][i] != ind)
|
||||
{
|
||||
ind = int32_in_buffer[2][i];
|
||||
numBatch++;
|
||||
}
|
||||
batch_data[numBatch - 1]++;
|
||||
}
|
||||
select = (uint32_t*)malloc(numBatch * numRois
|
||||
* numClasses * sizeof(uint32_t));
|
||||
memset(select, 0, numBatch * numRois * numClasses * sizeof(uint32_t));
|
||||
for (n = 0; n < (uint32_t)numBatch; n++)
|
||||
{
|
||||
int32_t numDetections_batch = 0;
|
||||
uint32_t select_start_batch = select_size;
|
||||
uint32_t select_len = 0;
|
||||
// Exclude class 0 (background)
|
||||
for (c = 1; c < numClasses; c++)
|
||||
{
|
||||
uint32_t select_start = select_size;
|
||||
int32_t maxNumDetections0 = maxNumDetections;
|
||||
uint32_t numDetections = 0;
|
||||
for (b = 0; b < batch_data[n]; b++)
|
||||
{
|
||||
uint32_t index = b * numClasses + c;
|
||||
float score = f32_in_buffer[0][scores_index + index];
|
||||
if (score > scoreThreshold) {
|
||||
select[select_size] = index;
|
||||
select_size++;
|
||||
}
|
||||
}
|
||||
select_len = select_size - select_start;
|
||||
|
||||
if (maxNumDetections0 < 0)
|
||||
{
|
||||
maxNumDetections0 = select_len;
|
||||
}
|
||||
|
||||
for (j = 0; (j < select_len && numDetections < (uint32_t)maxNumDetections0); j++)
|
||||
{
|
||||
// find max score and swap to the front.
|
||||
int32_t max_index = max_element(&(f32_in_buffer[0][scores_index]),
|
||||
&(select[select_start + j]), select_len - j) + j;
|
||||
|
||||
swap_element(&(select[select_start]), max_index, j);
|
||||
|
||||
// Calculate IoU of the rest, swap to the end (discard) if needed.
|
||||
for (i = j + 1; i < select_len; i++)
|
||||
{
|
||||
int32_t roiBase0 = roi_index + select[select_start + i] * kRoiDim;
|
||||
int32_t roiBase1 = roi_index + select[select_start + j] * kRoiDim;
|
||||
float iou = getIoUAxisAligned(&(f32_in_buffer[1][roiBase0]),
|
||||
&(f32_in_buffer[1][roiBase1]));
|
||||
float kernel_iou;
|
||||
if (nms_kernel_method == 0)
|
||||
{
|
||||
kernel_iou = hard_nms_kernel(iou, iou_threshold);
|
||||
}
|
||||
else if (nms_kernel_method == 1)
|
||||
{
|
||||
kernel_iou = linear_nms_kernel(iou, iou_threshold);
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel_iou = gaussian_nms_kernel(iou, sigma);
|
||||
|
||||
}
|
||||
f32_in_buffer[0][scores_index + select[select_start + i]] *= kernel_iou;
|
||||
if (f32_in_buffer[0][scores_index + select[select_start + i]] < nms_score_threshold)
|
||||
{
|
||||
swap_element(&(select[select_start]), i, select_len - 1);
|
||||
i--;
|
||||
select_len--;
|
||||
}
|
||||
}
|
||||
numDetections++;
|
||||
}
|
||||
select_size = select_start + select_len;
|
||||
numDetections_batch += numDetections;
|
||||
}
|
||||
|
||||
// Take top maxNumDetections.
|
||||
sort_element_by_score(&(f32_in_buffer[0][scores_index]), &(select[select_start_batch]),
|
||||
numDetections_batch);
|
||||
|
||||
if (numDetections_batch > maxNumDetections)
|
||||
{
|
||||
select_size = select_start_batch + maxNumDetections;
|
||||
}
|
||||
select_len = select_size - select_start_batch;
|
||||
// Sort again by class.
|
||||
sort_element_by_class(&(f32_in_buffer[0][scores_index]), &(select[select_start_batch]),
|
||||
select_len, numClasses);
|
||||
|
||||
for (i = 0; i < select_len; i++)
|
||||
{
|
||||
int32_t in_index0 = scores_index + select[select_start_batch + i];
|
||||
int32_t in_index1 = roi_index + select[select_start_batch + i] * kRoiDim;
|
||||
f32_out_buffer[0][roi_out_index] = f32_in_buffer[0][in_index0];
|
||||
memcpy(&(f32_out_buffer[1][roi_out_index * kRoiDim]),
|
||||
&f32_in_buffer[1][in_index1], kRoiDim * sizeof(float));
|
||||
int32_out_buffer[2][roi_out_index] = select[select_start_batch + i] % numClasses;
|
||||
int32_out_buffer[3][roi_out_index] = n;
|
||||
roi_out_index++;
|
||||
}
|
||||
|
||||
scores_index += batch_data[n] * numClasses;
|
||||
roi_index += batch_data[n] * numClasses * kRoiDim;
|
||||
}
|
||||
if (batch_data) free(batch_data);
|
||||
if (select) free(select);
|
||||
}
|
||||
|
||||
/* save data */
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
|
||||
{
|
||||
if (i < 2)
|
||||
{
|
||||
status = vsi_nn_vxConvertFloat32DataToTensor(
|
||||
context, output[i], &out_attr[i], f32_out_buffer[i],
|
||||
out_elements[i] * sizeof(float));
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
}
|
||||
else
|
||||
{
|
||||
vsi_nn_vxCopyDataToTensor(context, output[i], &out_attr[i],
|
||||
(uint8_t *)int32_out_buffer[i]);
|
||||
}
|
||||
}
|
||||
|
||||
final:
|
||||
for (i = 0; i < TENSOR_NUM_INPUT; i++)
|
||||
{
|
||||
if (f32_in_buffer[i]) free(f32_in_buffer[i]);
|
||||
if (int32_in_buffer[i]) free(int32_in_buffer[i]);
|
||||
}
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
|
||||
{
|
||||
if (f32_out_buffer[i]) free(f32_out_buffer[i]);
|
||||
if (int32_out_buffer[i]) free(int32_out_buffer[i]);
|
||||
}
|
||||
return status;
|
||||
} /* _VX_KERNEL_FUNC_KERNEL() */
|
||||
|
||||
static vx_param_description_t vxBox_with_nms_limitKernelParam[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
|
||||
vx_status VX_CALLBACK vxBox_with_nms_limitInitializer
|
||||
(
|
||||
vx_node nodObj,
|
||||
const vx_reference *paramObj,
|
||||
vx_uint32 paraNum
|
||||
)
|
||||
{
|
||||
vx_status status = VX_SUCCESS;
|
||||
/*TODO: Add initial code for VX program*/
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
vx_kernel_description_t vxBox_with_nms_limit_CPU =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
_VX_KERNEL_NAME,
|
||||
_VX_KERNEL_FUNC_KERNEL,
|
||||
vxBox_with_nms_limitKernelParam,
|
||||
_cnt_of_array( vxBox_with_nms_limitKernelParam ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vsi_nn_KernelInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxBox_with_nms_limit_VX =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
_VX_KERNEL_NAME,
|
||||
NULL,
|
||||
vxBox_with_nms_limitKernelParam,
|
||||
_cnt_of_array( vxBox_with_nms_limitKernelParam ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxBox_with_nms_limitInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t * vx_kernel_BOX_WITH_NMS_LIMIT_list[] =
|
||||
{
|
||||
&vxBox_with_nms_limit_CPU,
|
||||
&vxBox_with_nms_limit_VX,
|
||||
NULL
|
||||
};
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@ -1,250 +0,0 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "vsi_nn_platform.h"
|
||||
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_test.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
#include "utils/vsi_nn_math.h"
|
||||
#include "libnnext/vsi_nn_vxkernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
#define _VX_KERNEL_VAR (vx_kernel_EXTRA_ENDING)
|
||||
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_EXTRA_ENDING)
|
||||
#define _VX_KERNEL_FUNC_KERNEL (vxExtra_endingKernel)
|
||||
|
||||
static vsi_status VX_CALLBACK vxExtra_endingKernel
|
||||
(
|
||||
vx_node node,
|
||||
const vx_reference* paramObj,
|
||||
uint32_t paramNum
|
||||
)
|
||||
{
|
||||
#define TENSOR_NUM_INPUT (2)
|
||||
#define TENSOR_NUM_OUTPUT (1)
|
||||
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vx_context context = NULL;
|
||||
vx_tensor input = NULL;
|
||||
vx_tensor output[TENSOR_NUM_OUTPUT] = {0};
|
||||
uint8_t *u8_in_buffer[1] = {0};
|
||||
uint8_t *u8_out_buffer[TENSOR_NUM_OUTPUT] = {0};
|
||||
vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT];
|
||||
uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0};
|
||||
vsi_nn_tensor_attr_t in_attr;
|
||||
|
||||
int32_t i = 0;
|
||||
|
||||
memset(&in_attr, 0x0, sizeof(vsi_nn_tensor_attr_t));
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
|
||||
{
|
||||
memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t));
|
||||
}
|
||||
/* prepare data */
|
||||
context = vxGetContext((vx_reference)node);
|
||||
|
||||
input = (vx_tensor)paramObj[1];
|
||||
status = vsi_nn_vxGetTensorAttr(input, &in_attr);
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
|
||||
for(i = 0; i < 1; i ++)
|
||||
{
|
||||
output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT];
|
||||
status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]);
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]);
|
||||
u8_out_buffer[i]= (uint8_t *)malloc(out_elements[i] * sizeof(uint8_t));
|
||||
memset(u8_out_buffer[i], 0, out_elements[i] * sizeof(uint8_t));
|
||||
|
||||
u8_in_buffer[0] = vsi_nn_vxCopyTensorToData(context, input, &in_attr);
|
||||
memcpy(u8_out_buffer[0], u8_in_buffer[0], out_elements[i] * sizeof(uint8_t));
|
||||
}
|
||||
|
||||
/* save data */
|
||||
status = vsi_nn_vxCopyDataToTensor(context, output[0], &out_attr[0], u8_out_buffer[0]);
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
|
||||
final:
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
|
||||
{
|
||||
if (u8_out_buffer[i]) free(u8_out_buffer[i]);
|
||||
}
|
||||
if (u8_in_buffer[0]) free(u8_in_buffer[0]);
|
||||
return status;
|
||||
} /* _VX_KERNEL_FUNC_KERNEL() */
|
||||
|
||||
static vx_param_description_t vxExtra_endingKernelParam[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
|
||||
vx_status VX_CALLBACK vxExtra_endingInitializer
|
||||
(
|
||||
vx_node nodObj,
|
||||
const vx_reference *paramObj,
|
||||
vx_uint32 paraNum
|
||||
)
|
||||
{
|
||||
vx_status status = VX_SUCCESS;
|
||||
// Alignment with a power of two value.
|
||||
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
|
||||
vx_kernel_execution_parameters_t shaderParam = {
|
||||
3, // workdim
|
||||
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
|
||||
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
|
||||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
vx_tensor output = (vx_tensor)paramObj[2];
|
||||
|
||||
vx_uint32 width = 0;
|
||||
vx_uint32 height = 0;
|
||||
vx_uint32 channel = 0;
|
||||
vx_uint32 dst_size[4] = {1, 1, 1, 1};
|
||||
vsi_nn_tensor_attr_t attr;
|
||||
uint32_t i;
|
||||
uint32_t output_dims;
|
||||
|
||||
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
status = vsi_nn_vxGetTensorAttr(output, &attr);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
|
||||
return status;
|
||||
}
|
||||
output_dims = attr.dim_num;
|
||||
for (i = 0; i < output_dims; i++)
|
||||
{
|
||||
dst_size[i] = attr.size[i];
|
||||
}
|
||||
|
||||
width = dst_size[0];
|
||||
height = dst_size[1];
|
||||
channel = dst_size[2];
|
||||
|
||||
shaderParam.globalWorkOffset[0] = 0;
|
||||
shaderParam.globalWorkOffset[1] = 0;
|
||||
shaderParam.globalWorkOffset[2] = 0;
|
||||
shaderParam.globalWorkScale[0] = 8;
|
||||
shaderParam.globalWorkScale[1] = 1;
|
||||
shaderParam.globalWorkScale[2] = 1;
|
||||
shaderParam.localWorkSize[0] = 16;
|
||||
shaderParam.localWorkSize[1] = 1;
|
||||
shaderParam.localWorkSize[2] = 1;
|
||||
shaderParam.globalWorkSize[0] = gcmALIGN((width + shaderParam.globalWorkScale[0] - 1)
|
||||
/ shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]);
|
||||
shaderParam.globalWorkSize[1] = gcmALIGN((height + shaderParam.globalWorkScale[1] - 1)
|
||||
/ shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]);
|
||||
shaderParam.globalWorkSize[2] = channel;
|
||||
|
||||
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
|
||||
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
|
||||
|
||||
if(status < 0)
|
||||
VSILOGE("error-%s,%d\n",__FILE__,__LINE__);
|
||||
|
||||
return status;
|
||||
}
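The initializer above derives the dispatch size per dimension as ceil(extent / globalWorkScale) rounded up to a multiple of the local work-group size; gcmALIGN does the rounding and therefore assumes a power-of-two alignment. A standalone C sketch of that computation:

#include <stdio.h>

static unsigned int aligned_global_size(unsigned int extent,
                                        unsigned int scale,
                                        unsigned int local_size)
{
    unsigned int n = (extent + scale - 1) / scale;     /* ceil(extent / scale)      */
    return (n + local_size - 1) & ~(local_size - 1);   /* gcmALIGN, power-of-two    */
}

int main(void)
{
    /* width 100, 8 pixels per thread, local group of 16: 13 threads -> 16 */
    printf("%u\n", aligned_global_size(100, 8, 16));
    return 0;
}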
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
vx_kernel_description_t vxExtra_ending_CPU =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
VX_KERNEL_NAME_EXTRA_ENDING_I16,
|
||||
_VX_KERNEL_FUNC_KERNEL,
|
||||
vxExtra_endingKernelParam,
|
||||
_cnt_of_array( vxExtra_endingKernelParam ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vsi_nn_KernelInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxExtra_ending_i16 =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
VX_KERNEL_NAME_EXTRA_ENDING_I16,
|
||||
NULL,
|
||||
vxExtra_endingKernelParam,
|
||||
_cnt_of_array( vxExtra_endingKernelParam ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxExtra_endingInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxExtra_ending_i8 =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
VX_KERNEL_NAME_EXTRA_ENDING_I8,
|
||||
NULL,
|
||||
vxExtra_endingKernelParam,
|
||||
_cnt_of_array( vxExtra_endingKernelParam ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxExtra_endingInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxExtra_ending_u8 =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
VX_KERNEL_NAME_EXTRA_ENDING_U8,
|
||||
NULL,
|
||||
vxExtra_endingKernelParam,
|
||||
_cnt_of_array( vxExtra_endingKernelParam ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxExtra_endingInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t * vx_kernel_EXTRA_ENDING_list[] =
|
||||
{
|
||||
&vxExtra_ending_CPU,
|
||||
&vxExtra_ending_i16,
|
||||
&vxExtra_ending_i8,
|
||||
&vxExtra_ending_u8,
|
||||
NULL
|
||||
};
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@ -1,322 +0,0 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "vsi_nn_platform.h"
|
||||
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_test.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
#include "utils/vsi_nn_math.h"
|
||||
#include "libnnext/vsi_nn_vxkernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
#define _VX_KERNEL_VAR (vx_kernel_HEATMAP_MAX_KEYPOINT)
|
||||
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_HEATMAP_MAX_KEYPOINT)
|
||||
#define _VX_KERNEL_NAME (VX_KERNEL_NAME_HEATMAP_MAX_KEYPOINT)
|
||||
#define _VX_KERNEL_FUNC_KERNEL (vxHeatmap_max_keypointKernel)
|
||||
|
||||
// This function uses Taylor expansion up to the quadratic term to approximate bicubic
|
||||
// upscaling result.
|
||||
// 2nd order Taylor expansion: D(x) = D - b'x + 1/2 * x'Ax
|
||||
// where D = grid[1][1], Taylor expansion center, the original score,
|
||||
// x = delta, the correction on max keypoint position,
|
||||
// D(x) = deltaScore, the accuracy score after correction
|
||||
static void solveForDelta
|
||||
(
|
||||
const float grid[3][3],
|
||||
float* delta,
|
||||
float* deltaScore,
|
||||
float fpAtol,
|
||||
float fpRtol
|
||||
)
|
||||
{
|
||||
// b: negative 1st order derivative at center
|
||||
// A: Hessian matrix at center (2nd order derivative)
|
||||
float A[2][2], b[2];
|
||||
float crossProd1, crossProd2;
|
||||
float detA;
|
||||
b[0] = -(grid[1][2] - grid[1][0]) / 2.0f;
|
||||
b[1] = -(grid[2][1] - grid[0][1]) / 2.0f;
|
||||
A[0][0] = grid[1][0] - 2.0f * grid[1][1] + grid[1][2];
|
||||
A[0][1] = (grid[2][2] - grid[2][0] - grid[0][2] + grid[0][0]) / 4.0f;
|
||||
A[1][0] = A[0][1];
|
||||
A[1][1] = grid[0][1] - 2.0f * grid[1][1] + grid[2][1];
|
||||
|
||||
// solve Ax=b, where x=delta -> delta = inv(A) * b
|
||||
crossProd1 = A[0][0] * A[1][1];
|
||||
crossProd2 = A[0][1] * A[1][0];
|
||||
detA = crossProd1 - crossProd2;
|
||||
// check if A is invertible
|
||||
if (fabs(detA) < (fpAtol + fpRtol * crossProd1)) return;
|
||||
delta[0] = (A[1][1] * b[0] - A[0][1] * b[1]) / detA;
|
||||
delta[1] = (A[0][0] * b[1] - A[1][0] * b[0]) / detA;
|
||||
|
||||
// clip out of range delta, i.e. delta > 3/2
|
||||
if (fabs(delta[0]) > 1.5f || fabs(delta[1]) > 1.5f)
|
||||
{
|
||||
float scale = (float)(1.5f / vsi_nn_max(fabs(delta[0]), fabs(delta[1])));
|
||||
delta[0] *= scale;
|
||||
delta[1] *= scale;
|
||||
}
|
||||
|
||||
*deltaScore = grid[1][1] - b[0] * delta[0] - b[1] * delta[1] +
|
||||
((A[0][0] * delta[0] + A[0][1] * delta[1]) * delta[0] +
|
||||
(A[1][0] * delta[0] + A[1][1] * delta[1]) * delta[1]) /
|
||||
2.0f;
|
||||
}
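solveForDelta solves the 2x2 system A*delta = b in closed form (delta = inv(A)*b) and evaluates the Taylor model D - b'delta + 1/2 * delta'A delta at the solution. A usage sketch relying on the function above (the example function and quadratic are illustrative): sampling D(w, h) = 10 - (w - 0.3)^2 - (h + 0.2)^2 on the 3x3 neighbourhood of its peak bin should recover delta = (0.3, -0.2) and deltaScore = 10, the true sub-pixel maximum.

static void solve_for_delta_example(void)
{
    float grid[3][3];
    float delta[2] = {0.0f, 0.0f};
    float deltaScore;
    int dh, dw;
    for (dh = -1; dh <= 1; dh++)
    {
        for (dw = -1; dw <= 1; dw++)
        {
            grid[dh + 1][dw + 1] = 10.0f - (dw - 0.3f) * (dw - 0.3f)
                                         - (dh + 0.2f) * (dh + 0.2f);
        }
    }
    deltaScore = grid[1][1];
    solveForDelta((const float (*)[3])grid, delta, &deltaScore, 1e-3f, 1e-3f);
    /* delta[0] ~ 0.3f, delta[1] ~ -0.2f, deltaScore ~ 10.0f */
}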
|
||||
|
||||
static vsi_status VX_CALLBACK vxHeatmap_max_keypointKernel
|
||||
(
|
||||
vx_node node,
|
||||
const vx_reference* paramObj,
|
||||
uint32_t paramNum
|
||||
)
|
||||
{
|
||||
#define ARG_NUM (1)
|
||||
#define TENSOR_NUM_INPUT (2)
|
||||
#define TENSOR_NUM_OUTPUT (2)
|
||||
#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT)
|
||||
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vx_context context = NULL;
|
||||
vx_tensor input[TENSOR_NUM_INPUT] = {0};
|
||||
vx_tensor output[TENSOR_NUM_OUTPUT] = {0};
|
||||
float *f32_in_buffer[TENSOR_NUM_INPUT] = {0};
|
||||
float *f32_out_buffer[TENSOR_NUM_OUTPUT] = {0};
|
||||
vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT];
|
||||
vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT];
|
||||
uint32_t in_elements[TENSOR_NUM_INPUT] = {0};
|
||||
uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0};
|
||||
|
||||
int32_t type;
|
||||
|
||||
uint32_t i = 0;
|
||||
for(i = 0; i < TENSOR_NUM_INPUT; i++)
|
||||
{
|
||||
memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t));
|
||||
}
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
|
||||
{
|
||||
memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t));
|
||||
}
|
||||
/* prepare data */
|
||||
context = vxGetContext((vx_reference)node);
|
||||
|
||||
for(i = 0; i < TENSOR_NUM_INPUT; i ++)
|
||||
{
|
||||
input[i] = (vx_tensor)paramObj[i];
|
||||
status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]);
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]);
|
||||
f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float));
|
||||
status = vsi_nn_vxConvertTensorToFloat32Data(
|
||||
context, input[i], &in_attr[i], f32_in_buffer[i],
|
||||
in_elements[i] * sizeof(float));
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
}
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i ++)
|
||||
{
|
||||
output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT];
|
||||
status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]);
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]);
|
||||
f32_out_buffer[i]= (float *)malloc(out_elements[i] * sizeof(float));
|
||||
memset(f32_out_buffer[i], 0, out_elements[i] * sizeof(float));
|
||||
}
|
||||
vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(type),
|
||||
VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
|
||||
/* TODO: Add CPU kernel implementation */
|
||||
{
|
||||
uint32_t j, k;
|
||||
uint32_t numBoxes = in_attr[0].size[3];
|
||||
uint32_t heatmapSize = in_attr[0].size[2];
|
||||
uint32_t numKeypoints = in_attr[0].size[0];
|
||||
uint32_t boxInfoLength = 4;
|
||||
uint32_t output_score_index = 0;
|
||||
uint32_t output_keypoint_index = 0;
|
||||
|
||||
for(i = 0; i < numBoxes; i++)
|
||||
{
|
||||
for (j = 0; j < numKeypoints; j++)
|
||||
{
|
||||
uint32_t maxIndex = 0;
|
||||
float maxScore = -FLT_MAX;
|
||||
uint32_t maxIndexWidth;
|
||||
uint32_t maxIndexHeight;
|
||||
float localGrid[3][3];
|
||||
int32_t dh, dw;
|
||||
float delta[2] = {0.0f, 0.0f}, deltaScore;
|
||||
float wRoiStart = f32_in_buffer[1][i * boxInfoLength];
|
||||
float hRoiStart = f32_in_buffer[1][i * boxInfoLength + 1];
|
||||
float wRoiEnd = f32_in_buffer[1][i * boxInfoLength + 2];
|
||||
float hRoiEnd = f32_in_buffer[1][i * boxInfoLength + 3];
|
||||
float roiWidth = wRoiEnd - wRoiStart;
|
||||
float roiHeight = hRoiEnd - hRoiStart;
|
||||
float wRelativePos;
|
||||
float hRelativePos;
|
||||
for (k = 0; k < heatmapSize * heatmapSize; k++)
|
||||
{
|
||||
uint32_t index = i * heatmapSize * heatmapSize * numKeypoints
|
||||
+ k * numKeypoints + j;
|
||||
float val = f32_in_buffer[0][index];
|
||||
if (maxScore < val)
|
||||
{
|
||||
maxScore = val;
|
||||
maxIndex = k;
|
||||
}
|
||||
}
|
||||
maxIndexWidth = maxIndex % heatmapSize;
|
||||
maxIndexHeight = maxIndex / heatmapSize;
|
||||
|
||||
// get local 3x3 grid
|
||||
for (dh = -1; dh <= 1; dh++)
|
||||
{
|
||||
for (dw = -1; dw <= 1; dw++)
|
||||
{
|
||||
// cast uint32_t to int32_t
|
||||
int32_t h = (int32_t)(maxIndexHeight) + dh;
|
||||
int32_t w = (int32_t)(maxIndexWidth) + dw;
|
||||
uint32_t heatmapIndex;
|
||||
|
||||
// use mirroring for out of bound indexing
|
||||
// need to ensure heatmapSize >= 2
|
||||
h = h < 0 ? 1 : (h >= (int32_t)heatmapSize ? heatmapSize - 2 : h);
|
||||
w = w < 0 ? 1 : (w >= (int32_t)heatmapSize ? heatmapSize - 2 : w);
|
||||
|
||||
heatmapIndex = i * heatmapSize * heatmapSize * numKeypoints +
|
||||
(uint32_t)(h) * heatmapSize * numKeypoints +
|
||||
(uint32_t)(w) * numKeypoints + j;
|
||||
localGrid[dh + 1][dw + 1] = f32_in_buffer[0][heatmapIndex];
|
||||
}
|
||||
}
|
||||
deltaScore = maxScore;
|
||||
solveForDelta((const float (*)[3])localGrid, delta, &deltaScore, 1e-3f, 1e-3f);
|
||||
|
||||
wRelativePos = ((float)(maxIndexWidth) + delta[0] + 0.5f) /
|
||||
(float)(heatmapSize);
|
||||
hRelativePos = ((float)(maxIndexHeight) + delta[1] + 0.5f) /
|
||||
(float)(heatmapSize);
|
||||
f32_out_buffer[0][output_score_index] = deltaScore;
|
||||
f32_out_buffer[1][output_keypoint_index] = wRelativePos * roiWidth + wRoiStart;
|
||||
f32_out_buffer[1][output_keypoint_index + 1] = hRelativePos * roiHeight + hRoiStart;
|
||||
output_score_index++;
|
||||
output_keypoint_index +=2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* save data */
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
|
||||
{
|
||||
status = vsi_nn_vxConvertFloat32DataToTensor(
|
||||
context, output[i], &out_attr[i], f32_out_buffer[i],
|
||||
out_elements[i] * sizeof(float));
|
||||
TEST_CHECK_STATUS(status, final);
|
||||
}
|
||||
|
||||
final:
|
||||
for (i = 0; i < TENSOR_NUM_INPUT; i++)
|
||||
{
|
||||
if (f32_in_buffer[i]) free(f32_in_buffer[i]);
|
||||
}
|
||||
for(i = 0; i < TENSOR_NUM_OUTPUT; i++)
|
||||
{
|
||||
if (f32_out_buffer[i]) free(f32_out_buffer[i]);
|
||||
}
|
||||
return status;
|
||||
} /* _VX_KERNEL_FUNC_KERNEL() */
|
||||
|
||||
static vx_param_description_t vxHeatmap_max_keypointKernelParam[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
|
||||
vx_status VX_CALLBACK vxHeatmap_max_keypointInitializer
|
||||
(
|
||||
vx_node nodObj,
|
||||
const vx_reference *paramObj,
|
||||
vx_uint32 paraNum
|
||||
)
|
||||
{
|
||||
vx_status status = VX_SUCCESS;
|
||||
/*TODO: Add initial code for VX program*/
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
vx_kernel_description_t vxHeatmap_max_keypoint_CPU =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
_VX_KERNEL_NAME,
|
||||
_VX_KERNEL_FUNC_KERNEL,
|
||||
vxHeatmap_max_keypointKernelParam,
|
||||
_cnt_of_array( vxHeatmap_max_keypointKernelParam ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vsi_nn_KernelInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxHeatmap_max_keypoint_VX =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
_VX_KERNEL_NAME,
|
||||
NULL,
|
||||
vxHeatmap_max_keypointKernelParam,
|
||||
_cnt_of_array( vxHeatmap_max_keypointKernelParam ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxHeatmap_max_keypointInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t * vx_kernel_HEATMAP_MAX_KEYPOINT_list[] =
|
||||
{
|
||||
&vxHeatmap_max_keypoint_CPU,
|
||||
&vxHeatmap_max_keypoint_VX,
|
||||
NULL
|
||||
};
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
File diff suppressed because it is too large
@ -1,806 +0,0 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "vsi_nn_platform.h"
|
||||
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
#include "utils/vsi_nn_math.h"
|
||||
#include "libnnext/vsi_nn_vxkernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
#define INPUT_FP16 0
|
||||
#define OUTPUT_FP16 0
|
||||
|
||||
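/*
 * Find a divisor of `data` within [minLimit, maxLimit]. The first divisor
 * found is taken, and later divisors replace it only when their remainder
 * modulo `alignData` is smaller; returns VX_FAILURE if no divisor exists in
 * the range. It is used below (currently under #if 0) to pick a local
 * work-group size. This summary is inferred from the code itself.
 */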
vx_status getFactor(vx_uint32 data, vx_uint32 *factor, vx_uint32 minLimit, vx_uint32 maxLimit, vx_uint32 alignData)
|
||||
{
|
||||
vx_uint32 i = 0;
|
||||
vx_uint32 maxFactor = alignData - 1;
|
||||
vx_status status = VX_FAILURE;
|
||||
|
||||
for (i = minLimit; i <= maxLimit; i ++)
|
||||
{
|
||||
if (data % i == 0)
|
||||
{
|
||||
            if (status == VX_FAILURE)
|
||||
{
|
||||
*factor = i;
|
||||
maxFactor = i;
|
||||
status = VX_SUCCESS;
|
||||
continue;
|
||||
}
|
||||
else if ((i % alignData) < (maxFactor % alignData))
|
||||
{
|
||||
*factor = i;
|
||||
maxFactor = i;
|
||||
status = VX_SUCCESS;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
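/*
 * Frame a 1D/2D/3D int16 signal along `axis`: each output frame copies
 * `frame_len` samples, starting every `step` samples. With pad_end set, the
 * frame count is ceil(len / step) and out-of-range samples are zero-filled
 * (the `pad_val` argument is currently ignored); otherwise it is
 * (len - frame_len) / step + 1. The resulting output shape is reported
 * through dstW/dstH/dstC/dstB. Summary inferred from the code below.
 */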
void mySignalFrameFunc
|
||||
(
|
||||
void* imgIn,
|
||||
void* imgOut,
|
||||
uint32_t input_dim,
|
||||
uint32_t width,
|
||||
uint32_t height,
|
||||
uint32_t channel,
|
||||
uint32_t batch,
|
||||
uint32_t frame_len, // window size
|
||||
uint32_t step,
|
||||
uint32_t pad_end,
|
||||
uint32_t pad_val,
|
||||
uint32_t axis,
|
||||
uint32_t *dstW,
|
||||
uint32_t *dstH,
|
||||
uint32_t *dstC,
|
||||
uint32_t *dstB
|
||||
)
|
||||
{
|
||||
uint8_t* tmpIn = (uint8_t*)imgIn;
|
||||
uint8_t* tmpOut = (uint8_t*)imgOut;
|
||||
|
||||
uint32_t i,j,k;
|
||||
uint32_t size = 0;
|
||||
uint32_t iter = 0;
|
||||
|
||||
if(input_dim == 1)
|
||||
{
|
||||
if(axis != 0)
|
||||
{
|
||||
VSILOGE("error.\n");
|
||||
return;
|
||||
}
|
||||
*dstW = frame_len;
|
||||
//*dstH = (len - frame_len) / step + 1;
|
||||
*dstH = pad_end ? ((width + step - 1 ) / step) : ((width - frame_len) / step + 1);
|
||||
*dstC = 1;
|
||||
*dstB = 1;
|
||||
|
||||
size = (*dstW) * sizeof(int16_t);
|
||||
iter = pad_end ? width : (width - frame_len + 1);
|
||||
if(pad_end)
|
||||
{
|
||||
int16_t* output = (int16_t*)tmpOut;
|
||||
int16_t* input = (int16_t*)tmpIn;
|
||||
uint32_t m = 0;
|
||||
for(i = 0, j = 0; i < iter; i += step)
|
||||
{
|
||||
for(m = i; m < frame_len + i; m++)
|
||||
{
|
||||
if(m >= width)
|
||||
{
|
||||
output[j] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
output[j] = input[m];
|
||||
}
|
||||
j++;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(i = 0, j = 0; i < iter; i += step, j++)
|
||||
{
|
||||
memcpy(tmpOut + j * size, tmpIn + i * sizeof(int16_t), size);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(input_dim == 2)
|
||||
{
|
||||
if(axis == 0)
|
||||
{
|
||||
uint8_t* src = tmpIn;
|
||||
uint8_t* dst = tmpOut;
|
||||
|
||||
*dstH = frame_len;
|
||||
*dstW = width;
|
||||
*dstC = pad_end ? ((height + step - 1) / step) : ((height - frame_len) / step + 1);
|
||||
|
||||
*dstB = 1;
|
||||
|
||||
size = width * frame_len * sizeof(int16_t);
|
||||
iter = pad_end ? (height) : (height - frame_len + 1);
|
||||
if(pad_end)
|
||||
{
|
||||
uint32_t m = 0;
|
||||
size = width * sizeof(int16_t);
|
||||
for(i = 0, j = 0; i < iter; i += step)
|
||||
{
|
||||
for(m = i; m < frame_len + i; m++)
|
||||
{
|
||||
if(m >= height)
|
||||
{
|
||||
memset(dst + j * size, 0, size);
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy(dst + j * size, src + m * width * sizeof(int16_t), size);
|
||||
}
|
||||
j++;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(i = 0, j = 0; i < iter; i += step, j++)
|
||||
{
|
||||
memcpy(dst + j * size, src + i * width * sizeof(int16_t), size);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(axis == 1)
|
||||
{
|
||||
*dstW = frame_len;
|
||||
|
||||
//*dstH = (len - frame_len) / step + 1;
|
||||
*dstH = pad_end ? ((width + step - 1 ) / step) : ((width - frame_len) / step + 1);
|
||||
|
||||
*dstC = height;
|
||||
*dstB = 1;
|
||||
|
||||
size = (*dstW) * sizeof(int16_t);
|
||||
iter = pad_end ? width : (width - frame_len + 1);
|
||||
if(pad_end)
|
||||
{
|
||||
for(k = 0; k < height; k++)
|
||||
{
|
||||
uint8_t* src = tmpIn + k * width * sizeof(int16_t);
|
||||
uint8_t* dst = tmpOut + k * (*dstW) * (*dstH) * sizeof(int16_t);
|
||||
|
||||
int16_t* output = (int16_t*)dst;
|
||||
int16_t* input = (int16_t*)src;
|
||||
uint32_t m = 0;
|
||||
for(i = 0, j = 0; i < iter; i += step)
|
||||
{
|
||||
for(m = i; m < frame_len + i; m++)
|
||||
{
|
||||
if(m >= width)
|
||||
{
|
||||
output[j] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
output[j] = input[m];
|
||||
}
|
||||
j++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(k = 0; k < height; k++)
|
||||
{
|
||||
uint8_t* src = tmpIn + k * width * sizeof(int16_t);
|
||||
uint8_t* dst = tmpOut + k * (*dstW) * (*dstH) * sizeof(int16_t);
|
||||
|
||||
for(i = 0, j = 0; i < iter; i += step, j++)
|
||||
{
|
||||
memcpy(dst + j * size, src + i * sizeof(int16_t), size);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(input_dim == 3)
|
||||
{
|
||||
if(axis == 0)
|
||||
{
|
||||
uint8_t* src = tmpIn;
|
||||
uint8_t* dst = tmpOut;
|
||||
size = width * height * frame_len * sizeof(int16_t);
|
||||
|
||||
*dstW = width;
|
||||
*dstH = height;
|
||||
*dstC = frame_len;
|
||||
*dstB = pad_end ? ((channel + step - 1) / step) :((channel - frame_len) / step + 1);
|
||||
iter = pad_end ? channel : (channel - frame_len + 1);
|
||||
if(pad_end)
|
||||
{
|
||||
uint32_t m = 0;
|
||||
size = width * height * sizeof(int16_t);
|
||||
for(i = 0, j = 0; i < iter; i += step)
|
||||
{
|
||||
for(m = i; m < frame_len + i; m++)
|
||||
{
|
||||
if(m >= channel)
|
||||
{
|
||||
memset(dst + j * size, 0 , size);
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy(dst + j * size, src + m * width * height * sizeof(int16_t), size);
|
||||
}
|
||||
j++;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(i = 0, j = 0; i < iter; i += step, j++)
|
||||
{
|
||||
memcpy(dst + j * size, src + i * width * height * sizeof(int16_t), size);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(axis == 1)
|
||||
{
|
||||
*dstH = frame_len;
|
||||
*dstW = width;
|
||||
*dstC = pad_end ? ((height + step - 1) / step) : ((height - frame_len) / step + 1);
|
||||
*dstB = channel;
|
||||
|
||||
size = width * frame_len * sizeof(int16_t);
|
||||
iter = pad_end ? (height) : (height - frame_len + 1);
|
||||
if(pad_end)
|
||||
{
|
||||
uint32_t m = 0;
|
||||
size = width * sizeof(int16_t);
|
||||
for(k = 0; k < channel; k++)
|
||||
{
|
||||
uint8_t* src = tmpIn + k * width * height* sizeof(int16_t);
|
||||
uint8_t* dst = tmpOut + k * (*dstC) * (*dstW) * (*dstH) * sizeof(int16_t);
|
||||
|
||||
for(i = 0, j = 0; i < iter; i += step)
|
||||
{
|
||||
for(m = i; m < frame_len + i; m++)
|
||||
{
|
||||
if(m >= height)
|
||||
{
|
||||
memset(dst + j * size, 0, size);
|
||||
}
|
||||
else
|
||||
{
|
||||
memcpy(dst + j * size, src + m * width * sizeof(int16_t), size);
|
||||
}
|
||||
j++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(k = 0; k < channel; k++)
|
||||
{
|
||||
uint8_t* src = tmpIn + k * width * height* sizeof(int16_t);
|
||||
uint8_t* dst = tmpOut + k * (*dstC) * (*dstW) * (*dstH) * sizeof(int16_t);
|
||||
|
||||
for(i = 0, j = 0; i < iter; i += step, j++)
|
||||
{
|
||||
memcpy(dst + j * size, src + i * width * sizeof(int16_t), size);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(axis == 2)
|
||||
{
|
||||
//*dstH = (len - frame_len) / step + 1;
|
||||
*dstH = pad_end ? ((width + step - 1 ) / step) : ((width - frame_len) / step + 1);
|
||||
*dstW = frame_len;
|
||||
*dstC = height;
|
||||
*dstB = channel;
|
||||
|
||||
size = (*dstW) * sizeof(int16_t);
|
||||
iter = pad_end ? width : (width - frame_len + 1);
|
||||
|
||||
if(pad_end)
|
||||
{
|
||||
for(k = 0; k < channel * height; k++)
|
||||
{
|
||||
uint8_t* src = tmpIn + k * width * sizeof(int16_t);
|
||||
uint8_t* dst = tmpOut + k * (*dstW) * (*dstH) * sizeof(int16_t);
|
||||
|
||||
int16_t* output = (int16_t*)dst;
|
||||
int16_t* input = (int16_t*)src;
|
||||
uint32_t m = 0;
|
||||
for(i = 0, j = 0; i < iter; i += step)
|
||||
{
|
||||
for(m = i; m < frame_len + i; m++)
|
||||
{
|
||||
if(m >= width)
|
||||
{
|
||||
output[j] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
output[j] = input[m];
|
||||
}
|
||||
j++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(k = 0; k < channel * height; k++)
|
||||
{
|
||||
uint8_t* src = tmpIn + k * width * sizeof(int16_t);
|
||||
uint8_t* dst = tmpOut + k * (*dstW) * (*dstH) * sizeof(int16_t);
|
||||
for(i = 0, j = 0; i < iter; i += step, j++)
|
||||
{
|
||||
memcpy(dst + j * size, src + i * sizeof(int16_t), size);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
vsi_status VX_CALLBACK vxSignalFrameKernel
|
||||
(
|
||||
vx_node node,
|
||||
const vx_reference* paramObj,
|
||||
uint32_t paramNum
|
||||
)
|
||||
{
|
||||
vsi_status status = VX_ERROR_INVALID_PARAMETERS;
|
||||
|
||||
if(paramNum == 7)
|
||||
{
|
||||
vx_context context = NULL;
|
||||
// tensor
|
||||
vx_tensor imgObj[7] = { NULL };
|
||||
#if INPUT_FP16
|
||||
int16_t *input = NULL;
|
||||
#else
|
||||
uint8_t *input = NULL;
|
||||
#endif
|
||||
#if OUTPUT_FP16
|
||||
int16_t *output = NULL;
|
||||
#else
|
||||
uint8_t *output = NULL;
|
||||
#endif
|
||||
|
||||
uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1}, output_size[DIM_SIZE] = {1, 1, 1, 1}, dst_size[DIM_SIZE] = {1, 1, 1, 1};
|
||||
vsi_nn_tensor_attr_t in_attr, out_attr;
|
||||
|
||||
vsi_nn_type_e outputFormat = VSI_NN_TYPE_FLOAT16;
|
||||
uint32_t input_dims = 0, output_dims = 0, tmpDim = 0;
|
||||
|
||||
vx_scalar scalar[5] = { NULL };
|
||||
uint32_t frame_length = 0, step = 0, pad_end = 0, pad = 0, axis = 0, axis0 = 0;
|
||||
uint32_t i = 0;
|
||||
|
||||
        memset(&in_attr, 0x0, sizeof(vsi_nn_tensor_attr_t));
        memset(&out_attr, 0x0, sizeof(vsi_nn_tensor_attr_t));

        /* Fetch the parameter objects before querying tensor attributes;
           otherwise vsi_nn_vxGetTensorAttr() is called on NULL tensors. */
        imgObj[0] = (vx_tensor)paramObj[0];
        imgObj[1] = (vx_tensor)paramObj[1]; //output
        scalar[0] = (vx_scalar)paramObj[2];
        scalar[1] = (vx_scalar)paramObj[3];
        scalar[2] = (vx_scalar)paramObj[4];
        scalar[3] = (vx_scalar)paramObj[5];
        scalar[4] = (vx_scalar)paramObj[6];

        status = vsi_nn_vxGetTensorAttr(imgObj[0], &in_attr);
        status |= vsi_nn_vxGetTensorAttr(imgObj[1], &out_attr);
        if (status != VX_SUCCESS)
        {
            VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
            goto OnError;
        }
|
||||
context = vxGetContext((vx_reference)node);
|
||||
if (context == NULL)
|
||||
{
|
||||
VSILOGE("vxGetContext failure! at line %d\n", __LINE__);
|
||||
goto OnError;
|
||||
}
|
||||
//input
|
||||
input_dims = in_attr.dim_num;
|
||||
for (i = 0; i < input_dims; i++)
|
||||
{
|
||||
input_size[i] = in_attr.size[i];
|
||||
}
|
||||
|
||||
//output
|
||||
output_dims = out_attr.dim_num;
|
||||
outputFormat = out_attr.dtype.vx_type;
|
||||
for (i = 0; i < output_dims; i++)
|
||||
{
|
||||
output_size[i] = out_attr.size[i];
|
||||
}
|
||||
|
||||
input_size[2] = (input_dims <= 2)?1:input_size[2];
|
||||
input_size[3] = (input_dims <= 3)?1:input_size[3];
|
||||
|
||||
|
||||
#if INPUT_FP16
|
||||
input = (int16_t*)malloc(input_size[0]*input_size[1]*input_size[2]*sizeof(int16_t));
|
||||
#else
|
||||
//input = (uint8_t*)malloc(input_size[0]*input_size[1]*input_size[2]*vsi_nn_GetTypeBytes(inputFormat));
|
||||
#endif
|
||||
#if OUTPUT_FP16
|
||||
output = (int16_t*)malloc(output_size[0]*output_size[1]*output_size[2]*sizeof(int16_t));
|
||||
#else
|
||||
output = (uint8_t*)malloc(output_size[0]*output_size[1]*output_size[2]*vsi_nn_GetTypeBytes(outputFormat));
|
||||
#endif
|
||||
|
||||
input = vsi_nn_vxCopyTensorToData(context, imgObj[0], &in_attr);
|
||||
|
||||
// scalar
|
||||
status = vxCopyScalar(scalar[0], &frame_length, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
status |= vxCopyScalar(scalar[1], &step, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
status |= vxCopyScalar(scalar[2], &pad_end, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
status |= vxCopyScalar(scalar[3], &pad, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
status |= vxCopyScalar(scalar[4], &axis, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vxCopyScalar failure! at line %d\n", __LINE__);
|
||||
goto OnError;
|
||||
}
|
||||
|
||||
// Call C Prototype
|
||||
if(output_dims == 2)
|
||||
tmpDim = 1;
|
||||
else
|
||||
tmpDim = input_dims;
|
||||
{
|
||||
axis0 = input_dims - axis - 1;
|
||||
}
|
||||
mySignalFrameFunc(input, output, tmpDim, input_size[0],
|
||||
input_size[1], input_size[2], input_size[3],
|
||||
frame_length, step, pad_end, pad, axis0,
|
||||
&dst_size[0], &dst_size[1], &dst_size[2], &dst_size[3]);
|
||||
|
||||
//output tensor
|
||||
status = vsi_nn_vxCopyDataToTensor(context, imgObj[1], &out_attr, output);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vsi_nn_vxCopyDataToTensor failure! at line %d\n", __LINE__);
|
||||
goto OnError;
|
||||
}
|
||||
|
||||
OnError:
|
||||
if(input) free(input);
|
||||
if(output) free(output);
|
||||
}
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
vsi_status VX_CALLBACK vxSignalFrameInitializer
|
||||
(
|
||||
vx_node nodObj,
|
||||
const vx_reference *paramObj,
|
||||
uint32_t paraNum
|
||||
)
|
||||
{
|
||||
vsi_status status = VX_SUCCESS;
|
||||
// Alignment with a power of two value.
|
||||
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
|
||||
vx_kernel_execution_parameters_t shaderParam = {
|
||||
3, // workdim
|
||||
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
|
||||
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
|
||||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
vx_scalar scalar[5];
|
||||
vx_tensor input = (vx_tensor)paramObj[0];
|
||||
vx_tensor output = (vx_tensor)paramObj[1];
|
||||
|
||||
uint32_t input_size[DIM_SIZE] = {1, 1, 1, 1};
|
||||
uint32_t input_dims = 0;
|
||||
uint32_t output_dims = 0;
|
||||
//vx_uint32 factor = 1;
|
||||
//vx_uint32 maxWorkGroupSize = 8;
|
||||
uint32_t frame_length, step, pad_end, pad, axis, axis0;
|
||||
uint32_t output_channel = 0;
|
||||
|
||||
vx_uint32 i = 0;
|
||||
vsi_nn_tensor_attr_t attr[2];
|
||||
memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
status = vsi_nn_vxGetTensorAttr(input, &attr[0]);
|
||||
status |= vsi_nn_vxGetTensorAttr(output, &attr[1]);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
|
||||
return status;
|
||||
}
|
||||
|
||||
input_dims = attr[0].dim_num;
|
||||
for (i = 0; i < input_dims; i++)
|
||||
{
|
||||
input_size[i] = attr[0].size[i];
|
||||
}
|
||||
output_dims = attr[1].dim_num;
|
||||
|
||||
scalar[0] = (vx_scalar)paramObj[2];
|
||||
scalar[1] = (vx_scalar)paramObj[3];
|
||||
scalar[2] = (vx_scalar)paramObj[4];
|
||||
scalar[3] = (vx_scalar)paramObj[5];
|
||||
scalar[4] = (vx_scalar)paramObj[6];
|
||||
|
||||
status = vxCopyScalar(scalar[0], &frame_length, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
status |= vxCopyScalar(scalar[1], &step, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
status |= vxCopyScalar(scalar[2], &pad_end, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
status |= vxCopyScalar(scalar[3], &pad, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
status |= vxCopyScalar(scalar[4], &axis0, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vxCopyScalar failure! at line %d\n", __LINE__);
|
||||
return status;
|
||||
}
|
||||
|
||||
{
|
||||
if(input_dims == 2 && output_dims == 2)
|
||||
{
|
||||
axis = input_dims - axis0 - 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
axis = input_dims - axis0 - 1;
|
||||
}
|
||||
}
|
||||
|
||||
input_size[2] = (input_dims <= 2)?1:input_size[2];
|
||||
//input_size[2] = (input_dims == 4)?(input_size[2] * input_size[3]):input_size[2];
|
||||
|
||||
shaderParam.globalWorkOffset[0] = 0;
|
||||
shaderParam.globalWorkOffset[1] = 0;
|
||||
shaderParam.globalWorkOffset[2] = 0;
|
||||
if((output_dims == 2)
|
||||
|| (input_dims == 2 && output_dims == 3 && axis == 1)
|
||||
|| (input_dims == 3 && axis == 2))
|
||||
{
|
||||
shaderParam.globalWorkScale[0] = 1;
|
||||
shaderParam.globalWorkScale[1] = 1;
|
||||
shaderParam.globalWorkScale[2] = 1;
|
||||
shaderParam.localWorkSize[0] = 1;
|
||||
shaderParam.localWorkSize[1] = 1;
|
||||
#if 0
|
||||
if (input_size[1] <= maxWorkGroupSize)
|
||||
shaderParam.localWorkSize[1] = input_size[1];
|
||||
else if (getFactor(input_size[1], &factor, 2, maxWorkGroupSize, 8) == VX_SUCCESS)
|
||||
shaderParam.localWorkSize[1] = factor;
|
||||
else
|
||||
shaderParam.localWorkSize[1] = 1;
|
||||
#endif
|
||||
|
||||
shaderParam.localWorkSize[2] = 1;
|
||||
shaderParam.globalWorkSize[0] = gcmALIGN((1 + shaderParam.globalWorkScale[0] - 1)
|
||||
/ shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]);
|
||||
|
||||
shaderParam.globalWorkSize[1] = gcmALIGN((input_size[1] + shaderParam.globalWorkScale[1] - 1)
|
||||
/ shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]);
|
||||
//shaderParam.globalWorkSize[1] = input_size[1];
|
||||
shaderParam.globalWorkSize[2] = gcmALIGN((input_size[2] + shaderParam.globalWorkScale[2] - 1)
|
||||
/ shaderParam.globalWorkScale[2], shaderParam.localWorkSize[2]);
|
||||
}
|
||||
else if((input_dims == 2 && output_dims == 3 && axis == 0)
|
||||
|| (input_dims == 3 && axis == 1))
|
||||
{
|
||||
int height = (pad_end == 0) ? (input_size[1] - frame_length + 1) : (input_size[1]);
|
||||
shaderParam.globalWorkScale[0] = 8;
|
||||
shaderParam.globalWorkScale[1] = step;
|
||||
shaderParam.globalWorkScale[2] = 1;
|
||||
shaderParam.localWorkSize[0] = 1;
|
||||
shaderParam.localWorkSize[1] = 1;
|
||||
shaderParam.localWorkSize[2] = 1;
|
||||
shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1)
|
||||
/ shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]);
|
||||
shaderParam.globalWorkSize[1] = gcmALIGN((height + shaderParam.globalWorkScale[1] - 1)
|
||||
/ shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]);
|
||||
shaderParam.globalWorkSize[2] = gcmALIGN((input_size[2] + shaderParam.globalWorkScale[2] - 1)
|
||||
/ shaderParam.globalWorkScale[2], shaderParam.localWorkSize[2]);
|
||||
|
||||
output_channel = (pad_end == 0) ? ((input_size[1] - frame_length) / step + 1) : ((input_size[1] + step - 1) / step);
|
||||
}
|
||||
else if(input_dims == 3 && axis == 0)
|
||||
{
|
||||
int channel = (pad_end == 0) ? (input_size[2] - frame_length + 1) : (input_size[2]);
|
||||
shaderParam.globalWorkScale[0] = 8;
|
||||
shaderParam.globalWorkScale[1] = 1;
|
||||
shaderParam.globalWorkScale[2] = step;
|
||||
shaderParam.localWorkSize[0] = 1;
|
||||
shaderParam.localWorkSize[1] = 1;
|
||||
shaderParam.localWorkSize[2] = 1;
|
||||
shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1)
|
||||
/ shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]);
|
||||
shaderParam.globalWorkSize[1] = gcmALIGN((input_size[1] + shaderParam.globalWorkScale[1] - 1)
|
||||
/ shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]);
|
||||
shaderParam.globalWorkSize[2] = gcmALIGN((channel + shaderParam.globalWorkScale[2] - 1)
|
||||
/ shaderParam.globalWorkScale[2], shaderParam.localWorkSize[2]);
|
||||
}
|
||||
|
||||
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
|
||||
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
|
||||
if(status < 0)
|
||||
{
|
||||
VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__);
|
||||
}
|
||||
{
|
||||
status |= vxSetNodeUniform(nodObj, "input_width", 1, &input_size[0]);
|
||||
status |= vxSetNodeUniform(nodObj, "input_height", 1, &input_size[1]);
|
||||
status |= vxSetNodeUniform(nodObj, "input_channel", 1, &input_size[2]);
|
||||
status |= vxSetNodeUniform(nodObj, "output_channel", 1, &output_channel);
|
||||
if(status < 0)
|
||||
{
|
||||
VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__);
|
||||
}
|
||||
}
|
||||
return status;
|
||||
}
|
||||
static vx_param_description_t vxSignalFrameKernelParam[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
|
||||
};
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
vx_kernel_description_t vxSignalFrameKernelInfo =
|
||||
{
|
||||
VX_KERNEL_ENUM_SIGNALFRAME,
|
||||
VX_KERNEL_NAME_SIGNALFRAME_WIDTH,
|
||||
NULL,
|
||||
vxSignalFrameKernelParam,
|
||||
(sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxSignalFrameInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxSignalFrameKernelInfo_height =
|
||||
{
|
||||
VX_KERNEL_ENUM_SIGNALFRAME,
|
||||
VX_KERNEL_NAME_SIGNALFRAME_HEIGHT,
|
||||
NULL,
|
||||
vxSignalFrameKernelParam,
|
||||
(sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxSignalFrameInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxSignalFrameKernelInfo_channel =
|
||||
{
|
||||
VX_KERNEL_ENUM_SIGNALFRAME,
|
||||
VX_KERNEL_NAME_SIGNALFRAME_CHANNEL,
|
||||
NULL,
|
||||
vxSignalFrameKernelParam,
|
||||
(sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxSignalFrameInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxSignalFrameKernelInfo_8bit =
|
||||
{
|
||||
VX_KERNEL_ENUM_SIGNALFRAME,
|
||||
VX_KERNEL_NAME_SIGNALFRAME_WIDTH_8BITS,
|
||||
NULL,
|
||||
vxSignalFrameKernelParam,
|
||||
(sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxSignalFrameInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxSignalFrameKernelInfo_height_8bit =
|
||||
{
|
||||
VX_KERNEL_ENUM_SIGNALFRAME,
|
||||
VX_KERNEL_NAME_SIGNALFRAME_HEIGHT_8BITS,
|
||||
NULL,
|
||||
vxSignalFrameKernelParam,
|
||||
(sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxSignalFrameInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxSignalFrameKernelInfo_channel_8bit =
|
||||
{
|
||||
VX_KERNEL_ENUM_SIGNALFRAME,
|
||||
VX_KERNEL_NAME_SIGNALFRAME_CHANNEL_8BITS,
|
||||
NULL,
|
||||
vxSignalFrameKernelParam,
|
||||
(sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxSignalFrameInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxSignalFrameKernelInfo_CPU =
|
||||
{
|
||||
VX_KERNEL_ENUM_SIGNALFRAME,
|
||||
VX_KERNEL_NAME_SIGNALFRAME_WIDTH,
|
||||
vxSignalFrameKernel,
|
||||
vxSignalFrameKernelParam,
|
||||
(sizeof(vxSignalFrameKernelParam) / sizeof(vxSignalFrameKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vsi_nn_KernelInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t * vx_kernel_SIGNALFRAME_list[] =
|
||||
{
|
||||
&vxSignalFrameKernelInfo_CPU,
|
||||
&vxSignalFrameKernelInfo,
|
||||
&vxSignalFrameKernelInfo_height,
|
||||
&vxSignalFrameKernelInfo_channel,
|
||||
&vxSignalFrameKernelInfo_8bit,
|
||||
&vxSignalFrameKernelInfo_height_8bit,
|
||||
&vxSignalFrameKernelInfo_channel_8bit,
|
||||
NULL
|
||||
};
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@ -1,481 +0,0 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "vsi_nn_platform.h"
|
||||
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
#include "utils/vsi_nn_math.h"
|
||||
#include "libnnext/vsi_nn_vxkernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
#define _VX_KERNEL_VAR (vx_kernel_SPATIAL_TRANSFORMER)
|
||||
#define _VX_KERNEL_ID (VX_KERNEL_ENUM_SPATIAL_TRANSFORMER)
|
||||
#define _VX_KERNEL_NAME (VX_KERNEL_NAME_SPATIAL_TRANSFORMER)
|
||||
#define _VX_KERNEL_FUNC_KERNEL (vxSpatial_transformerKernel)
|
||||
|
||||
|
||||
static vsi_status VX_CALLBACK vxSpatial_transformerKernel
|
||||
(
|
||||
vx_node node,
|
||||
const vx_reference* paramObj,
|
||||
uint32_t paramNum
|
||||
)
|
||||
{
|
||||
    /* TODO: CPU implementation not provided; this kernel returns VX_SUCCESS
       without computing anything. */
|
||||
vsi_status status = VX_SUCCESS;
|
||||
|
||||
return status;
|
||||
} /* _VX_KERNEL_FUNC_KERNEL() */
|
||||
|
||||
static vx_param_description_t s_params[] =
|
||||
{
|
||||
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
|
||||
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
|
||||
{ VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
|
||||
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
|
||||
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
|
||||
};
|
||||
|
||||
vx_status VX_CALLBACK vxTransform_GemmInputValidator(vx_node node, vx_uint32 index)
|
||||
{
|
||||
return VX_SUCCESS;
|
||||
}
|
||||
|
||||
vx_status VX_CALLBACK vxTransform_GemmOutputValidator(vx_node node, vx_uint32 index, vx_meta_format metaObj)
|
||||
{
|
||||
return VX_SUCCESS;
|
||||
}
|
||||
|
||||
vx_status VX_CALLBACK vxValidator(vx_node node, const vx_reference parameters[],
|
||||
vx_uint32 num, vx_meta_format metas[])
|
||||
{
|
||||
vx_status status = VX_SUCCESS;
|
||||
vx_uint32 index = 0;
|
||||
for(index = 0; index < num; index++)
|
||||
{
|
||||
if(index < 2)
|
||||
{
|
||||
status |= vxTransform_GemmInputValidator(node,index);
|
||||
}
|
||||
else
|
||||
{
|
||||
status |= vxTransform_GemmOutputValidator(node,index,metas[index]);
|
||||
}
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
static vx_param_description_t vxTransform_GemmKernelParam[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
|
||||
};
|
||||
|
||||
vx_status VX_CALLBACK vxTransform_GemmInitializer(vx_node nodObj, const vx_reference *paramObj, vx_uint32 paraNum)
|
||||
{
|
||||
// Alignment with a power of two value.
|
||||
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
|
||||
#define gcmMIN(x, y) (((x) <= (y)) ? (x) : (y))
|
||||
#define gcmMAX(x, y) (((x) >= (y)) ? (x) : (y))
|
||||
#define MAX_MULTIPLIER_NUM (65535)
|
||||
#define MAX_POST_SHIFT_BITS (31)
|
||||
vx_kernel_execution_parameters_t shaderParam = {
|
||||
2, // workdim
|
||||
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
|
||||
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
|
||||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
vx_status status = VX_SUCCESS;
|
||||
vx_tensor input0 = (vx_tensor)paramObj[0];
|
||||
vx_tensor input1 = (vx_tensor)paramObj[1];
|
||||
vx_tensor output = (vx_tensor)paramObj[2];
|
||||
vx_enum src0Format = VSI_NN_TYPE_FLOAT16;
|
||||
vx_enum src1Format = VSI_NN_TYPE_FLOAT16;
|
||||
vx_enum dstFormat = VSI_NN_TYPE_FLOAT16;
|
||||
vx_uint32 coord_size[4] = {1, 1, 1, 1};
|
||||
vx_uint32 i = 0;
|
||||
vsi_nn_tensor_attr_t attr[3];
|
||||
|
||||
memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
memset(&attr[2], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
|
||||
status = vsi_nn_vxGetTensorAttr(input0, &attr[0]);
|
||||
status |= vsi_nn_vxGetTensorAttr(input1, &attr[1]);
|
||||
status |= vsi_nn_vxGetTensorAttr(output, &attr[2]);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
|
||||
return status;
|
||||
}
|
||||
|
||||
src0Format = attr[0].dtype.vx_type;
|
||||
src1Format = attr[1].dtype.vx_type;
|
||||
for (i = 0; i < attr[1].dim_num; i++)
|
||||
{
|
||||
coord_size[i] = attr[1].size[i];
|
||||
}
|
||||
dstFormat = attr[2].dtype.vx_type;
|
||||
|
||||
if (src0Format == VSI_NN_TYPE_FLOAT16 && src1Format == VSI_NN_TYPE_FLOAT16 && dstFormat == VSI_NN_TYPE_FLOAT16)
|
||||
{
|
||||
shaderParam.globalWorkScale[0] = 12;
|
||||
shaderParam.globalWorkScale[1] = 1;
|
||||
}
|
||||
|
||||
shaderParam.globalWorkSize[0] =
|
||||
gcmALIGN((coord_size[0] + shaderParam.globalWorkScale[0] - 1) / shaderParam.globalWorkScale[0], 4);
|
||||
shaderParam.globalWorkSize[1] =
|
||||
(coord_size[1] + shaderParam.globalWorkScale[1] - 1) / shaderParam.globalWorkScale[1];
|
||||
{
|
||||
vx_uint32 uniGemm3x3_4x4[16] = {
|
||||
0x15151515, // TCfg
|
||||
0x00000000, // ASelt
|
||||
0x02100210, 0x05430543, // ABin
|
||||
0x15151515, // BSelt
|
||||
0x05430210, 0x05430210, // BBin
|
||||
0x00000400, // AccumType, ConstantType, and PostShift
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
|
||||
};
|
||||
|
||||
vxSetNodeUniform(nodObj, "uniGemm3x3_4x4", 1, uniGemm3x3_4x4);
|
||||
}
|
||||
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
|
||||
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
|
||||
|
||||
return VX_SUCCESS;
|
||||
}
|
||||
|
||||
static vx_param_description_t vxTransform_setupThresKernelParam[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
|
||||
};
|
||||
|
||||
vx_status VX_CALLBACK vxTransform_setupThresInitializer
|
||||
(
|
||||
vx_node nodObj,
|
||||
const vx_reference *paramObj,
|
||||
vx_uint32 paraNum
|
||||
)
|
||||
{
|
||||
// Alignment with a power of two value.
|
||||
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
|
||||
#define gcmMIN(x, y) (((x) <= (y)) ? (x) : (y))
|
||||
#define gcmMAX(x, y) (((x) >= (y)) ? (x) : (y))
|
||||
#define MAX_MULTIPLIER_NUM (65535)
|
||||
#define MAX_POST_SHIFT_BITS (31)
|
||||
vx_kernel_execution_parameters_t shaderParam = {
|
||||
2, // workdim
|
||||
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
|
||||
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
|
||||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
vx_status status = VX_SUCCESS;
|
||||
vx_scalar thresFlag_s = (vx_scalar)paramObj[2];
|
||||
vx_enum src0Format = VSI_NN_TYPE_FLOAT16;
|
||||
vx_enum src1Format = VSI_NN_TYPE_FLOAT16;
|
||||
|
||||
vx_int32 thresFlag = 0;
|
||||
vx_uint32 extract_packed[4] = {0};
|
||||
|
||||
    status = vxCopyScalar(thresFlag_s, &thresFlag, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
|
||||
if(status < 0)
|
||||
VSILOGE("error-%s,%d\n",__FILE__,__LINE__);
|
||||
|
||||
shaderParam.globalWorkScale[0] = 1;
|
||||
shaderParam.globalWorkScale[1] = 1;
|
||||
shaderParam.localWorkSize[0] = 1;
|
||||
shaderParam.localWorkSize[1] = 1;
|
||||
shaderParam.globalWorkSize[0] = 1;
|
||||
shaderParam.globalWorkSize[1] = 1;
|
||||
|
||||
if (src0Format == src1Format && src0Format == VSI_NN_TYPE_FLOAT16)
|
||||
{
|
||||
vx_uint32 i = 0;
|
||||
vx_uint32 j = 0;
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
if (thresFlag & (1 << i))
|
||||
{
|
||||
extract_packed[0] |= ((i << 4) << (i * 8));
|
||||
}
|
||||
else
|
||||
{
|
||||
extract_packed[0] |= (((j << 4) + 128) << (i * 8));
|
||||
j ++;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 4; i < 6; i++)
|
||||
{
|
||||
if (thresFlag & (1 << i))
|
||||
{
|
||||
extract_packed[1] |= ((i << 4) << (i * 8 - 32));
|
||||
}
|
||||
else
|
||||
{
|
||||
extract_packed[1] |= (((j << 4) + 128) << (i * 8 - 32));
|
||||
j ++;
|
||||
}
|
||||
}
|
||||
|
||||
extract_packed[2] = extract_packed[3] = 0x10101010;
|
||||
}
|
||||
|
||||
vxSetNodeUniform(nodObj, "extract_packed", 1, extract_packed);
|
||||
|
||||
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
|
||||
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
|
||||
|
||||
return VX_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
static vx_param_description_t vxTransform_InterPKernelParam[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
|
||||
};
|
||||
|
||||
|
||||
vx_status VX_CALLBACK vxTransform_InterPInitializer
|
||||
(
|
||||
vx_node nodObj,
|
||||
const vx_reference *paramObj,
|
||||
vx_uint32 paraNum
|
||||
)
|
||||
{
|
||||
// Alignment with a power of two value.
|
||||
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
|
||||
#define gcmMIN(x, y) (((x) <= (y)) ? (x) : (y))
|
||||
#define gcmMAX(x, y) (((x) >= (y)) ? (x) : (y))
|
||||
#define MAX_MULTIPLIER_NUM (65535)
|
||||
#define MAX_POST_SHIFT_BITS (31)
|
||||
vx_kernel_execution_parameters_t shaderParam = {
|
||||
2, // workdim
|
||||
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
|
||||
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
|
||||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
vx_status status = VX_SUCCESS;
|
||||
vx_tensor input0 = (vx_tensor)paramObj[0];
|
||||
vx_tensor input1 = (vx_tensor)paramObj[1];
|
||||
vx_tensor output = (vx_tensor)paramObj[2];
|
||||
vx_enum src0Format = VSI_NN_TYPE_FLOAT16;
|
||||
vx_enum src1Format = VSI_NN_TYPE_FLOAT16;
|
||||
vx_enum dstFormat = VSI_NN_TYPE_FLOAT16;
|
||||
vx_uint32 coord_size[4] = {1, 1, 1, 1};
|
||||
vx_uint32 input_size[4] = {1, 1, 1, 1};
|
||||
vx_uint32 output_size[4] = {1, 1, 1, 1};
|
||||
vx_uint32 i = 0;
|
||||
vsi_nn_tensor_attr_t attr[3];
|
||||
|
||||
memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
memset(&attr[2], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
|
||||
status = vsi_nn_vxGetTensorAttr(input0, &attr[0]);
|
||||
status |= vsi_nn_vxGetTensorAttr(input1, &attr[1]);
|
||||
status |= vsi_nn_vxGetTensorAttr(output, &attr[2]);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
|
||||
return status;
|
||||
}
|
||||
|
||||
for (i = 0; i < attr[0].dim_num; i++)
|
||||
{
|
||||
input_size[i] = attr[0].size[i];
|
||||
}
|
||||
src0Format = attr[0].dtype.vx_type;
|
||||
src1Format = attr[1].dtype.vx_type;
|
||||
for (i = 0; i < attr[1].dim_num; i++)
|
||||
{
|
||||
coord_size[i] = attr[1].size[i];
|
||||
}
|
||||
dstFormat = attr[2].dtype.vx_type;
|
||||
for (i = 0; i < attr[2].dim_num; i++)
|
||||
{
|
||||
output_size[i] = attr[2].size[i];
|
||||
}
|
||||
|
||||
if ((src0Format == VSI_NN_TYPE_FLOAT16 && src1Format == VSI_NN_TYPE_FLOAT16 && dstFormat == VSI_NN_TYPE_FLOAT16)
|
||||
|| (src0Format == VSI_NN_TYPE_INT16 && src1Format == VSI_NN_TYPE_INT16 && dstFormat == VSI_NN_TYPE_INT16))
|
||||
{
|
||||
shaderParam.globalWorkScale[0] = 2;
|
||||
shaderParam.globalWorkScale[1] = 1;
|
||||
}
|
||||
|
||||
shaderParam.globalWorkSize[0] =
|
||||
gcmALIGN((coord_size[0] + shaderParam.globalWorkScale[0] - 1) / shaderParam.globalWorkScale[0], 4);
|
||||
shaderParam.globalWorkSize[1] =
|
||||
(coord_size[1] + shaderParam.globalWorkScale[1] - 1) / shaderParam.globalWorkScale[1];
|
||||
{
|
||||
vx_int32 packedWH2[2] = {input_size[0], input_size[1]};
|
||||
vx_int32 packedWH = (input_size[1] << 16) | (input_size[0] & 0xFFFF);
|
||||
vx_uint32 uniGetDXY_4x4[16] = {
|
||||
0x05050505, // TCfg
|
||||
0x04040404, // ASelt
|
||||
0x00100001, 0x00010010, // ABin
|
||||
0x09090909, // BSelt
|
||||
0x00010000, 0x00000001, // BBin
|
||||
0x00000101, // AccumType, ConstantType, and PostShift
|
||||
0x3c000000, 0x00000000, 0x3c000000, 0x00000000,
|
||||
0x3c000000, 0x00000000, 0x3c000000, 0x00000000 // Constant
|
||||
};
|
||||
vx_uint32 uniConvertF16toF32_4x4[16] = {
|
||||
0x01010101, // TCfg
|
||||
0x01010000, // ASelt
|
||||
0x00010000, 0x00010000, // ABin
|
||||
0x02020202, // BSelt
|
||||
0x00000000, 0x00000000, // BBin
|
||||
0x00000100, // AccumType, ConstantType, and PostShift
|
||||
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
|
||||
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
|
||||
};
|
||||
|
||||
vxSetNodeUniform(nodObj, "uniGetDXY_4x4", 1, uniGetDXY_4x4);
|
||||
vxSetNodeUniform(nodObj, "uniConvertF16toF32_4x4", 1, uniConvertF16toF32_4x4);
|
||||
|
||||
//packedWH2[0] = input_size[0];
|
||||
//packedWH2[1] = input_size[1];
|
||||
//packedWH = (input_size[1] << 16) | (input_size[0] & 0xFFFF);
|
||||
vxSetNodeUniform(nodObj, "packedWH2", 1, packedWH2);
|
||||
vxSetNodeUniform(nodObj, "packedWH", 1, &packedWH);
|
||||
}
|
||||
if (output_size[2] > 1)
|
||||
{
|
||||
vxSetNodeUniform(nodObj, "depth", 1, &output_size[2]);
|
||||
}
|
||||
|
||||
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
|
||||
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
|
||||
|
||||
return VX_SUCCESS;
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
vx_kernel_description_t vxSpatial_transformer_CPU =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
_VX_KERNEL_NAME,
|
||||
_VX_KERNEL_FUNC_KERNEL,
|
||||
s_params,
|
||||
_cnt_of_array( s_params ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vsi_nn_KernelInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxTransform_GemmKernelInfo_F16toF16 =
|
||||
{
|
||||
VX_KERNEL_ENUM_SPATIAL_TRANSFORMER,
|
||||
VX_KERNEL_NAME_SPATIAL_TRANSFORMER,
|
||||
NULL,
|
||||
vxTransform_GemmKernelParam,
|
||||
(sizeof(vxTransform_GemmKernelParam) / sizeof(vxTransform_GemmKernelParam[0])),
|
||||
vxValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxTransform_GemmInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxTransform_setupThresKernelInfo_F16toF16 =
|
||||
{
|
||||
VX_KERNEL_ENUM_SPATIAL_TRANSFORMER,
|
||||
VX_KERNEL_NAME_TRANSFORM_SETUP_THRES_F16TOF16,
|
||||
NULL,
|
||||
vxTransform_setupThresKernelParam,
|
||||
(sizeof(vxTransform_setupThresKernelParam) / sizeof(vxTransform_setupThresKernelParam[0])),
|
||||
vxValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxTransform_setupThresInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxTransform_InterPKernelInfo_F16toF16_2D =
|
||||
{
|
||||
VX_KERNEL_ENUM_SPATIAL_TRANSFORMER,
|
||||
VX_KERNEL_NAME_TRANSFORM_INTERP_F16TOF16_2D,
|
||||
NULL,
|
||||
vxTransform_InterPKernelParam,
|
||||
(sizeof(vxTransform_InterPKernelParam) / sizeof(vxTransform_InterPKernelParam[0])),
|
||||
vxValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxTransform_InterPInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxTransform_InterPKernelInfo_F16toF16 =
|
||||
{
|
||||
VX_KERNEL_ENUM_SPATIAL_TRANSFORMER,
|
||||
VX_KERNEL_NAME_TRANSFORM_INTERP_F16TOF16,
|
||||
NULL,
|
||||
vxTransform_InterPKernelParam,
|
||||
(sizeof(vxTransform_InterPKernelParam) / sizeof(vxTransform_InterPKernelParam[0])),
|
||||
vxValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxTransform_InterPInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t * vx_kernel_SPATIAL_TRANSFORMER_list[] =
|
||||
{
|
||||
&vxSpatial_transformer_CPU,
|
||||
&vxTransform_setupThresKernelInfo_F16toF16,
|
||||
&vxTransform_GemmKernelInfo_F16toF16,
|
||||
&vxTransform_InterPKernelInfo_F16toF16_2D,
|
||||
&vxTransform_InterPKernelInfo_F16toF16,
|
||||
NULL
|
||||
};
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@ -1,124 +0,0 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
#include <stdlib.h>
|
||||
#include <VX/vx_khr_cnn.h>
|
||||
#include <VX/vx_helper.h>
|
||||
#include <VX/vx.h>
|
||||
#include <VX/vx_ext_program.h>
|
||||
|
||||
#include "vsi_nn_pub.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "libnnext/vsi_nn_vxkernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
#define _VX_KERNEL_VAR_CPU (vx_client_kernel_cpu_SYNC_HOST)
|
||||
#define _VX_KERNEL_ID KERNEL_ENUM_SYNC_HOST
|
||||
#define _VX_KERNEL_NAME ("com.vivantecorp.extension.Sync_hostVXC")
|
||||
#define _VX_KERNEL_FUNC_KERNEL (vxSync_hostKernel)
|
||||
|
||||
static vsi_status VX_CALLBACK vxSync_hostKernel
|
||||
(
|
||||
vx_node node,
|
||||
const vx_reference* paramObj,
|
||||
uint32_t paramNum
|
||||
)
|
||||
{
|
||||
vsi_status status = 0;
|
||||
vx_context context = NULL;
|
||||
vx_tensor input = NULL;
|
||||
vx_tensor output = NULL;
|
||||
uint8_t * in_buffer = NULL;
|
||||
uint32_t in_stride[8] = { 0 };
|
||||
vx_tensor_addressing in_addr = NULL;
|
||||
vsi_nn_tensor_attr_t in_attr;
|
||||
|
||||
status = VX_SUCCESS;
|
||||
context = vxGetContext( (vx_reference)node );
|
||||
input = (vx_tensor)paramObj[0];
|
||||
output = (vx_tensor)paramObj[1];
|
||||
memset(&in_attr, 0x0, sizeof(vsi_nn_tensor_attr_t));
|
||||
|
||||
in_buffer = vsi_nn_ConvertRawTensorToData2( context, input,
|
||||
&in_attr, in_stride, &in_addr, VX_READ_ONLY );
|
||||
|
||||
status = vsi_nn_vxCopyDataToTensor(context, output, &in_attr, in_buffer);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vsi_nn_vxCopyDataToTensor failure! at line %d\n", __LINE__);
|
||||
goto OnError;
|
||||
}
|
||||
|
||||
OnError:
|
||||
if( NULL != in_buffer )
|
||||
{
|
||||
free( in_buffer );
|
||||
}
|
||||
return status;
|
||||
} /* _VX_KERNEL_FUNC_KERNEL() */
|
||||
|
||||
static vx_param_description_t s_params[] =
|
||||
{
|
||||
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
|
||||
{ VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
|
||||
};
|
||||
|
||||
vx_status VX_CALLBACK vxSync_hostInitializer
|
||||
(
|
||||
vx_node nodObj,
|
||||
const vx_reference *paramObj,
|
||||
vx_uint32 paraNum
|
||||
)
|
||||
{
|
||||
vx_status status = VX_SUCCESS;
|
||||
/*TODO: Add initial code for VX program*/
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
vx_kernel_description_t _VX_KERNEL_VAR_CPU =
|
||||
{
|
||||
_VX_KERNEL_ID,
|
||||
_VX_KERNEL_NAME,
|
||||
_VX_KERNEL_FUNC_KERNEL,
|
||||
s_params,
|
||||
_cnt_of_array( s_params ),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vsi_nn_KernelInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t * vx_kernel_SYNC_HOST_list[] =
|
||||
{
|
||||
&_VX_KERNEL_VAR_CPU,
|
||||
NULL
|
||||
};
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@ -1,287 +0,0 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2020 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
|
||||
#include "vsi_nn_platform.h"
|
||||
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
#include "utils/vsi_nn_math.h"
|
||||
#include "libnnext/vsi_nn_vxkernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
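/*
 * Copy one row of `width` int16 samples from dataIn into row `index` of
 * dataIO; the height/channel/batch arguments are accepted for interface
 * symmetry but are not used by this reference implementation.
 */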
void tensorStackConcatFunc
|
||||
(
|
||||
int16_t* dataIn,
|
||||
int16_t* dataIO,
|
||||
int32_t index,
|
||||
uint32_t width,
|
||||
uint32_t height,
|
||||
uint32_t channel,
|
||||
uint32_t batch
|
||||
)
|
||||
{
|
||||
int32_t stride = width * sizeof(int16_t);
|
||||
VSILOGI("Hello tensorStackConcatFunc!\n");
|
||||
memcpy(dataIO + index * width, dataIn, stride);
|
||||
return;
|
||||
}
|
||||
vsi_status VX_CALLBACK vxTensorStackConcatKernel
|
||||
(
|
||||
vx_node node,
|
||||
const vx_reference* paramObj,
|
||||
uint32_t paramNum
|
||||
)
|
||||
{
|
||||
vsi_status status = VX_ERROR_INVALID_PARAMETERS;
|
||||
|
||||
if(paramNum == 3)
|
||||
{
|
||||
vx_context context = NULL;
|
||||
// tensor
|
||||
vx_tensor imgObj[2] = { NULL };
|
||||
vsi_nn_tensor_attr_t attr[2];
|
||||
int16_t *input = NULL, *output = NULL;
|
||||
uint32_t input_size[4] = {1, 1, 1, 1}, output_size[4] = {1, 1, 1, 1};
|
||||
uint32_t input_stride_size[4] = {1, 1, 1, 1};
|
||||
uint32_t output_stride_size[4] = {1, 1, 1, 1};
|
||||
vx_tensor_addressing input_user_addr = NULL;
|
||||
vx_tensor_addressing output_user_addr = NULL;
|
||||
vsi_nn_type_e inputFormat = VSI_NN_TYPE_FLOAT16, outputFormat = VSI_NN_TYPE_FLOAT16;
|
||||
uint32_t input_dims = 0, output_dims = 0;
|
||||
uint32_t i;
|
||||
// scalar
|
||||
vx_scalar scalar[1] = { NULL };
|
||||
int32_t index = 0;
|
||||
|
||||
status = VX_SUCCESS;
|
||||
imgObj[0] = (vx_tensor)paramObj[0];
|
||||
imgObj[1] = (vx_tensor)paramObj[1];
|
||||
scalar[0] = (vx_scalar)paramObj[2];
|
||||
memset(&attr[0], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
memset(&attr[1], 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
context = vxGetContext((vx_reference)node);
|
||||
if (context == NULL)
|
||||
{
|
||||
VSILOGE("vxGetContext failure! at line %d\n", __LINE__);
|
||||
return status;
|
||||
}
|
||||
|
||||
status = vsi_nn_vxGetTensorAttr(imgObj[0], &attr[0]);
|
||||
        status |= vsi_nn_vxGetTensorAttr(imgObj[1], &attr[1]);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
|
||||
goto final;
|
||||
}
|
||||
|
||||
//input
|
||||
input_dims = attr[0].dim_num;
|
||||
inputFormat = attr[0].dtype.vx_type;
|
||||
for (i = 0; i < input_dims; i++)
|
||||
{
|
||||
input_size[i] = attr[0].size[i];
|
||||
}
|
||||
//output
|
||||
output_dims = attr[1].dim_num;
|
||||
outputFormat = attr[1].dtype.vx_type;
|
||||
for (i = 0; i < output_dims; i++)
|
||||
{
|
||||
output_size[i] = attr[1].size[i];
|
||||
}
|
||||
|
||||
input_size[2] = (input_dims <= 2)?1:input_size[2];
|
||||
input_size[3] = (input_dims <= 3)?1:input_size[3];
|
||||
input_stride_size[0] = vsi_nn_GetTypeBytes(inputFormat);
|
||||
for (i=1; i< input_dims; i++)
|
||||
{
|
||||
input_stride_size[i] = input_stride_size[i-1] * input_size[i-1];
|
||||
}
|
||||
input = (int16_t*)malloc(input_size[0]*input_size[1]*input_size[2]*sizeof(int16_t));
|
||||
input_user_addr = vxCreateTensorAddressing(context, input_size, input_stride_size, (vx_uint8)input_dims);
|
||||
vsi_nn_copy_tensor_patch(imgObj[0], &attr[0], input, VX_READ_ONLY);
|
||||
output_stride_size[0] = vsi_nn_GetTypeBytes(outputFormat);
|
||||
for (i=1; i< output_dims; i++)
|
||||
{
|
||||
output_stride_size[i] = output_stride_size[i-1] * output_size[i-1];
|
||||
}
|
||||
output = (int16_t*)malloc(output_size[0]*output_size[1]*output_size[2]*sizeof(int16_t));
|
||||
output_user_addr = vxCreateTensorAddressing(context, output_size,
|
||||
output_stride_size, (vx_uint8)output_dims);
|
||||
|
||||
vsi_nn_copy_tensor_patch(imgObj[1], &attr[1], output, VX_READ_ONLY);
|
||||
// scalar
|
||||
status = vxCopyScalar(scalar[0], &index, VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vxCopyScalar failure! at line %d\n", __LINE__);
|
||||
goto final;
|
||||
}
|
||||
// Call C Prototype
|
||||
tensorStackConcatFunc(input, output, index, input_size[0],
|
||||
input_size[1], input_size[2], input_size[3]);
|
||||
//output tensor
|
||||
vsi_nn_copy_tensor_patch(imgObj[1], &attr[1], output, VX_WRITE_ONLY);
|
||||
final:
|
||||
if(input) free(input);
|
||||
if(output) free(output);
|
||||
if(input_user_addr) vxReleaseTensorAddressing(&input_user_addr);
|
||||
if(output_user_addr) vxReleaseTensorAddressing(&output_user_addr);
|
||||
}
|
||||
return status;
|
||||
}
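For reference, the CPU fallback above just copies one input row into row `index` of the stacked output; a minimal host-side sketch of that copy, assuming 16-bit elements (names and sizes here are illustrative, not part of the kernel sources):

/* Minimal sketch of the stack-concat row copy done by tensorStackConcatFunc,
 * assuming 16-bit elements; names and sizes are illustrative only. */
#include <stdint.h>
#include <string.h>

static void stack_concat_row(int16_t *stacked, const int16_t *row,
                             int32_t index, uint32_t width)
{
    /* Row `index` of the stacked output receives the whole input row. */
    memcpy(stacked + (size_t)index * width, row, width * sizeof(int16_t));
}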
|
||||
vsi_status VX_CALLBACK vxTensorStackConcatInitializer
|
||||
(
|
||||
vx_node nodObj,
|
||||
const vx_reference *paramObj,
|
||||
uint32_t paraNum
|
||||
)
|
||||
{
|
||||
vsi_status status = VX_SUCCESS;
|
||||
// Alignment with a power of two value.
|
||||
#define gcmALIGN(n, align) (((n) + ((align) - 1)) & ~((align) - 1))
|
||||
vx_kernel_execution_parameters_t shaderParam = {
|
||||
3, // workdim
|
||||
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
|
||||
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
|
||||
{0, 0, 0}, // localWorkSize: local group size in thread
|
||||
{0, 0, 0}}; // globalWorkSize: image size in thread
|
||||
|
||||
vx_tensor input = (vx_tensor)paramObj[0];
|
||||
uint32_t input_size[4] = {1, 1, 1, 1};
|
||||
uint32_t input_dims = 0;
|
||||
vsi_nn_type_e inputDataFormat = VSI_NN_TYPE_FLOAT16;
|
||||
vsi_nn_tensor_attr_t attr;
|
||||
uint32_t i;
|
||||
|
||||
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
status = vsi_nn_vxGetTensorAttr(input, &attr);
|
||||
if (status != VX_SUCCESS)
|
||||
{
|
||||
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
|
||||
return status;
|
||||
}
|
||||
|
||||
input_dims = attr.dim_num;
|
||||
inputDataFormat = attr.dtype.vx_type;
|
||||
for (i = 0; i < input_dims; i++)
|
||||
{
|
||||
input_size[i] = attr.size[i];
|
||||
}
|
||||
input_size[2] = (input_dims <= 2)?1:input_size[2];
|
||||
|
||||
shaderParam.globalWorkOffset[0] = 0;
|
||||
shaderParam.globalWorkOffset[1] = 0;
|
||||
shaderParam.globalWorkOffset[2] = 0;
|
||||
if (inputDataFormat == VSI_NN_TYPE_FLOAT16 || inputDataFormat == VSI_NN_TYPE_INT16)
|
||||
shaderParam.globalWorkScale[0] = 16;
|
||||
else
|
||||
shaderParam.globalWorkScale[0] = 32;
|
||||
shaderParam.globalWorkScale[1] = 1;
|
||||
shaderParam.globalWorkScale[2] = 1;
|
||||
shaderParam.globalWorkSize[0] = gcmALIGN((input_size[0] + shaderParam.globalWorkScale[0] - 1)
|
||||
/ shaderParam.globalWorkScale[0], 4);
|
||||
shaderParam.globalWorkSize[1] = (input_size[1] + shaderParam.globalWorkScale[1] - 1)
|
||||
/ shaderParam.globalWorkScale[1];
|
||||
shaderParam.globalWorkSize[2] = (input_size[2] + shaderParam.globalWorkScale[2] - 1)
|
||||
/ shaderParam.globalWorkScale[2];
|
||||
|
||||
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
|
||||
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
|
||||
if(status < 0)
|
||||
{
|
||||
VSILOGE("[%s : %d]Initializer failure! \n",__FILE__, __LINE__);
|
||||
}
|
||||
return status;
|
||||
}
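A hedged numeric walk-through of the dispatch math above, for an assumed FP16 input of 100x64x3 (globalWorkScale comes out as {16, 1, 1} for that type):

/* Same rounding as the initializer above, written out for one assumed shape.
 * gcmALIGN rounds up to a multiple of a power-of-two alignment. */
#define gcmALIGN(n, align) (((n) + ((align) - 1)) & ~((align) - 1))

/* FP16 input of 100 x 64 x 3, globalWorkScale = {16, 1, 1}:
 *   x: gcmALIGN((100 + 16 - 1) / 16, 4) = gcmALIGN(7, 4) = 8
 *   y: (64 + 1 - 1) / 1                 = 64
 *   z: (3  + 1 - 1) / 1                 = 3
 */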
|
||||
static vx_param_description_t vxTensorStackConcatKernelParam[] =
|
||||
{
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}
|
||||
};
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
vx_kernel_description_t vxTensorStackConcatKernelInfo =
|
||||
{
|
||||
VX_KERNEL_ENUM_TENSORSTACKCONCAT,
|
||||
VX_KERNEL_NAME_TENSORSTACKCONCAT,
|
||||
NULL,
|
||||
vxTensorStackConcatKernelParam,
|
||||
(sizeof(vxTensorStackConcatKernelParam) / sizeof(vxTensorStackConcatKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxTensorStackConcatInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxTensorStackConcatKernelInfo8Bits =
|
||||
{
|
||||
VX_KERNEL_ENUM_TENSORSTACKCONCAT8BITS,
|
||||
VX_KERNEL_NAME_TENSORSTACKCONCAT8BITS,
|
||||
NULL,
|
||||
vxTensorStackConcatKernelParam,
|
||||
(sizeof(vxTensorStackConcatKernelParam) / sizeof(vxTensorStackConcatKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vxTensorStackConcatInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t vxTensorStackConcatKernelInfo_CPU =
|
||||
{
|
||||
VX_KERNEL_ENUM_TENSORSTACKCONCAT,
|
||||
VX_KERNEL_NAME_TENSORSTACKCONCAT,
|
||||
vxTensorStackConcatKernel,
|
||||
vxTensorStackConcatKernelParam,
|
||||
(sizeof(vxTensorStackConcatKernelParam) / sizeof(vxTensorStackConcatKernelParam[0])),
|
||||
vsi_nn_KernelValidator,
|
||||
NULL,
|
||||
NULL,
|
||||
vsi_nn_KernelInitializer,
|
||||
vsi_nn_KernelDeinitializer
|
||||
};
|
||||
|
||||
vx_kernel_description_t * vx_kernel_TENSORSTACKCONCAT_list[] =
|
||||
{
|
||||
&vxTensorStackConcatKernelInfo_CPU,
|
||||
&vxTensorStackConcatKernelInfo,
|
||||
&vxTensorStackConcatKernelInfo8Bits,
|
||||
NULL
|
||||
};
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
@ -30,24 +30,27 @@ __kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst1( \
|
|||
_viv_asm(COPY, mean, _mean, 16); \
|
||||
VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, var, _var, 16); \
|
||||
float4 gamma0 = read_imagef(Gamma, coord); \
|
||||
coord.x += 4; \
|
||||
float4 gamma1 = read_imagef(Gamma, coord); \
|
||||
coord.x -= 4; \
|
||||
float4 beta = read_imagef(Beta, coord); \
|
||||
int4 coord_in = coord; \
|
||||
int depth = get_image_array_size(Gamma); \
|
||||
_viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \
|
||||
float4 gamma = read_imagef(Gamma, coord_in); \
|
||||
coord_in.z = coord.z; \
|
||||
depth = get_image_array_size(Beta); \
|
||||
_viv_asm(CLAMP0MAX, coord_in.z, coord_in.z, depth - 1); \
|
||||
float4 beta = read_imagef(Beta, coord_in); \
|
||||
\
|
||||
float4 src0, src1, m, v; \
|
||||
VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \
|
||||
VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \
|
||||
VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \
|
||||
gamma0 = gamma0 * rsqrt(v + eps); \
|
||||
float4 gamma0 = gamma.xxxx * rsqrt(v + eps); \
|
||||
src0 = src0 * input_scale + input_tail; \
|
||||
src0 = (src0 - m) * gamma0 + beta.xxxx; \
|
||||
src0 = src0 * output_scale + output_zp; \
|
||||
VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \
|
||||
VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \
|
||||
VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \
|
||||
gamma1 = gamma1 * rsqrt(v + eps); \
|
||||
float4 gamma1 = gamma.xxxx * rsqrt(v + eps); \
|
||||
src1 = src1 * input_scale + input_tail; \
|
||||
src1 = (src1 - m) * gamma1 + beta.xxxx; \
|
||||
src1 = src1 * output_scale + output_zp; \
|
||||
|
|
@ -95,22 +98,21 @@ __kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst1_2D( \
|
|||
_viv_asm(COPY, mean, _mean, 16); \
|
||||
VXC_ReadImage(_var, Variance, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, var, _var, 16); \
|
||||
float4 gamma0 = read_imagef(Gamma, coord.xy); \
|
||||
float4 gamma1 = read_imagef(Gamma, coord.zy); \
|
||||
float4 gamma = read_imagef(Gamma, coord.xy); \
|
||||
float4 beta = read_imagef(Beta, coord.xy); \
|
||||
\
|
||||
float4 src0, src1, m, v; \
|
||||
VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \
|
||||
VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \
|
||||
VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \
|
||||
gamma0 = gamma0 * rsqrt(v + eps); \
|
||||
float4 gamma0 = gamma.xxxx * rsqrt(v + eps); \
|
||||
src0 = src0 * input_scale + input_tail; \
|
||||
src0 = (src0 - m) * gamma0 + beta.xxxx; \
|
||||
src0 = src0 * output_scale + output_zp; \
|
||||
VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \
|
||||
VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \
|
||||
VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \
|
||||
gamma1 = gamma1 * rsqrt(v + eps); \
|
||||
float4 gamma1 = gamma.xxxx * rsqrt(v + eps); \
|
||||
src1 = src1 * input_scale + input_tail; \
|
||||
src1 = (src1 - m) * gamma1 + beta.xxxx; \
|
||||
src1 = src1 * output_scale + output_zp; \
|
||||
|
|
@ -158,12 +160,18 @@ __kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst0( \
|
|||
_viv_asm(COPY, mean, _mean, 16); \
|
||||
VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, var, _var, 16); \
|
||||
float4 gamma0 = read_imagef(Gamma, coord); \
|
||||
float4 beta0 = read_imagef(Beta, coord); \
|
||||
coord.x += 4; \
|
||||
float4 gamma1 = read_imagef(Gamma, coord); \
|
||||
float4 beta1 = read_imagef(Beta, coord); \
|
||||
coord.x -= 4; \
|
||||
int4 coord_in0 = coord; \
|
||||
int depth = get_image_array_size(Gamma); \
|
||||
_viv_asm(CLAMP0MAX, coord_in0.z, coord_in0.z, depth - 1); \
|
||||
float4 gamma0 = read_imagef(Gamma, coord_in0); \
|
||||
int4 coord_in1 = coord; \
|
||||
depth = get_image_array_size(Beta); \
|
||||
_viv_asm(CLAMP0MAX, coord_in1.z, coord_in1.z, depth - 1); \
|
||||
float4 beta0 = read_imagef(Beta, coord_in1); \
|
||||
coord_in0.x += 4; \
|
||||
coord_in1.x += 4; \
|
||||
float4 gamma1 = read_imagef(Gamma, coord_in0); \
|
||||
float4 beta1 = read_imagef(Beta, coord_in1); \
|
||||
\
|
||||
float4 src0, src1, m, v; \
|
||||
VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \
|
||||
|
|
@ -264,4 +272,3 @@ BATCH_NORM_SH_IMPL_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar
|
|||
BATCH_NORM_SH_IMPL_AXIS1_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8)
|
||||
BATCH_NORM_SH_IMPL_AXIS1_2D(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16)
|
||||
BATCH_NORM_SH_IMPL_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)
|
||||
|
||||
|
|
|
|||
|
|
@ -83,14 +83,6 @@ __kernel void conv1d_U8U8I32toU8_K1024_SMALL(
|
|||
VXC_WriteImage(output, coord.wy, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
}
|
||||
|
||||
inline uchar* get_image2D_array_ptr(image2d_array_t input)
|
||||
{
|
||||
int8 desc;
|
||||
_viv_asm(COPY, desc, input, sizeof(desc));
|
||||
uchar *src_ptr = (uchar*)desc.s0;
|
||||
return src_ptr;
|
||||
}
|
||||
|
||||
__kernel void conv1d_U8U8I32toU8_K1024_LARGE(
|
||||
__read_only image2d_array_t input,
|
||||
__read_only image2d_array_t weight,
|
||||
|
|
@ -112,9 +104,11 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(
|
|||
vxc_short8 w_zp = (short)weight_ZP;
|
||||
vxc_uchar16 input_val = 0, weight_val = 0;
|
||||
int temp = 0, i, j;
|
||||
uchar *src_ptr_base = (uchar *)get_image2D_array_ptr(input);
|
||||
Tensor src_tensor = create_image_from_image2d(input, 1);
|
||||
uchar *src_ptr_base = (uchar *)src_tensor.ptr;
|
||||
uchar *src_ptr;
|
||||
uchar *dst_ptr = (uchar *)get_image2D_array_ptr(output);
|
||||
Tensor dst_tensor = create_image_from_image2d(output, 1);
|
||||
uchar *dst_ptr = (uchar *)dst_tensor.ptr;
|
||||
|
||||
temp = read_imagei(bias, coord.yz).x;
|
||||
sum0 = convert_float(temp);
|
||||
|
|
@ -122,7 +116,7 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(
|
|||
|
||||
for (i = 0; i < input_height; i++)
|
||||
{
|
||||
src_ptr = src_ptr_base + (coord.x + coord.z * input_width);
|
||||
src_ptr = src_ptr_base + (coord.x + coord.z * src_tensor.stride_y);
|
||||
for (j = 0; j < kernel_cnt_x16; j++)
|
||||
{
|
||||
VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \
|
||||
|
|
@ -161,7 +155,7 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(
|
|||
_viv_asm(CONV_SAT_RTE, result1, sum1);
|
||||
vxc_uchar8 result;
|
||||
VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8);
|
||||
dst_ptr = dst_ptr + (coord.w + coord.y * output_width);
|
||||
dst_ptr = dst_ptr + (coord.w + coord.y * dst_tensor.stride_y);
|
||||
VXC_Vstore8(dst_ptr, 0, result);
|
||||
}
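The switch from the local descriptor helper to create_image_from_image2d means the row stride comes from the image helper rather than the logical width, so padded rows are addressed correctly. A small host-side sketch of the addressing pattern used above; the struct layout is an assumption, and only the ptr/stride_y fields mirror the kernel code:

/* Hypothetical mirror of the addressing above: for a 1-byte-per-element
 * image, element (x, row) lives at ptr + x + row * stride_y. */
typedef struct
{
    unsigned char *ptr;      /* base address of the image data       */
    int            stride_y; /* bytes between the starts of two rows */
} image_view_t;

static unsigned char *image_elem_u8(image_view_t img, int x, int row)
{
    return img.ptr + x + row * img.stride_y;
}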
|
||||
|
||||
|
|
|
|||
|
|
@ -72,6 +72,56 @@ float4 eltwise_unary_round(float4 x)
|
|||
return convert_float4(convert_int4_rte(x));
|
||||
}

#define MUL2_RSQRTPI (1.1283791670955126f)
float erf_eval(float x)
{
    float res = 0;
    float tmp = x;
    float factorial = 1;
    float x_pow = x;
    float one = 1.0f;
    float n = 1;

    if (x <= -3)
        return -1;
    else if (x >= 3)
        return 1;

    while (fabs(tmp) > 1e-5)
    {
        res += tmp;

        factorial *= n;
        one *= -1;
        x_pow *= x * x;
        tmp = one / factorial * x_pow / (2 * n + 1);

        n += 1.0f;
    }
    return res * MUL2_RSQRTPI;
}
#define RSQRT2 (0.70710678118654752440084436210485f)
float4 eltwise_unary_gelu(float4 x)
{
    float4 erf, data;
    data = x * RSQRT2;
    erf.x = erf_eval(data.x);
    erf.y = erf_eval(data.y);
    erf.z = erf_eval(data.z);
    erf.w = erf_eval(data.w);
    x = 0.5f * x * (1 + erf);

    return x;
}

#define SQRT_2_RCP_PI 0.7978845834732056f
float4 eltwise_unary_hard_gelu(float4 x)
{
    float4 cdf = 0.5f + 0.5f * _tanh(SQRT_2_RCP_PI *
                 (x + 0.044715f * x * x * x));
    return x * cdf;
}
|
||||
|
||||
_viv_uniform float inputScale;
|
||||
_viv_uniform float inputTail;
|
||||
_viv_uniform float outputScale;
|
||||
|
|
@ -203,6 +253,28 @@ ELTSISE_UNARY_2D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc
|
|||
ELTSISE_UNARY_2D(round, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
|
||||
//GELU
|
||||
ELTSISE_UNARY_2D(gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
|
||||
ELTSISE_UNARY_2D(gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
|
||||
ELTSISE_UNARY_2D(gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
|
||||
ELTSISE_UNARY_2D(gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
|
||||
ELTSISE_UNARY_2D(gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
|
||||
//HARD_GELU
|
||||
ELTSISE_UNARY_2D(hard_gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(hard_gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
|
||||
ELTSISE_UNARY_2D(hard_gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
|
||||
ELTSISE_UNARY_2D(hard_gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(hard_gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
|
||||
ELTSISE_UNARY_2D(hard_gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(hard_gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
|
||||
ELTSISE_UNARY_2D(hard_gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(hard_gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_2D(hard_gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
|
||||
|
||||
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
|
||||
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
|
||||
|
|
@ -252,3 +324,7 @@ ELTSISE_UNARY_BF16_2D(mish)
|
|||
ELTSISE_UNARY_BF16_2D(hard_sigmoid)
|
||||
//ROUND
|
||||
ELTSISE_UNARY_BF16_2D(round)
|
||||
//GELU
|
||||
ELTSISE_UNARY_BF16_2D(gelu)
|
||||
//HARD_GELU
|
||||
ELTSISE_UNARY_BF16_2D(hard_gelu)
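The 2D and 3D kernel files share the same series-based erf; the hard_gelu variant instead uses the tanh approximation 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3))). Below is a standalone host check of the series against libm, useful when validating the GELU shaders; it is an illustrative sketch, not part of the kernel sources (build with -lm):

/* Host-side check of the series used by eltwise_unary_gelu above:
 * erf(x) ~ 2/sqrt(pi) * sum_n (-1)^n x^(2n+1) / (n! * (2n+1)),
 * clamped to +/-1 outside [-3, 3] exactly as the kernel does. */
#include <math.h>
#include <stdio.h>

static float erf_series(float x)
{
    float res = 0.0f, tmp = x, factorial = 1.0f, x_pow = x, sign = 1.0f, n = 1.0f;
    if (x <= -3.0f) return -1.0f;
    if (x >=  3.0f) return  1.0f;
    while (fabsf(tmp) > 1e-5f)
    {
        res += tmp;
        factorial *= n;
        sign *= -1.0f;
        x_pow *= x * x;
        tmp = sign / factorial * x_pow / (2.0f * n + 1.0f);
        n += 1.0f;
    }
    return res * 1.1283791670955126f;   /* 2 / sqrt(pi) */
}

int main(void)
{
    for (float x = -2.0f; x <= 2.0f; x += 0.5f)
    {
        float gelu = 0.5f * x * (1.0f + erf_series(x * 0.70710678f));
        float ref  = 0.5f * x * (1.0f + erff(x * 0.70710678f));
        printf("x=% .2f  gelu_series=% .6f  gelu_libm=% .6f\n", x, gelu, ref);
    }
    return 0;
}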
|
||||
|
|
|
|||
|
|
@ -72,6 +72,56 @@ float4 eltwise_unary_round(float4 x)
|
|||
return convert_float4(convert_int4_rte(x));
|
||||
}
|
||||
|
||||
#define MUL2_RSQRTPI (1.1283791670955126f)
|
||||
float erf_eval(float x)
|
||||
{
|
||||
float res = 0;
|
||||
float tmp = x;
|
||||
float factorial = 1;
|
||||
float x_pow = x;
|
||||
float one = 1.0f;
|
||||
float n = 1;
|
||||
|
||||
if (x <= -3)
|
||||
return -1;
|
||||
else if(x >= 3)
|
||||
return 1;
|
||||
|
||||
while (fabs(tmp) > 1e-5)
|
||||
{
|
||||
res += tmp;
|
||||
|
||||
factorial *= n;
|
||||
one *= -1;
|
||||
x_pow *= x * x;
|
||||
tmp = one / factorial * x_pow / ( 2 * n + 1);
|
||||
|
||||
n += 1.0f;
|
||||
}
|
||||
return res * MUL2_RSQRTPI;
|
||||
}
|
||||
#define RSQRT2 (0.70710678118654752440084436210485f)
|
||||
float4 eltwise_unary_gelu(float4 x)
|
||||
{
|
||||
float4 erf, data;
|
||||
data = x * RSQRT2;
|
||||
erf.x = erf_eval(data.x);
|
||||
erf.y = erf_eval(data.y);
|
||||
erf.z = erf_eval(data.z);
|
||||
erf.w = erf_eval(data.w);
|
||||
x = 0.5f * x * (1 + erf);
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
#define SQRT_2_RCP_PI 0.7978845834732056f
|
||||
float4 eltwise_unary_hard_gelu(float4 x)
|
||||
{
|
||||
float4 cdf = 0.5f + 0.5f * _tanh(SQRT_2_RCP_PI *
|
||||
(x + 0.044715f * x * x * x));
|
||||
return x * cdf;
|
||||
}
|
||||
|
||||
_viv_uniform float inputScale;
|
||||
_viv_uniform float inputTail;
|
||||
_viv_uniform float outputScale;
|
||||
|
|
@ -203,6 +253,28 @@ ELTSISE_UNARY_3D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc
|
|||
ELTSISE_UNARY_3D(round, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
|
||||
//GELU
|
||||
ELTSISE_UNARY_3D(gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
|
||||
ELTSISE_UNARY_3D(gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
|
||||
ELTSISE_UNARY_3D(gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
|
||||
ELTSISE_UNARY_3D(gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
|
||||
ELTSISE_UNARY_3D(gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
|
||||
//HARD_GELU
|
||||
ELTSISE_UNARY_3D(hard_gelu, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(hard_gelu, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
|
||||
ELTSISE_UNARY_3D(hard_gelu, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
|
||||
ELTSISE_UNARY_3D(hard_gelu, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(hard_gelu, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
|
||||
ELTSISE_UNARY_3D(hard_gelu, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(hard_gelu, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
|
||||
ELTSISE_UNARY_3D(hard_gelu, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(hard_gelu, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
|
||||
ELTSISE_UNARY_3D(hard_gelu, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
|
||||
|
||||
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
|
||||
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
|
||||
|
|
@ -250,4 +322,8 @@ ELTSISE_UNARY_BF16(mish)
|
|||
//HARD_SIGMOID
|
||||
ELTSISE_UNARY_BF16(hard_sigmoid)
|
||||
//ROUND
|
||||
ELTSISE_UNARY_BF16(round)
|
||||
//GELU
|
||||
ELTSISE_UNARY_BF16(gelu)
|
||||
//HARD_GELU
|
||||
ELTSISE_UNARY_BF16(hard_gelu)
|
||||
|
|
@ -1,8 +1,9 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
#define MUL2_RSQRTPI (1.1283791670955126f)
|
||||
float eltwise_unary_erf(float x)
|
||||
float eltwise_unary_erf(float _x)
|
||||
{
|
||||
float x = clamp(_x, -2.0f, 2.0f);
|
||||
float res = 0;
|
||||
float tmp = x;
|
||||
float factorial = 1;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,185 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform float4 theta_1;
|
||||
_viv_uniform float4 theta_2;
|
||||
_viv_uniform float4 scale;
|
||||
_viv_uniform float input_scale;
|
||||
_viv_uniform float input_tail;
|
||||
|
||||
#define GET_MATRIX_SH_IMPL(name0, in_type, read_func) \
|
||||
__kernel void get_matrix_##name0##toF32 \
|
||||
( \
|
||||
__read_only image2d_t input, \
|
||||
__write_only image2d_t output, \
|
||||
int has_theta_1_1, \
|
||||
int has_theta_1_2, \
|
||||
int has_theta_1_3, \
|
||||
int has_theta_2_1, \
|
||||
int has_theta_2_2, \
|
||||
int has_theta_2_3, \
|
||||
float theta_1_1, \
|
||||
float theta_1_2, \
|
||||
float theta_1_3, \
|
||||
float theta_2_1, \
|
||||
float theta_2_2, \
|
||||
float theta_2_3, \
|
||||
float i_width, \
|
||||
float i_height, \
|
||||
float o_width, \
|
||||
float o_height \
|
||||
) \
|
||||
{ \
|
||||
int2 coord = (int2)(0, get_global_id(1)); \
|
||||
float4 matrix0, matrix1; \
|
||||
float4 theta1, theta2; \
|
||||
_viv_asm(COPY, theta1, theta_1, 16); \
|
||||
_viv_asm(COPY, theta2, theta_2, 16); \
|
||||
\
|
||||
if (has_theta_1_1 == 0) \
|
||||
{ \
|
||||
in_type data = read_func(input, coord); \
|
||||
coord.x ++; \
|
||||
theta1.x = convert_float(data.x) * input_scale + input_tail; \
|
||||
} \
|
||||
\
|
||||
if (has_theta_1_2 == 0) \
|
||||
{ \
|
||||
in_type data = read_func(input, coord); \
|
||||
coord.x ++; \
|
||||
theta1.y = convert_float(data.x) * input_scale + input_tail; \
|
||||
} \
|
||||
\
|
||||
if (has_theta_1_3 == 0) \
|
||||
{ \
|
||||
in_type data = read_func(input, coord); \
|
||||
coord.x ++; \
|
||||
theta1.z = convert_float(data.x) * input_scale + input_tail; \
|
||||
} \
|
||||
\
|
||||
if (has_theta_2_1 == 0) \
|
||||
{ \
|
||||
in_type data = read_func(input, coord); \
|
||||
coord.x ++; \
|
||||
theta2.x = convert_float(data.x) * input_scale + input_tail; \
|
||||
} \
|
||||
\
|
||||
if (has_theta_2_2 == 0) \
|
||||
{ \
|
||||
in_type data = read_func(input, coord); \
|
||||
coord.x ++; \
|
||||
theta2.y = convert_float(data.x) * input_scale + input_tail; \
|
||||
} \
|
||||
\
|
||||
if (has_theta_2_3 == 0) \
|
||||
{ \
|
||||
in_type data = read_func(input, coord); \
|
||||
coord.x ++; \
|
||||
theta2.z = convert_float(data.x) * input_scale + input_tail; \
|
||||
} \
|
||||
\
|
||||
matrix0.x = theta2.y * scale.x; \
|
||||
matrix0.z = theta2.x * scale.z; \
|
||||
matrix1.x = ( theta2.z - theta2.y - theta2.x + 1) * i_width * 0.5f; \
|
||||
matrix0.y = theta1.y * scale.w; \
|
||||
matrix0.w = theta1.x * scale.y; \
|
||||
matrix1.y = ( theta1.z - theta1.y - theta1.x + 1) * i_height * 0.5f; \
|
||||
matrix1.zw = 2.0f * matrix0.xy; \
|
||||
\
|
||||
coord.x = 0; \
|
||||
vxc_ushort8 dst; \
|
||||
_viv_asm(COPY, dst, matrix0, 16); \
|
||||
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, dst, matrix1, 16); \
|
||||
coord.x = 8; \
|
||||
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
}
|
||||
GET_MATRIX_SH_IMPL(I16, int4, read_imagei)
|
||||
GET_MATRIX_SH_IMPL(I8, int4, read_imagei)
|
||||
GET_MATRIX_SH_IMPL(U8, uint4, read_imageui)
|
||||
|
||||
__kernel void get_matrix_F16toF32
|
||||
(
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
int has_theta_1_1,
|
||||
int has_theta_1_2,
|
||||
int has_theta_1_3,
|
||||
int has_theta_2_1,
|
||||
int has_theta_2_2,
|
||||
int has_theta_2_3,
|
||||
float theta_1_1,
|
||||
float theta_1_2,
|
||||
float theta_1_3,
|
||||
float theta_2_1,
|
||||
float theta_2_2,
|
||||
float theta_2_3,
|
||||
float i_width,
|
||||
float i_height,
|
||||
float o_width,
|
||||
float o_height
|
||||
)
|
||||
{
|
||||
int2 coord = (int2)(0, get_global_id(1));
|
||||
float4 matrix0, matrix1;
|
||||
float4 theta1, theta2;
|
||||
_viv_asm(COPY, theta1, theta_1, 16);
|
||||
_viv_asm(COPY, theta2, theta_2, 16);
|
||||
|
||||
if (has_theta_1_1 == 0)
|
||||
{
|
||||
float4 data = read_imagef(input, coord);
|
||||
coord.x ++;
|
||||
theta1.x = data.x;
|
||||
}
|
||||
|
||||
if (has_theta_1_2 == 0)
|
||||
{
|
||||
float4 data = read_imagef(input, coord);
|
||||
coord.x ++;
|
||||
theta1.y = data.x;
|
||||
}
|
||||
|
||||
if (has_theta_1_3 == 0)
|
||||
{
|
||||
float4 data = read_imagef(input, coord);
|
||||
coord.x ++;
|
||||
theta1.z = data.x;
|
||||
}
|
||||
|
||||
if (has_theta_2_1 == 0)
|
||||
{
|
||||
float4 data = read_imagef(input, coord);
|
||||
coord.x ++;
|
||||
theta2.x = data.x;
|
||||
}
|
||||
|
||||
if (has_theta_2_2 == 0)
|
||||
{
|
||||
float4 data = read_imagef(input, coord);
|
||||
coord.x ++;
|
||||
theta2.y = data.x;
|
||||
}
|
||||
|
||||
if (has_theta_2_3 == 0)
|
||||
{
|
||||
float4 data = read_imagef(input, coord);
|
||||
coord.x ++;
|
||||
theta2.z = data.x;
|
||||
}
|
||||
|
||||
matrix0.x = theta2.y * scale.x;
|
||||
matrix0.z = theta2.x * scale.z;
|
||||
matrix1.x = ( theta2.z - theta2.y - theta2.x + 1) * i_width * 0.5f;
|
||||
matrix0.y = theta1.y * scale.w;
|
||||
matrix0.w = theta1.x * scale.y;
|
||||
matrix1.y = ( theta1.z - theta1.y - theta1.x + 1) * i_height * 0.5f;
|
||||
matrix1.zw = 2.0f * matrix0.xy;
|
||||
|
||||
coord.x = 0;
|
||||
vxc_ushort8 dst;
|
||||
_viv_asm(COPY, dst, matrix0, 16);
|
||||
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
_viv_asm(COPY, dst, matrix1, 16);
|
||||
coord.x = 8;
|
||||
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
}
|
||||
|
|
@ -16,7 +16,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean
|
|||
int gidx = get_global_id(0) << 3;
|
||||
int lidx = get_local_id(0);
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(gidx, 0, gidz, 0);
|
||||
int4 coord = (int4)(gidx, 0, gidz, gidz);
|
||||
vxc_short8 src0;
|
||||
vxc_half8 in_h;
|
||||
vxc_float4 sumsqr;
|
||||
|
|
@ -133,7 +133,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t
|
|||
image2d_array_t output, float eps, int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_para = (int4)(gidz, 0, 0, 0);
|
||||
vxc_short8 src0;
|
||||
vxc_short8 src1;
|
||||
|
|
@ -166,18 +167,20 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t
|
|||
int8 input_desc, output_desc;
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
|
||||
int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;
|
||||
_viv_asm(MOV, coord.z, baseAddr_a);
|
||||
_viv_asm(MOV, coord_in.z, baseAddr_a);
|
||||
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
|
||||
int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;
|
||||
_viv_asm(MOV, coord.w, baseAddr);
|
||||
_viv_asm(MOV, coord.z, baseAddr);
|
||||
|
||||
for(coord.y = 0; coord.y < height; coord.y++)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
_viv_asm(COPY, in_h, src0, 16);
|
||||
|
||||
coord_in.y ++;
|
||||
|
||||
VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
UniFP16toFP32Lo4_dp4x4);
|
||||
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -191,7 +194,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t
|
|||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
|
||||
uniConvertHalfToFp16_2x8);
|
||||
_viv_asm(COPY, outval, dst, 16);
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, outval, \
|
||||
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean
|
|||
int gidx = get_global_id(0) << 3;
|
||||
int lidx = get_local_id(0);
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(gidx, 0, gidz, 0);
|
||||
int4 coord = (int4)(gidx, 0, gidz, gidz);
|
||||
vxc_short8 src0;
|
||||
float sum = 0, sqr = 0;
|
||||
vxc_float4 sumsqr = (vxc_float4)(0);
|
||||
|
|
@ -43,7 +43,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean
|
|||
{
|
||||
for(coord.y = 0; coord.y < height;)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord, 0, \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
coord.y++;
|
||||
VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -106,7 +106,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean
|
|||
{
|
||||
for(; coord.y < endH;)
|
||||
{
|
||||
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),
|
||||
VXC_ReadImage(src0, input, coord, 0,
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
coord.y++;
|
||||
VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -154,7 +154,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_para = (int4)(gidz, 0, 0, 0);
|
||||
vxc_short8 src0;
|
||||
vxc_short8 src1;
|
||||
|
|
@ -162,7 +163,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
float scale_vari, bias_val;
|
||||
vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);
|
||||
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, 0,\
|
||||
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
_viv_asm(COPY, scale_h, src1, 16);
|
||||
VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -190,16 +191,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
int8 input_desc, output_desc;
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
|
||||
int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;
|
||||
_viv_asm(MOV, coord.z, baseAddr_a);
|
||||
_viv_asm(MOV, coord_in.z, baseAddr_a);
|
||||
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
|
||||
int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;
|
||||
_viv_asm(MOV, coord.w, baseAddr);
|
||||
_viv_asm(MOV, coord.z, baseAddr);
|
||||
|
||||
for(coord.y = 0; coord.y < height; coord.y++)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord_in, 0, \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
coord_in.y ++;
|
||||
VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
uniConvertInt16Fp32Fst_4x4);
|
||||
VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -213,7 +215,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
|
||||
uniConvertHalfToFp16_2x8);
|
||||
_viv_asm(COPY, outval, dst, 16);
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, outval, \
|
||||
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
|
||||
}
|
||||
}
|
||||
|
|
@ -238,7 +240,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
float scale_vari, bias_val;
|
||||
vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);
|
||||
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, 0,\
|
||||
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
_viv_asm(COPY, scale_h, src1, 16);
|
||||
VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -265,7 +267,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
|
||||
for(; coord.y < endH; coord.y++)
|
||||
{
|
||||
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
|
||||
VXC_ReadImage(src0, input, coord.xy, 0,\
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -294,7 +296,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_para = (int4)(gidz, 0, 0, 0);
|
||||
vxc_short8 src0, src2;
|
||||
vxc_short8 src1;
|
||||
|
|
@ -302,7 +305,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
float scale_vari, bias_val;
|
||||
vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);
|
||||
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, 0,\
|
||||
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
_viv_asm(COPY, scale_h, src1, 16);
|
||||
VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);
|
||||
|
|
@ -326,15 +329,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
int8 input_desc, output_desc;
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
|
||||
int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;
|
||||
_viv_asm(MOV, coord.z, baseAddr_a);
|
||||
_viv_asm(MOV, coord_in.z, baseAddr_a);
|
||||
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
|
||||
int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;
|
||||
_viv_asm(MOV, coord.w, baseAddr);
|
||||
_viv_asm(MOV, coord.z, baseAddr);
|
||||
for(coord.y = 0; coord.y < height; coord.y++)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord_in, 0, \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
coord_in.y ++;
|
||||
VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
uniConvertInt16Fp32Fst_4x4);
|
||||
VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -346,7 +350,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
tmpVal1 = convert_int4_rte(norm);
|
||||
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
|
||||
uniConvertInt32toInt16_2x8);
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, src2, \
|
||||
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
|
||||
}
|
||||
}
|
||||
|
|
@ -371,7 +375,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
float scale_vari, bias_val;
|
||||
vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);
|
||||
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, 0,\
|
||||
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
_viv_asm(COPY, scale_h, src1, 16);
|
||||
VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);
|
||||
|
|
@ -394,7 +398,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t
|
|||
|
||||
for(; coord.y < endH; coord.y++)
|
||||
{
|
||||
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
|
||||
VXC_ReadImage(src0, input, coord, 0,\
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
uniConvertInt16Fp32Fst_4x4);
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean
|
|||
int gidx = get_global_id(0) << 4;
|
||||
int lidx = get_local_id(0);
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(gidx, 0, gidz, 0);
|
||||
int4 coord = (int4)(gidx, 0, gidz, gidz);
|
||||
vxc_char16 src0;
|
||||
float sum = 0, sqr = 0;
|
||||
int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;
|
||||
|
|
@ -139,7 +139,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to
|
|||
image2d_array_t output, float eps, int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_para = (int4)(gidz, 0, 0, 0);
|
||||
vxc_char16 src0;
|
||||
vxc_short8 src1, outval;
|
||||
|
|
@ -277,7 +277,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to
|
|||
image2d_array_t output, float eps, int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_para = (int4)(gidz, 0, 0, 0);
|
||||
vxc_char16 src0, src2;
|
||||
vxc_short8 src1;
|
||||
|
|
@ -309,16 +310,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to
|
|||
int8 input_desc, output_desc;
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
|
||||
int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;
|
||||
_viv_asm(MOV, coord.z, baseAddr_a);
|
||||
_viv_asm(MOV, coord_in.z, baseAddr_a);
|
||||
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
|
||||
int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;
|
||||
_viv_asm(MOV, coord.w, baseAddr);
|
||||
_viv_asm(MOV, coord.z, baseAddr);
|
||||
|
||||
for(coord.y = 0; coord.y < height; coord.y++)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
coord_in.y ++;
|
||||
VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);
|
||||
VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);
|
||||
VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);
|
||||
|
|
@ -333,7 +335,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to
|
|||
norm = tmpData3 * alpha + bias_val;
|
||||
tmpVal1 = convert_int4_rte(norm);
|
||||
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -25,7 +25,8 @@ __kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \
|
|||
image2d_array_t output, float eps, int rsFlg) \
|
||||
{ \
|
||||
int gidz = get_global_id(1); \
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0); \
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \
|
||||
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \
|
||||
int2 coord_para = (int2)(gidz, 0); \
|
||||
read_type src0, src2; \
|
||||
float scale_vari, bias_val; \
|
||||
|
|
@ -60,15 +61,16 @@ __kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \
|
|||
int8 input_desc, output_desc; \
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \
|
||||
int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \
|
||||
_viv_asm(MOV, coord.z, baseAddr_a); \
|
||||
_viv_asm(MOV, coord_in.z, baseAddr_a); \
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
|
||||
int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \
|
||||
_viv_asm(MOV, coord.w, baseAddr); \
|
||||
_viv_asm(MOV, coord.z, baseAddr); \
|
||||
\
|
||||
for(coord.y = 0; coord.y < height; coord.y++) \
|
||||
{ \
|
||||
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.y ++; \
|
||||
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
|
||||
uniConvert1stUint8SubZpToFp32_4x4); \
|
||||
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \
|
||||
|
|
@ -87,7 +89,7 @@ __kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \
|
|||
norm = tmpData3 * alpha + bias_val; \
|
||||
tmpVal1 = convert_int4_rte(norm); \
|
||||
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \
|
||||
} \
|
||||
}
|
||||
INSTANCENORM_8BITS_F32(U8, vxc_uchar16)
|
||||
|
|
@ -166,7 +168,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F
|
|||
image2d_array_t output, float eps, int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int2 coord_para = (int2)(gidz, 0);
|
||||
vxc_short8 src0, src2;
|
||||
float scale_vari, bias_val;
|
||||
|
|
@ -201,15 +204,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F
|
|||
int8 input_desc, output_desc;
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
|
||||
int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;
|
||||
_viv_asm(MOV, coord.z, baseAddr_a);
|
||||
_viv_asm(MOV, coord_in.z, baseAddr_a);
|
||||
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
|
||||
int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;
|
||||
_viv_asm(MOV, coord.w, baseAddr);
|
||||
_viv_asm(MOV, coord.z, baseAddr);
|
||||
for(coord.y = 0; coord.y < height; coord.y++)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
coord_in.y ++;
|
||||
VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
uniConvertInt16Fp32Fst_4x4);
|
||||
VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -221,7 +225,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F
|
|||
tmpVal1 = convert_int4_rte(norm);
|
||||
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
|
||||
uniConvertInt32toInt16_2x8);
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, src2, \
|
||||
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean
|
|||
int gidx = get_global_id(0) << 3;
|
||||
int lidx = get_local_id(0);
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(gidx, 0, gidz, 0);
|
||||
int4 coord = (int4)(gidx, 0, gidz, gidz);
|
||||
vxc_short8 src0, src1, src2;
|
||||
float4 srcA, srcB;
|
||||
vxc_float sum = 0, sqr = 0;
|
||||
|
|
@ -134,7 +134,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16
|
|||
image2d_array_t output, float eps, int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
vxc_short8 src0, src1, src2;
|
||||
float scale_vari, bias_val;
|
||||
vxc_float4 mean_vari = (vxc_float4)(0);
|
||||
|
|
@ -144,7 +145,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16
|
|||
Image img3 = create_image_from_image2d(meanVari, 4);
|
||||
__global float* bias_ptr = (__global float*)img1.ptr;
|
||||
__global float* scal_ptr = (__global float*)img2.ptr;
|
||||
__global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord.wz);
|
||||
__global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz));
|
||||
__global float4* vari_ptr = (__global float4*)sumVari_ptr;
|
||||
|
||||
float bval = bias_ptr[gidz];
|
||||
|
|
@ -166,16 +167,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16
|
|||
int8 input_desc, output_desc;
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
|
||||
int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;
|
||||
_viv_asm(MOV, coord.z, baseAddr_a);
|
||||
_viv_asm(MOV, coord_in.z, baseAddr_a);
|
||||
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
|
||||
int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;
|
||||
_viv_asm(MOV, coord.w, baseAddr);
|
||||
_viv_asm(MOV, coord.z, baseAddr);
|
||||
|
||||
for(coord.y = 0; coord.y < height; coord.y++)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
coord_in.y ++;
|
||||
VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
|
||||
uniConvBF16toF32_Part0_2x8);
|
||||
VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
|
||||
|
|
@ -189,7 +191,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16
|
|||
norm = scale_vari * tmpData1 + bias_val;
|
||||
_viv_asm(COPY, src1, norm, 16);
|
||||
VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, src2, \
|
||||
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -13,7 +13,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F
|
|||
image2d_array_t output, float eps, int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
vxc_short8 src0;
|
||||
vxc_half8 in_h;
|
||||
float scale_vari, bias_val;
|
||||
|
|
@ -24,7 +25,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F
|
|||
Image img3 = create_image_from_image2d(meanVari, 4);
|
||||
__global float* bias_ptr = (__global float*)img1.ptr;
|
||||
__global float* scal_ptr = (__global float*)img2.ptr;
|
||||
__global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord.wz);
|
||||
__global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz));
|
||||
__global float4* vari_ptr = (__global float4*)sumVari_ptr;
|
||||
|
||||
float bval = bias_ptr[gidz];
|
||||
|
|
@ -49,18 +50,20 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F
|
|||
int8 input_desc, output_desc;
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
|
||||
int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;
|
||||
_viv_asm(MOV, coord.z, baseAddr_a);
|
||||
_viv_asm(MOV, coord_in.z, baseAddr_a);
|
||||
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
|
||||
int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;
|
||||
_viv_asm(MOV, coord.w, baseAddr);
|
||||
_viv_asm(MOV, coord.z, baseAddr);
|
||||
|
||||
for(coord.y = 0; coord.y < height; coord.y++)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
_viv_asm(COPY, in_h, src0, 16);
|
||||
|
||||
coord_in.y ++;
|
||||
|
||||
VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
UniFP16toFP32Lo4_dp4x4);
|
||||
VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\
|
||||
|
|
@ -74,7 +77,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F
|
|||
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
|
||||
uniConvertHalfToFp16_2x8);
|
||||
_viv_asm(COPY, outval, dst, 16);
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, outval, \
|
||||
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -29,7 +29,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean
|
|||
int gidx = get_global_id(0) << 4;
|
||||
int lidx = get_local_id(0);
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(gidx, 0, gidz, 0);
|
||||
int4 coord = (int4)(gidx, 0, gidz, gidz);
|
||||
vxc_uchar16 src0;
|
||||
float sum = 0, sqr = 0;
|
||||
int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0;
|
||||
|
|
@ -44,7 +44,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean
|
|||
{
|
||||
for(coord.y = 0; coord.y < height;)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord, 0, \
|
||||
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
coord.y++;
|
||||
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
|
||||
|
|
@ -96,7 +96,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean
|
|||
{
|
||||
for(; coord.y < endH;)
|
||||
{
|
||||
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),
|
||||
VXC_ReadImage(src0, input, coord, 0,
|
||||
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
coord.y++;
|
||||
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
|
||||
|
|
@ -133,7 +133,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to
|
|||
image2d_array_t output, float eps, int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_para = (int4)(gidz, 0, 0, 0);
|
||||
vxc_uchar16 src0, src2;
|
||||
vxc_short8 src1;
|
||||
|
|
@ -141,7 +142,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to
|
|||
float scale_vari, bias_val;
|
||||
vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);
|
||||
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, 0,\
|
||||
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
_viv_asm(COPY, scale_h, src1, 16);
|
||||
VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);
|
||||
|
|
@ -166,15 +167,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to
|
|||
int8 input_desc, output_desc;
|
||||
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
|
||||
int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;
|
||||
_viv_asm(MOV, coord.z, baseAddr_a);
|
||||
_viv_asm(MOV, coord_in.z, baseAddr_a);
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
|
||||
int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;
|
||||
_viv_asm(MOV, coord.w, baseAddr);
|
||||
_viv_asm(MOV, coord.z, baseAddr);
|
||||
|
||||
for(coord.y = 0; coord.y < height; coord.y++)
|
||||
{
|
||||
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_OP4(img_load_3d, src0, input, coord_in, 0, \
|
||||
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
coord_in.y ++;
|
||||
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);
|
||||
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);
|
||||
VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);
|
||||
|
|
@ -189,7 +191,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to
|
|||
norm = tmpData3 * alpha + bias_val;
|
||||
tmpVal1 = convert_int4_rte(norm);
|
||||
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -208,7 +210,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to
|
|||
float scale_vari, bias_val;
|
||||
vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);
|
||||
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\
|
||||
VXC_ReadImage(src1, scale, coord_para.xy, 0,\
|
||||
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
|
||||
_viv_asm(COPY, scale_h, src1, 16);
|
||||
VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);
|
||||
|
|
@ -232,7 +234,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to
|
|||
|
||||
for(; coord.y < endH; coord.y++)
|
||||
{
|
||||
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
|
||||
VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);
|
||||
VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);
|
||||
VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to
|
|||
image2d_array_t output, float eps, int rsFlg)
|
||||
{
|
||||
int gidz = get_global_id(1);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
|
||||
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
|
||||
int4 coord_para = (int4)(gidz, 0, 0, 0);
|
||||
vxc_uchar16 src0;
|
||||
vxc_short8 src1, outval;
|
||||
|
|
|
|||
|
|
@@ -6,14 +6,6 @@ do \
VXC_OP3_NoDest(vstore3, Pointer, byteOffset, Data); } \
while(0)

inline uchar* get_image2D_array_ptr(image2d_array_t input)
{
int8 desc;
_viv_asm(COPY, desc, input, sizeof(desc));
uchar *src_ptr = (uchar*)desc.s0;
return src_ptr;
}

#define L2NORMSCALE_SWITCH_PROCESS(case_value, vec_val, ZpValue) \
switch (case_value) \
{ \

@@ -104,8 +96,10 @@ _viv_uniform int inputZP;

#define L2NORMSCALE_MUL_AXIS0_PROCESS(dst_type, convert_type, output_type, copy_type) \
vxc_float4 rsqrt0;\
dst_type *dst_ptr = (dst_type *)get_image2D_array_ptr(output); \
short *scale_ptr = (short *)get_image2D_array_ptr(scale); \
Image dst_img = create_image_from_image2d(output, 1); \
dst_type *dst_ptr = (dst_type *)dst_img.ptr; \
Image s_img = create_image_from_image2d(scale, 2); \
short *scale_ptr = (short *)s_img.ptr; \
vxc_float4 vec0, vec1;\
convert_type dst0, dst1;\
vxc_short8 scale_s16;\

@@ -188,15 +182,16 @@ _viv_uniform int inputZP;
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) \
void l2normalizescale_axis0_##in0_name##_##in1_name##to##out_name##_2D \
(\
__read_only image2d_array_t input,\
__read_only image2d_array_t scale,\
__write_only image2d_array_t output,\
__read_only image2d_t input,\
__read_only image2d_t scale,\
__write_only image2d_t output,\
int axis\
)\
{ \
int lidx = get_local_id(0); \
int offset = get_global_id(0); \
read_type *src_ptr_base = (read_type *)get_image2D_array_ptr(input); \
Image src_img = create_image_from_image2d(input, 1); \
read_type *src_ptr_base = (read_type *)src_img.ptr; \
read_type *src_ptr; \
read_type2 src0, src1; \
src_type val0, val1; \

@@ -267,7 +262,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \
{ \
int lidx = get_local_id(0); \
int offset = get_global_id(0); \
uchar *src_ptr_base = (uchar *)get_image2D_array_ptr(input); \
Image src_img = create_image_from_image2d(input, 1);
uchar *src_ptr_base = (uchar *)src_img.ptr; \
uchar *src_ptr; \
vxc_uchar8 src0, src1; \
vxc_uchar8 val0, val1; \
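The hunks above drop the file-local get_image2D_array_ptr() helper in favour of the shared Image / create_image_from_image2d wrapper. As a rough sketch of the access pattern after the change (the Image type itself is defined elsewhere in the library; reading the second argument as bytes-per-element is an inference from the 1/2/4 values used in these hunks, not a confirmed contract):

    /* Sketch only: pointer-style access through the Image wrapper, as the hunks above use it. */
    Image dst_img = create_image_from_image2d(output, 1);   /* 1 byte per element (assumed meaning) */
    dst_type *dst_ptr = (dst_type *)dst_img.ptr;            /* .ptr taken as the image base address */
    Image s_img = create_image_from_image2d(scale, 2);      /* 2 bytes per element (assumed meaning) */
    short *scale_ptr = (short *)s_img.ptr;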
@@ -11,7 +11,7 @@ __kernel void layer_norm_F16toF16(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord;

int8 input_desc, output_desc;

@@ -21,18 +21,18 @@ __kernel void layer_norm_F16toF16(

vxc_short8 src0, src1;
vxc_float sum = 0, sqr = 0;
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
_viv_asm(MOV, coord_out.z, baseAddr);

for(coord.x = 8; coord.x < (width+8); coord.x += 8)
{
vxc_half8 val0_h;
_viv_asm(COPY, val0_h, src0, 16);
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\

@@ -49,7 +49,7 @@ __kernel void layer_norm_F16toF16(
vxc_float4 bias_f;
for(coord.x = 0; coord.x < width; coord.x += 4)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));

@@ -73,7 +73,7 @@ __kernel void layer_norm_F16toF16(
vxc_short8 dstval;
_viv_asm(COPY, dstval, dst, 16);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dstval, \
VXC_OP4_NoDest(img_store_3d, output, coord_out, dstval, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
}
@@ -99,7 +99,7 @@ __kernel void layer_norm_U8toU8(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord;

vxc_uchar16 src0, src2;

@@ -119,11 +119,11 @@ __kernel void layer_norm_U8toU8(

_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
_viv_asm(MOV, coord_out.z, baseAddr);

for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
tmpSum += (tmpSum1.x);

@@ -144,7 +144,7 @@ __kernel void layer_norm_U8toU8(

for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

@@ -203,7 +203,7 @@ __kernel void layer_norm_U8toU8(
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \
VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \
VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
}
}
@@ -212,7 +212,7 @@ __kernel void layer_norm_F16toU8(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord;

int8 input_desc, output_desc;

@@ -222,18 +222,18 @@ __kernel void layer_norm_F16toU8(

vxc_short8 src0, src1;
vxc_float sum = 0, sqr = 0;
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
_viv_asm(MOV, coord_out.z, baseAddr);

for(coord.x = 8; coord.x < (width+8); coord.x += 8)
{
vxc_half8 val0_h;
_viv_asm(COPY, val0_h, src0, 16);
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\

@@ -250,7 +250,7 @@ __kernel void layer_norm_F16toU8(
vxc_float4 bias_f;
for(coord.x = 0; coord.x < width; coord.x += 4)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));

@@ -273,7 +273,7 @@ __kernel void layer_norm_F16toU8(
VXC_DP2x8(dst, output_int4, output_int4, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1),
uniConvertInt32toUint8_2x8);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst, \
VXC_OP4_NoDest(img_store_3d, output, coord_out, dst, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
}
@@ -21,24 +21,25 @@ __kernel void layer_norm_I16toI16(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_in = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));

int8 input_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.z, baseAddr_a);
_viv_asm(MOV, coord_in.z, baseAddr_a);

_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord.w, baseAddr);
_viv_asm(MOV, coord.z, baseAddr);

vxc_short8 src0, src1, dst;
vxc_float sum = 0, sqr = 0;
for(; coord.x < width;)
for(; coord_in.x < width;)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.x += 8;
coord_in.x += 8;
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniInt16SumSqr_dp8x2);

@@ -60,11 +61,11 @@ __kernel void layer_norm_I16toI16(

int2 coord_bias = (int2)(0, 0);

for(coord.x = 0; coord.x < width; coord.x += 8)
for(coord_in.x = 0; coord_in.x < width; coord_in.x += 8, coord.x += 8)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.x = coord.x;
coord_bias.x = coord_in.x;
VXC_ReadImage(src1, scale, coord_bias, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);

@@ -92,7 +93,7 @@ __kernel void layer_norm_I16toI16(

VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst, \
VXC_OP4_NoDest(img_store_3d, output, coord, dst, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
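A recurring pattern in the layer_norm hunks above and below: the single coord vector is split into coord (output side) and coord_in (input side), and each carries its own tensor's slice base address in .z, computed from that image's descriptor. A minimal sketch of the setup, assuming, as these kernels do, that descriptor word s0 holds the base address and s4 the per-slice stride:

    /* Sketch only: per-tensor base addresses kept in separate coordinate vectors. */
    int8 input_desc, output_desc;
    _viv_asm(COPY, input_desc, input, sizeof(input_desc));
    _viv_asm(COPY, output_desc, output, sizeof(output_desc));
    int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;    /* input slice  */
    int baseAddr   = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;  /* output slice */
    _viv_asm(MOV, coord_in.z, baseAddr_a);   /* reads go through coord_in */
    _viv_asm(MOV, coord.z, baseAddr);        /* writes go through coord   */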
@@ -11,7 +11,7 @@ __kernel void layer_norm_F16F32toF16(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord;

int8 input_desc, output_desc;

@@ -21,20 +21,20 @@ __kernel void layer_norm_F16F32toF16(

vxc_short8 src0;
vxc_float sum = 0, sqr = 0;
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
Image img1 = create_image_from_image2d(bias, 4);
Image img2 = create_image_from_image2d(scale, 4);

_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
_viv_asm(MOV, coord_out.z, baseAddr);

for(coord.x = 8; coord.x < (width+8); coord.x += 8)
{
vxc_half8 val0_h;
_viv_asm(COPY, val0_h, src0, 16);
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\

@@ -49,11 +49,11 @@ __kernel void layer_norm_F16F32toF16(
vari += eps;
vari = rsqrt(vari);
vxc_float4 bias_f, scale_f, in_f;
__global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww);
__global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww);
__global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0));
__global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0));
for(coord.x = 0; coord.x < width; coord.x += 4)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
bias_f = vload4(0, bias_ptr + coord.x);
scale_f = vload4(0, scale_ptr + coord.x);

@@ -72,7 +72,7 @@ __kernel void layer_norm_F16F32toF16(
vxc_short8 dstval;
_viv_asm(COPY, dstval, dst, 16);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dstval, \
VXC_OP4_NoDest(img_store_3d, output, coord_out, dstval, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
}
@@ -100,7 +100,7 @@ __kernel void layer_norm_U8F32toU8(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord;

vxc_uchar16 src0, src2;

@@ -118,11 +118,11 @@ __kernel void layer_norm_U8F32toU8(

_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
_viv_asm(MOV, coord_out.z, baseAddr);

for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
tmpSum += (tmpSum1.x);

@@ -142,11 +142,11 @@ __kernel void layer_norm_U8F32toU8(

Image img1 = create_image_from_image2d(bias, 4);
Image img2 = create_image_from_image2d(scale, 4);
__global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww);
__global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww);
__global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0));
__global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0));
for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
bias_f0 = vload4(0, bias_ptr);
bias_f1 = vload4(1, bias_ptr);

@@ -193,7 +193,7 @@ __kernel void layer_norm_U8F32toU8(
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \
VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \
VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
}
}
@@ -202,24 +202,25 @@ __kernel void layer_norm_I16F32toI16(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_in = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));

int8 input_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.z, baseAddr_a);
_viv_asm(MOV, coord_in.z, baseAddr_a);

_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord.w, baseAddr);
_viv_asm(MOV, coord.z, baseAddr);

vxc_short8 src0, dst;
vxc_float sum = 0, sqr = 0;
for(; coord.x < width;)
for(; coord_in.x < width;)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.x += 8;
coord_in.x += 8;
vxc_float4 sumsqr;
VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\
uniInt16SumSqr_dp8x2);

@@ -243,9 +244,9 @@ __kernel void layer_norm_I16F32toI16(
Image img2 = create_image_from_image2d(scale, 4);
__global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord_bias);
__global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord_bias);
for(coord.x = 0; coord.x < width; coord.x += 8)
for(coord_in.x = 0; coord_in.x < width; coord_in.x += 8, coord.x += 8)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = vload4(0, bias_ptr);
bias_f1 = vload4(1, bias_ptr);

@@ -269,7 +270,7 @@ __kernel void layer_norm_I16F32toI16(

VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst, \
VXC_OP4_NoDest(img_store_3d, output, coord, dst, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
@@ -11,7 +11,7 @@ __kernel void layer_norm_BF16F32toBF16(
image2d_array_t input, image2d_t bias, image2d_t scale,
image2d_array_t output, float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord;

int8 input_desc, output_desc;

@@ -30,7 +30,7 @@ __kernel void layer_norm_BF16F32toBF16(

_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
_viv_asm(MOV, coord_out.z, baseAddr);
float4 srcA, srcB;
for(coord.x = 8; coord.x < (width+8); coord.x += 8)
{

@@ -40,7 +40,7 @@ __kernel void layer_norm_BF16F32toBF16(
uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, srcA, src1, 16);
_viv_asm(COPY, srcB, src2, 16);
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
sum += dot(srcA, ones) + dot(srcB, ones);
sqr += dot(srcA * srcA, ones) + dot(srcB * srcB, ones);

@@ -52,12 +52,12 @@ __kernel void layer_norm_BF16F32toBF16(
vari += eps;
vari = rsqrt(vari);
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;
__global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww);
__global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww);
__global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, (int2)(0, 0));
__global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, (int2)(0, 0));

for(coord.x = 0; coord.x < width; coord.x += 8)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = vload4(0, bias_ptr);
bias_f1 = vload4(1, bias_ptr);

@@ -85,7 +85,7 @@ __kernel void layer_norm_BF16F32toBF16(
VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);

coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \
VXC_OP4_NoDest(img_store_3d, output, coord_out, src2, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
@@ -26,7 +26,7 @@ __kernel void layer_norm_U8toF16(
image2d_array_t output,
float eps)
{
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);
int4 coord = (int4)(0, get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_out = coord;
vxc_uchar16 src0;
float sum = 0, sqr = 0;

@@ -41,11 +41,11 @@ __kernel void layer_norm_U8toF16(

_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_out.w, baseAddr);
_viv_asm(MOV, coord_out.z, baseAddr);

for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);
tmpSum += (tmpSum1.x);

@@ -71,7 +71,7 @@ __kernel void layer_norm_U8toF16(

for(coord.x = 0; coord.x < width; coord.x += 16)
{
VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));

@@ -121,7 +121,7 @@ __kernel void layer_norm_U8toF16(
UniPackFP16even_2x8);
_viv_asm(COPY, outval, dst, 16);
coord_out.x = coord.x;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \
VXC_OP4_NoDest(img_store_3d, output, coord_out, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));

tmpData2 -= mean;

@@ -135,7 +135,7 @@ __kernel void layer_norm_U8toF16(
UniPackFP16even_2x8);
_viv_asm(COPY, outval, dst, 16);
coord_out.x += 8;
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, outval, \
VXC_OP4_NoDest(img_store_3d, output, coord_out, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
@@ -39,7 +39,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSq
{
for(coord.y = 0; coord.y < height;)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.y++;
_viv_asm(COPY, in_h, src0, 16);

@@ -134,7 +134,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);

@@ -157,8 +158,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to

int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.z, baseAddr_a);

_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;

@@ -175,11 +176,12 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to

for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y ++;
coord_para.y = coord.y;
coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;

@@ -208,7 +210,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniConvertHalfToFp16_2x8);
_viv_asm(COPY, outval, dst, 16);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_OP4_NoDest(img_store_3d, output, coord, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
@@ -284,7 +286,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);

@@ -307,8 +310,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to

int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.z, baseAddr_a);

_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;

@@ -324,11 +327,12 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to

for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y ++;
coord_para.y = coord.y;
coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;

@@ -356,7 +360,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_F16to
tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);
VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_OP4_NoDest(img_store_3d, output, coord, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
@@ -43,7 +43,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSq
{
for(coord.y = 0; coord.y < height;)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.y++;
vxc_float4 sumsqr;

@@ -130,7 +130,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16to
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);

@@ -152,8 +153,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16to

int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.z, baseAddr_a);

_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;

@@ -169,11 +170,12 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16to

for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y ++;
coord_para.y = coord.y;
coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;

@@ -199,7 +201,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_I16to

VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_OP4_NoDest(img_store_3d, output, coord, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
@@ -48,7 +48,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSq
{
for(coord.y = 0; coord.y < height;)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src0, input, coord.xywz, 0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord.y++;
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);

@@ -101,7 +101,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_sumSq
{
for(; coord.y < endH;)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_ReadImage(src0, input, coord, 0,
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord.y++;
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);

@@ -138,7 +138,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);

@@ -161,8 +162,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF

int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.z, baseAddr_a);

_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;

@@ -178,10 +179,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF

for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src0, input, coord_in, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y ++;
coord_para.y = coord.y; coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;

@@ -208,7 +210,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF
VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\
uniConvertHalfToFp16_2x8);
_viv_asm(COPY, outval, dst, 16);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_OP4_NoDest(img_store_3d, output, coord, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}
@@ -242,10 +244,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toF

for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_ReadImage(src0, input, coord, 0,\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.y = coord.y;
VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_ReadImage(src1, scale, coord, 0,\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;

@@ -281,7 +283,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU
image2d_array_t output, float eps)
{
int gidz = get_global_id(1);
int4 coord = (int4)(get_global_id(0), 0, gidz, 0);
int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);
int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);
int2 coord_sum = (int2)(0, gidz);
int4 coord_para = coord;
coord_para.z = (ushort)gidz / (ushort)(height_depth);

@@ -304,8 +307,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU

int8 input_desc, scale_desc, output_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord.w, baseAddr_a);
int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_in.z, baseAddr_a);

_viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));
int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;

@@ -321,11 +324,12 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU

for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_OP4(img_load_3d, src0, input, coord.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src0, input, coord_in, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.y ++;
coord_para.y = coord.y;
coord_bias.y = coord.y;
VXC_OP4(img_load_3d, src1, scale, coord_para.xyww, VXC_5BITOFFSET_XY(0, 0),
VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0,
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;

@@ -351,7 +355,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU

VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniConvertInt32toUint8_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord.xyzz, outval, \
VXC_OP4_NoDest(img_store_3d, output, coord, outval, \
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
}

@@ -385,10 +389,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_wh_U8toU

for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_ReadImage(src0, input, coord, 0,\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_bias.y = coord.y;
VXC_ReadImage(src1, scale, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_ReadImage(src1, scale, coord, 0,\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
bias_f0 = read_imagef(bias, coord_bias);
coord_bias.x += 4;
Some files were not shown because too many files have changed in this diff.