Update internal to 1.1.32

SHA: 9aa0b0f

Signed-off-by: Kainan Cha <kainan.zha@verisilicon.com>
Kainan Cha 2021-06-23 15:26:25 +08:00
parent 98b9759663
commit 3c59694025
277 changed files with 30752 additions and 4475 deletions

View File

@ -69,7 +69,9 @@ filegroup(
name = "custom_srcs",
srcs = glob([
"src/custom/ops/*.c",
"src/custom/ops/kernel/*.c",
"src/custom/ops/kernel/evis/*.c",
"src/custom/ops/kernel/cl/*.c",
"src/custom/ops/kernel/cpu/*.c",
])
)
@ -128,7 +130,6 @@ cc_library(
"include/quantization/vsi_nn_asymmetric_affine.h",
"include/quantization/vsi_nn_dynamic_fixed_point.h",
"include/quantization/vsi_nn_perchannel_symmetric_affine.h",
"include/client/vsi_nn_vxkernel.h",
"include/interface/ops.def",
"include/kernel/vsi_nn_kernel.h",
"include/kernel/vsi_nn_gpu.h",
@ -139,6 +140,7 @@ cc_library(
"include/vsi_nn_error.h",
# libnnext
"include/libnnext/vsi_nn_vxkernel.h",
"include/libnnext/vx_lib_nnext.h",
"include/libnnext/vsi_nn_libnnext_resource.h",
@ -167,7 +169,6 @@ cc_library(
"src/vsi_nn_daemon.c",
"src/vsi_nn_graph_optimization.c",
"src/vsi_nn_pre_post_process.c",
"src/client/vsi_nn_vxkernel.c",
"src/utils/vsi_nn_link_list.c",
"src/utils/vsi_nn_util.c",
"src/utils/vsi_nn_math.c",
@ -200,12 +201,10 @@ cc_library(
"src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_topk.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c",
"src/libnnext/ops/kernel/vsi_nn_kernel_generate_proposals.c",
"src/libnnext/vsi_nn_libnnext_resource.c",
"src/libnnext/vsi_nn_vxkernel.c",
] + [":kernel_srcs"]
+ [":operation_srcs"]
+ [":custom_srcs"],

View File

@ -12,7 +12,6 @@ aux_source_directory(src/kernel/cpu INTERNAL_KERNEL_CPU)
aux_source_directory(src/kernel/evis INTERNAL_KERNEL_EVIS)
aux_source_directory(src/kernel/vx INTERNAL_KERNEL_VX)
aux_source_directory(src/ops INTERNAL_OPS)
aux_source_directory(src/client INTERNAL_CLIENT)
aux_source_directory(src/libnnext INTERNAL_LIBNNEXT)
aux_source_directory(src/libnnext/ops/kernel INTERNAL_LIBNNEXT_OPS_KERNEL)
aux_source_directory(src/quantization INTERNAL_QUANTIZATION)
@ -29,7 +28,6 @@ list(APPEND SRC
${INTERNAL_KERNEL_EVIS}
${INTERNAL_KERNEL_VX}
${INTERNAL_OPS}
${INTERNAL_CLIENT}
${INTERNAL_LIBNNEXT}
${INTERNAL_LIBNNEXT_OPS_KERNEL}
${INTERNAL_QUANTIZATION}

View File

@ -147,3 +147,12 @@ DEF_OP(DECONVOLUTION1D)
DEF_OP(INTERP)
DEF_OP(RESIZE_1D)
DEF_OP(UPSAMPLESCALE)
DEF_OP(GROUP_NORM)
DEF_OP(ROUND)
DEF_OP(CEIL)
DEF_OP(SEQUENCE_MASK)
DEF_OP(REPEAT)
DEF_OP(ERF)
DEF_OP(ONE_HOT)
DEF_OP(NMS)
DEF_OP(GROUPED_CONV1D)

View File

@ -244,6 +244,12 @@ vsi_bool vsi_nn_kernel_param_add_buffer
void * vsi_nn_kernel_param_get_buffer
( const vsi_nn_kernel_param_t * params, const char * key, size_t * size);
vsi_bool vsi_nn_kernel_param_add_const_buffer
( vsi_nn_kernel_param_t * params, const char * key, const void * buf, size_t size);
const void * vsi_nn_kernel_param_get_const_buffer
( const vsi_nn_kernel_param_t * params, const char * key, size_t * size);
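A minimal usage sketch of the two const-buffer helpers declared above, assuming the existing vsi_nn_kernel_param_create/vsi_nn_kernel_param_release API; the key name "pads" and its contents are placeholders, not part of this commit:

static const int32_t pads[4] = { 1, 1, 1, 1 };
vsi_nn_kernel_param_t * param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_const_buffer( param, "pads", pads, sizeof(pads) );
{
    size_t size = 0;
    const int32_t * p = (const int32_t *)
        vsi_nn_kernel_param_get_const_buffer( param, "pads", &size );
    /* size is expected to equal sizeof(pads); p is a read-only view of the stored buffer */
}
vsi_nn_kernel_param_release( &param );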
/** Kernel register */
#define REGISTER_KERNEL_BACKEND(kernel_name, kernel_type, func) \
_INITIALIZER(_register_kernel_##kernel_name##_##kernel_type) \

View File

@ -30,17 +30,19 @@
extern "C" {
#endif
typedef struct _vsi_nn_conv1d_lcl_data_t
{
vx_tensor input_tensor;
vx_tensor weight_tensor;
vx_tensor output_tensor;
} vsi_nn_conv1d_lcl_data_t;
typedef struct _vsi_nn_conv1d_param
{
/* local data must be the first. */
vsi_nn_conv1d_lcl_data_t local;
union
{
struct _conv1d_local_data_t *local;
struct {
vx_tensor input_tensor;
vx_tensor weight_tensor;
vx_tensor output_tensor;
} reserved;
};
uint32_t ksize;
uint32_t stride;
@ -53,6 +55,8 @@ typedef struct _vsi_nn_conv1d_param
uint32_t dilation;
int32_t multiplier;
} vsi_nn_conv1d_param;
_compiler_assert(offsetof(vsi_nn_conv1d_param, local) == 0, \
vsi_nn_vsi_nn_conv1d_h );
#ifdef __cplusplus
}

View File

@ -0,0 +1,55 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_GROUPED_CONV1D_H
#define _VSI_NN_OP_GROUPED_CONV1D_H
#include "vsi_nn_types.h"
typedef struct _grouped_conv1d_local_data_t {
vsi_nn_tensor_t* input;
vsi_nn_tensor_t* weight;
vsi_nn_tensor_t* output;
} grouped_conv1d_local_data_t;
typedef struct _vsi_nn_grouped_conv1d_param
{
grouped_conv1d_local_data_t *local;
uint32_t ksize;
uint32_t stride;
/* Pad left, right, top, bottom */
uint32_t pad[2];
/* Pad type default value shall be AUTO */
vsi_nn_pad_e pad_type;
uint32_t weights;
uint32_t group;
uint32_t dilation;
int32_t multiplier;
} vsi_nn_grouped_conv1d_param;
#endif
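A hypothetical setup for the new GROUPED_CONV1D operation, using the parameter fields declared above; the node is assumed to have been created for VSI_NN_OP_GROUPED_CONV1D via the usual graph API, VSI_NN_PAD_AUTO comes from vsi_nn_pad_e, and the numeric values are placeholders:

node->nn_param.grouped_conv1d.ksize      = 3;
node->nn_param.grouped_conv1d.stride     = 1;
node->nn_param.grouped_conv1d.pad[0]     = 1;   /* left  */
node->nn_param.grouped_conv1d.pad[1]     = 1;   /* right */
node->nn_param.grouped_conv1d.pad_type   = VSI_NN_PAD_AUTO;
node->nn_param.grouped_conv1d.group      = 2;
node->nn_param.grouped_conv1d.dilation   = 1;
node->nn_param.grouped_conv1d.multiplier = 0;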

View File

@ -0,0 +1,53 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CLIENT_GROUPNORMALIZE_H
#define _VSI_NN_OP_CLIENT_GROUPNORMALIZE_H
#include "vsi_nn_types.h"
#include "vsi_nn_platform.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_groupnorm_lcl_data
{
/* handle 3D group norm */
vsi_nn_tensor_t *reshaped_input;
vsi_nn_tensor_t *reshaped_output;
} vsi_nn_groupnorm_lcl_data;
typedef struct _vsi_nn_groupnormalize_param
{
/* local data must be the first. */
vsi_nn_groupnorm_lcl_data* lcl_data;
float eps;
int32_t group_num;
} vsi_nn_groupnormalize_param;
#ifdef __cplusplus
}
#endif
#endif
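For context, the two fields above drive the standard group-norm computation: channels are split into group_num groups and each group is normalized as y = (x - mean) / sqrt(var + eps). A hypothetical setup, assuming a node created for the new GROUP_NORM op and the nn_param.groupnorm member added later in this commit:

node->nn_param.groupnorm.eps       = 1e-5f;   /* numerical-stability epsilon */
node->nn_param.groupnorm.group_num = 32;      /* the channel count is typically a multiple of this */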

View File

@ -32,9 +32,9 @@ extern "C" {
typedef struct _vsi_nn_moments_param
{
int32_t* axis;
int32_t axis_num;
vsi_bool keep_dim;
const int32_t* axis;
int32_t axis_num;
vsi_bool keep_dim;
} vsi_nn_moments_param;
#ifdef __cplusplus

View File

@ -21,10 +21,18 @@
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_POST_H
#define _VSI_NN_POST_H
#include "post/vsi_nn_post_fasterrcnn.h"
#include "post/vsi_nn_post_cmupose.h"
#ifndef _VSI_NN_OP_NMS_H
#define _VSI_NN_OP_NMS_H
#endif
#include "vsi_nn_types.h"
typedef struct _vsi_nn_nms_param
{
int32_t max_output_size;
float iou_threshold;
float score_threshold;
float soft_nms_sigma;
} vsi_nn_nms_param;
#endif
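A hypothetical configuration of the new NMS parameters above, assuming a node created for VSI_NN_OP_NMS; the values are placeholders chosen to show the typical meaning of each field:

node->nn_param.nms.max_output_size = 100;    /* cap on the number of selected boxes */
node->nn_param.nms.iou_threshold   = 0.5f;   /* boxes overlapping more than this are suppressed */
node->nn_param.nms.score_threshold = 0.0f;   /* boxes scoring below this are ignored */
node->nn_param.nms.soft_nms_sigma  = 0.0f;   /* 0 keeps hard NMS; >0 enables soft-NMS decay */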

View File

@ -0,0 +1,42 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_ONE_HOT_H
#define _VSI_NN_OP_ONE_HOT_H
#include "vsi_nn_types.h"
typedef struct _vsi_nn_one_hot_param
{
struct _one_hot_local_data_t* local;
int32_t depth;
float on_value;
float off_value;
int32_t axis;
} vsi_nn_one_hot_param;
_compiler_assert(offsetof(vsi_nn_one_hot_param, local) == 0, \
vsi_nn_one_hot_h );
#endif
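A worked example of the parameters above, assuming the usual TensorFlow-style one_hot semantics:

/* With depth = 3, on_value = 1.0f, off_value = 0.0f and axis = -1,
 * the index vector [0, 2] maps to
 *   [[1, 0, 0],
 *    [0, 0, 1]]   (each input index selects the position that receives on_value). */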

View File

@ -30,12 +30,12 @@
extern "C" {
#endif
#define _VSI_NN_POOLWITHARGMAX_LOCAL_TENSOR_NUM 3
typedef struct _vsi_nn_poolwithargmax_lcl_data
typedef struct _vsi_nn_pool_lcl_data
{
vx_tensor local_tensor[_VSI_NN_POOLWITHARGMAX_LOCAL_TENSOR_NUM];
} vsi_nn_poolwithargmax_lcl_data;
/* handle pool1d */
vsi_nn_tensor_t *reshaped_input;
vsi_nn_tensor_t *reshaped_output;
} vsi_nn_pool_lcl_data;
typedef struct _vsi_nn_pool_param
{
@ -49,7 +49,7 @@ typedef struct _vsi_nn_pool_param
/* Pad type default value shall be AUTO */
vsi_nn_pad_e pad_type;
/* poolwithargmax layer local data structure */
vsi_nn_poolwithargmax_lcl_data local;
vsi_nn_pool_lcl_data *local;
} vsi_nn_pool_param;
#ifdef __cplusplus

View File

@ -0,0 +1,54 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_REPEAT_H
#define _VSI_NN_OP_REPEAT_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_repeat_lcl_data
{
vsi_nn_tensor_t *repeat_tensor;
vsi_nn_tensor_t *reshaped_input;
vsi_nn_tensor_t *reshaped_output;
} vsi_nn_repeat_lcl_data;
typedef struct _vsi_nn_repeat__param
{
vsi_nn_repeat_lcl_data* local;
int32_t axis;
int32_t maxlen; // default max repeat number
int32_t* repeat_host; // host repeat array
int32_t repeat_len; // length of host repeat array
} vsi_nn_repeat_param;
#ifdef __cplusplus
}
#endif
#endif
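A small worked example of the REPEAT parameters above, assuming NumPy-style repeat semantics along the chosen axis:

/* With axis = 0, repeat_host = {2, 1} and repeat_len = 2, a two-row input
 * [r0, r1] becomes [r0, r0, r1]; per the comment above, maxlen is the default
 * maximum repeat count. */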

View File

@ -0,0 +1,43 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_SEQUENCE_MASK_H
#define _VSI_NN_OP_SEQUENCE_MASK_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_sequence_mask__param
{
int32_t maxlen;
} vsi_nn_sequence_mask_param;
#ifdef __cplusplus
}
#endif
#endif
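A worked example of the single maxlen field above, assuming the usual tf.sequence_mask behavior:

/* With input lengths [2, 1] and maxlen = 3, the output mask is
 *   [[1, 1, 0],
 *    [1, 0, 0]]   (row i has lengths[i] leading ones, padded out to maxlen). */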

View File

@ -32,6 +32,22 @@
extern "C" {
#endif
typedef struct _strided_slice_param
{
int32_t *begin_dims;
int32_t begin_dims_num;
int32_t *end_dims;
int32_t end_dims_num;
int32_t *stride_dims;
int32_t stride_dims_num;
int32_t begin_mask;
int32_t end_mask;
int32_t shrink_axis_mask;
int32_t new_axis_mask;
int32_t num_add_axis;
} strided_slice_param;
typedef struct _vsi_nn_strided_slice_lcl_data2
{
vsi_nn_link_list_t link_list;
@ -55,6 +71,8 @@ typedef struct _vsi_nn_strided_slice_lcl_data2
vsi_bool is_dataconvert_op;
vsi_bool is_optimized;
strided_slice_param params;
} vsi_nn_strided_slice_lcl_data2;
typedef struct _vsi_nn_strided_slice_lcl_data_t
@ -78,6 +96,7 @@ typedef struct _vsi_nn_strided_slice_param
vx_int32 begin_mask;
vx_int32 end_mask;
vx_int32 shrink_axis_mask;
int32_t new_axis_mask;
vsi_nn_strided_slice_lcl_data2 * lcl2_data;
} vsi_nn_strided_slice_param;
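The new new_axis_mask field appears to follow the TensorFlow strided-slice convention: bit i set inserts a length-1 dimension at position i of the result, so slicing a [4, 5] tensor with new_axis_mask = 0x1 (and otherwise full-range begin/end/stride) would yield a [1, 4, 5] output.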

View File

@ -34,7 +34,7 @@ extern "C" {
typedef struct _vsi_nn_upsample_lcl_data
{
vx_tensor local_tensor[_VSI_NN_POOLWITHARGMAX_LOCAL_TENSOR_NUM];
vx_tensor local_tensor[_VSI_NN_UPSAMPLE_LOCAL_TENSOR_NUM];
} vsi_nn_upsample_lcl_data;
typedef struct _vsi_nn_upsample_param

View File

@ -119,7 +119,7 @@ vsi_bool is_item_in_array
enum { NAME##_INPUT_COUNT = INPUT_COUNT, \
NAME##_OUTPUT_COUNT = OUTPUT_COUNT, \
NAME##_IO_COUNT = NAME##_INPUT_COUNT + NAME##_OUTPUT_COUNT}; \
static const struct {vsi_nn_type_e types[NAME##_IO_COUNT];} \
static const struct {int types[NAME##_IO_COUNT];} \
NAME##_supported_io_types[] = {
#define DECL_OP_CONSTRAINT_REG(NAME) \

View File

@ -438,6 +438,7 @@ static inline vsi_status float32_to_dtype
case VSI_NN_TYPE_UINT8:
case VSI_NN_TYPE_INT16:
case VSI_NN_TYPE_INT32:
case VSI_NN_TYPE_UINT32:
{
int32_t dst_value = 0;
switch( dst_dtype->qnt_type )

View File

@ -165,6 +165,8 @@ struct _vsi_nn_graph
* so please keep it NULL.*/
vsi_nn_tensor_t* tensor;
} complete_signal;
vsi_bool isAllowFastMode;
};
/**
@ -716,6 +718,16 @@ OVXLIB_API vsi_status vsi_nn_SetGraphPriority
uint32_t priority
);
OVXLIB_API vsi_status vsi_nn_SetGraphFastMode
(
vsi_nn_graph_t* graph,
vsi_bool fastmode
);
OVXLIB_API vsi_bool vsi_nn_IsGraphFastMode
(
const vsi_nn_graph_t* graph
);
#ifdef __cplusplus
}
#endif
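A minimal usage sketch for the two new graph-level APIs declared above; vsi_nn_CreateGraph and the surrounding context setup come from the existing ovxlib API, and error handling is omitted:

vsi_nn_graph_t * graph = vsi_nn_CreateGraph( ctx, 0, 0 );
vsi_nn_SetGraphFastMode( graph, TRUE );        /* toggles the new isAllowFastMode flag */
if ( vsi_nn_IsGraphFastMode( graph ) )
{
    /* build, verify and run the graph with fast mode allowed */
}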

View File

@ -164,6 +164,12 @@
#include "ops/vsi_nn_op_resize_1d_bilinear_internal.h"
#include "ops/vsi_nn_op_resize_1d_nearest_internal.h"
#include "ops/vsi_nn_op_upsamplescale.h"
#include "ops/vsi_nn_op_groupnormalize.h"
#include "ops/vsi_nn_op_sequence_mask.h"
#include "ops/vsi_nn_op_repeat.h"
#include "ops/vsi_nn_op_one_hot.h"
#include "ops/vsi_nn_op_nms.h"
#include "ops/vsi_nn_op_grouped_conv1d.h"
/* custom node head define define */
#include "custom/vsi_nn_custom_node_type.h"
@ -314,6 +320,12 @@ typedef union _vsi_nn_nn_param
vsi_nn_resize_1d_bilinear_internal_param resize_1d_bilinear_internal;
vsi_nn_resize_1d_nearest_internal_param resize_1d_nearest_internal;
vsi_nn_upsamplescale_param upsamplescale;
vsi_nn_groupnormalize_param groupnorm;
vsi_nn_sequence_mask_param sequence_mask;
vsi_nn_repeat_param repeat;
vsi_nn_one_hot_param one_hot;
vsi_nn_nms_param nms;
vsi_nn_grouped_conv1d_param grouped_conv1d;
uint8_t client_param[128];
/* custom node data struct define */

View File

@ -33,7 +33,7 @@ extern "C"{
#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 1
#define VSI_NN_VERSION_PATCH 30
#define VSI_NN_VERSION_PATCH 32
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
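With the patch bump above, the packed version macro evaluates to 1 * 10000 + 1 * 100 + 32 = 10132 (it was 10130 for 1.1.30), so version-gated code can compare against VSI_NN_VERSION directly:

#if VSI_NN_VERSION >= 10132
/* code that relies on ovxlib 1.1.32 behavior */
#endif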

View File

@ -1,6 +1,4 @@
#include "cl_viv_vx_ext.h"
__kernel void vxcTopk(
__kernel void testop(
__read_only image2d_array_t input,
__write_only image2d_array_t output)
{

View File

@ -0,0 +1,194 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdlib.h>
#include <math.h>
#include "vsi_nn_types.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_test.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
//#include "libnnext/vx_lib_nnext.h"
#define _CPU_ARG_NUM (1)
#define _CPU_INPUT_NUM (1)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME ("com.vivantecorp.extension.CustomSoftmaxVXC")
#define SCALAR_INPUT_AXIS (2)
__BEGIN_DECLS
DEF_KERNEL_EXECUTOR(_softmax_exec)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t* param,
size_t param_size
)
{
vsi_status status = VX_SUCCESS;
float* buffer[_CPU_IO_NUM] = { NULL };
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL };
uint32_t i = 0;
uint32_t out_elements;
int32_t sf_axis;
float fMax = 0.0;
float fProbSum = 0.0f;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &sf_axis);
CHECK_STATUS_FAIL_GOTO(status, final );
out_elements = (uint32_t)vsi_nn_kernel_tensor_attr_get_size( attr[1] );
/* alloc the float32 data buffer */
buffer[1] = (float *)malloc(out_elements * sizeof(float));
CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final );
memset(buffer[1], 0, out_elements * sizeof(float));
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final );
/* Softmax implement */
for ( i = 0; i < out_elements; i++)
{
fMax = buffer[0][i] > fMax ? buffer[0][i] : fMax;
}
for ( i = 0; i < out_elements; i++)
{
buffer[1][i] = (float)expf(buffer[0][i] - fMax);
fProbSum += buffer[1][i];
}
for ( i = 0; i < out_elements; i++)
{
buffer[1][i] = buffer[1][i] / fProbSum;
}
status = vsi_nn_kernel_tensor_write_from_float(
tensors[1], attr[1], buffer[1], out_elements );
final:
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
if( buffer[i] )
{
free( buffer[i] );
}
vsi_nn_kernel_tensor_attr_release( &attr[i] );
}
return status;
}
static vx_param_description_t kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_softmax_exec,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
return VSI_SUCCESS;
}
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_SUCCESS;
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t axis = 0;
axis = vsi_nn_kernel_param_get_int32(params, "axis");
status = _query_kernel( inputs, outputs, kernel );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(
graph, I32, &axis );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_AXIS] );
}
else
{
status = VSI_FAILURE;
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( custom_softmax, _setup )

View File

@ -34,6 +34,7 @@ __kernel void Softmax2VXC
}
float fProbSum = 0.0f;
vxc_short8 dst;
for (int i = 0; i < sf_size; i++)
{
vxc_char8 val;
@ -47,7 +48,8 @@ __kernel void Softmax2VXC
fProbSum += fOut;
half hVal;
_viv_asm(CONV,hVal,fOut);
VXC_WriteImage2DArray(output, coord_in, hVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY,dst,hVal, 4);
VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
for (int i = 0; i < sf_size; i++)
@ -63,7 +65,8 @@ __kernel void Softmax2VXC
float fOut =fval/fProbSum;
half hVal;
_viv_asm(CONV,hVal,fOut);
VXC_WriteImage2DArray(output, coord_in, hVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
_viv_asm(COPY,dst,hVal, 4);
VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
}

View File

@ -0,0 +1,202 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdlib.h>
#include <math.h>
#include "vsi_nn_types.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_test.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
//#include "libnnext/vx_lib_nnext.h"
#define _CPU_ARG_NUM (1)
#define _CPU_INPUT_NUM (1)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME ("com.vivantecorp.extension.Softmax2VXC")
#define SCALAR_INPUT_AXIS (2)
__BEGIN_DECLS
static vx_param_description_t kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}
};
#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def)
DEF_KERNEL_INITIALIZER(_softmax_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t* param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
int sf_size = 0;
vsi_nn_kernel_tensor_attr_t* attr = NULL;
// Alignment with a power of two value.
gpu_param_t gpu_param = {
2, // workdim
{0, 0, 0}, // global_offset: control the start location be processed in the image
{0, 0, 0}, // global_scale: how many pixels could be processed by a single thread
{0, 0, 0}, // local_size: local group size in thread
{0, 0, 0}}; // global_size: image size in thread
attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
if (!attr)
{
VSILOGE("Query failure! at line");
return status;
}
sf_size = attr->shape->data[0];
gpu_param.global_offset[0] = 0;
gpu_param.global_offset[1] = 0;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.local_size[0] = 1;
gpu_param.local_size[1] = 1;
gpu_param.global_size[0] =
gpu_align_p2((1 + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0],
gpu_param.local_size[0]);
gpu_param.global_size[1] =
gpu_align_p2((1 + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1],
gpu_param.local_size[1]);
{
gpu_dp_inst_t Uni4x4_Fp16ToFp32 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node,
"Uni4x4_Fp16ToFp32", &Uni4x4_Fp16ToFp32 );
vsi_nn_kernel_gpu_add_param(node,
"sf_size", &sf_size);
}
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
if(status != VSI_SUCCESS)
{
VSILOGE("Initializer failure!");
}
if (attr) vsi_nn_kernel_tensor_attr_release( &attr );
return status;
}
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
NULL,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
_softmax_initializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
"custom_softmax" );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
"custom_softmax" );
return VSI_SUCCESS;
}
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_SUCCESS;
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t axis = 0;
axis = vsi_nn_kernel_param_get_int32(params, "axis");
status = _query_kernel( inputs, outputs, kernel );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(
graph, I32, &axis );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_AXIS] );
}
else
{
status = VSI_FAILURE;
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( custom_softmax, _setup )

View File

@ -1,231 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdlib.h>
#include <math.h>
#include "vsi_nn_types.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_test.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vx_lib_nnext.h"
#define _VX_KERNEL_ID VX_KERNEL_ID(CUSTOM_SOFTMAX)
#define _VX_KERNEL_VAR_CPU (vx_client_kernel_CUSTOM_SOFTMAX_CPU)
#define _VX_KERNEL_VAR_VX (vx_client_kernel_CUSTOM_SOFTMAX_VX)
#define _VX_KERNEL_NAME ("com.vivantecorp.extension.CustomSoftmaxVXC")
#define _VX_KERNEL_FUNC_KERNEL (vxCustomSoftmaxKernel)
static vsi_status VX_CALLBACK vxCustomSoftmaxKernel
(
vx_node node,
const vx_reference* paramObj,
uint32_t paramNum
)
{
vsi_status status = VX_SUCCESS;
vx_tensor input = NULL,output = NULL;
float *f32_in_buffer = NULL,*f32_out_buffer=NULL;
vx_context context = NULL;
vsi_nn_tensor_attr_t in_attr,out_attr;
uint32_t i,in_elements,out_elements;
int32_t sf_axis;
float fMax = 0.0;
float fProbSum = 0.0f;
context = vxGetContext((vx_reference)node);
input = (vx_tensor)paramObj[0];
output = (vx_tensor)paramObj[1];
vxCopyScalar((vx_scalar)paramObj[2], &(sf_axis),VX_READ_ONLY, VX_MEMORY_TYPE_HOST);
/* Fill input & output attribute data struct */
status = vsi_nn_vxGetTensorAttr(input, &in_attr);
TEST_CHECK_STATUS(status, final);
status = vsi_nn_vxGetTensorAttr(output, &out_attr);
TEST_CHECK_STATUS(status, final);
in_elements = vsi_nn_vxGetTensorElementNum(&in_attr);
out_elements = vsi_nn_vxGetTensorElementNum(&out_attr);
/* alloc the float32 data buffer */
f32_in_buffer = (float *)malloc(in_elements * sizeof(float));
f32_out_buffer= (float *)malloc(out_elements * sizeof(float));
memset(f32_in_buffer, 0, in_elements * sizeof(float));
memset(f32_out_buffer, 0, out_elements * sizeof(float));
/* Copy tensor to buffer, and convert bufer to float32 format */
status = vsi_nn_vxConvertTensorToFloat32Data(
context, input, &in_attr, f32_in_buffer, in_elements * sizeof(float));
TEST_CHECK_STATUS(status, final);
/* Softmax implement */
for ( i = 0; i < out_elements; i++)
{
fMax = f32_in_buffer[i] > fMax ? f32_in_buffer[i] : fMax;
}
for ( i = 0; i < out_elements; i++)
{
f32_out_buffer[i] = (float)expf(f32_in_buffer[i] - fMax);
fProbSum += f32_out_buffer[i];
}
for ( i = 0; i < out_elements; i++)
{
f32_out_buffer[i] = f32_out_buffer[i]/ fProbSum;
}
status = vsi_nn_vxConvertFloat32DataToTensor(
context, output, &out_attr, f32_out_buffer, out_elements * sizeof(float));
final:
if(f32_in_buffer)free(f32_in_buffer);
if(f32_out_buffer)free(f32_out_buffer);
return status;
}
static vx_status VX_CALLBACK vxCustomSoftmaxInitializer
(
vx_node nodObj,
const vx_reference *paramObj,
vx_uint32 paraNum
)
{
vx_status status = VX_SUCCESS;
/*TODO: Add initial code for VX program*/
// Alignment with a power of two value.
#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1)
vx_kernel_execution_parameters_t shaderParam = {
2, // workdim
{0, 0, 0}, // globalWorkOffset: control the start location be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in thread
{0, 0, 0}}; // globalWorkSize: image size in thread
int input_size[6] = {1, 1, 1, 1, 1, 1};
int sf_size;
uint32_t input_dims;
uint32_t i;
vsi_nn_tensor_attr_t input_attr;
memset(&input_attr, 0, sizeof(vsi_nn_tensor_attr_t));
status = vsi_nn_vxGetTensorAttr((vx_tensor)paramObj[0], &input_attr);
if (status != VX_SUCCESS)
{
VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__);
return status;
}
input_dims = input_attr.dim_num;
for (i = 0; i < input_dims; i++)
{
input_size[i] = input_attr.size[i];
}
sf_size = input_size[0];
shaderParam.globalWorkOffset[0] = 0;
shaderParam.globalWorkOffset[1] = 0;
shaderParam.globalWorkScale[0] = 1;
shaderParam.globalWorkScale[1] = 1;
shaderParam.localWorkSize[0] = 1;
shaderParam.localWorkSize[1] = 1;
shaderParam.globalWorkSize[0] =
gcmALIGN((1 + shaderParam.globalWorkScale[0] - 1) / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]);
shaderParam.globalWorkSize[1] =
gcmALIGN((1 + shaderParam.globalWorkScale[1] - 1) / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]);
{
vx_uint32 Uni4x4_Fp16ToFp32[16] = {
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
};
vxSetNodeUniform(nodObj, "Uni4x4_Fp16ToFp32", 1, Uni4x4_Fp16ToFp32);
vxSetNodeUniform(nodObj, "sf_size", 1, &sf_size);
}
status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS,
&shaderParam, sizeof(vx_kernel_execution_parameters_t));
if(status < 0)
{
VSILOGE("Initializer failure!");
}
return status;
}
static vx_param_description_t s_params[] =
{
{ VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED },
{ VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
};
#ifdef __cplusplus
extern "C" {
#endif
vx_kernel_description_t _VX_KERNEL_VAR_CPU =
{
_VX_KERNEL_ID,
_VX_KERNEL_NAME,
_VX_KERNEL_FUNC_KERNEL,
s_params,
_cnt_of_array( s_params ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t _VX_KERNEL_VAR_VX =
{
_VX_KERNEL_ID,
_VX_KERNEL_NAME,
NULL,
s_params,
_cnt_of_array( s_params ),
vsi_nn_KernelValidator,
NULL,
NULL,
vxCustomSoftmaxInitializer,
vsi_nn_KernelDeinitializer
};
vx_kernel_description_t * vx_kernel_CUSTOM_SOFTMAX_list[] =
{
&_VX_KERNEL_VAR_CPU,
&_VX_KERNEL_VAR_VX,
NULL
};
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,102 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_log.h"
#include "kernel/vsi_nn_kernel.h"
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_custom_softmax_param * p;
p = &(self->nn_param.custom_softmax);
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "axis", p->axis );
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
"custom_softmax",
inputs, 1,
outputs, 1, param );
vsi_nn_kernel_param_release( &param );
return VSI_SUCCESS;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
/*TODO: Check params. */
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * node,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
memmove(outputs[0]->attr.size, inputs[0]->attr.size,
inputs[0]->attr.dim_num * sizeof(uint32_t));
}
return TRUE;
} /* op_setup() */
#ifdef __cplusplus
extern "C" {
#endif
/* Registrar */
DEF_OP_REG
(
/* op_name */ CUSTOM_SOFTMAX,
/* init */ NULL,
/* compute */ op_compute,
/* deinit */ vsi_nn_op_common_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ 1,
/* output_num */ 1
);
#ifdef __cplusplus
}
#endif

View File

@ -1,299 +0,0 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_log.h"
#include "client/vsi_nn_vxkernel.h"
#define _ARG_NUM (1)
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
#define _PARAM_NUM (_ARG_NUM + _IO_NUM)
extern vx_kernel_description_t * vx_kernel_CUSTOM_SOFTMAX_list[];
static void _set_inputs_outputs
(
vx_reference * params,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
uint32_t i;
uint32_t cnt;
/* Set inputs */
cnt = 0;
for( i = 0; i < _INPUT_NUM; i ++, cnt ++ )
{
params[cnt] = (vx_reference)inputs[i]->t;
}
/* Set outputs */
for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ )
{
params[cnt] = (vx_reference)outputs[i]->t;
}
} /* _set_inputs_outputs() */
static vsi_status _create_params
(
vsi_nn_node_t * node,
vx_reference * params,
uint32_t num
)
{
vsi_status status;
vx_context ctx;
vsi_nn_custom_softmax_param * p;
if( 0 == num )
{
return VSI_SUCCESS;
}
memset( params, 0, sizeof( vx_reference * ) * num );
p = &(node->nn_param.custom_softmax);
ctx = vxGetContext( (vx_reference)node->graph->g );
/* Init parameters */
#define _SET_PARAM( i, type, arg ) do{ \
params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \
status = vxGetStatus( params[i] ); \
if( VSI_SUCCESS != status ) { \
goto set_param_error; \
} \
} while(0)
_SET_PARAM( 0, VX_TYPE_INT32, axis );
#undef _SET_PARAM
set_param_error:
return status;
} /* _create_params */
static void _release_params
(
vx_reference * params,
uint32_t num
)
{
uint32_t i;
vx_scalar scalar;
for( i = 0; i < num; i ++ )
{
scalar = (vx_scalar)params[i];
vxReleaseScalar( &scalar );
}
} /* _release_params() */
static vsi_status cpu_op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_SUCCESS;
vx_reference params[_PARAM_NUM];
vx_reference * args;
args = &params[_IO_NUM];
if( NULL == self->n )
{
return VSI_FAILURE;
}
/* Set inputs and outputs */
_set_inputs_outputs( params, inputs, outputs );
/* Init parameters. */
_create_params( self, args, _ARG_NUM );
/* Pass parameters to node. */
status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM );
_release_params( args, _ARG_NUM );
return status;
}
static vsi_status vx_op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_SUCCESS;
vx_reference params[_PARAM_NUM];
vx_reference * args;
//vsi_nn_tensor_attr_t attr;
args = &params[_IO_NUM];
if( NULL == self->n )
{
return VSI_FAILURE;
}
/* Set inputs and outputs */
_set_inputs_outputs( params, inputs, outputs );
/*TODO: Add code if need to change your parameter*/
/* Init parameters. */
_create_params( self, args, _ARG_NUM );
#if 0
memcpy(&attr, &(inputs[0]->attr), sizeof(vsi_nn_tensor_attr_t));
attr.size[0] = attr.size[0];
attr.size[1] = 1;
attr.dim_num = 2;
params[0] = (vx_reference)vxReshapeTensor(inputs[0]->t, (int32_t*)(attr.size), attr.dim_num);
params[1] = (vx_reference)vxReshapeTensor(outputs[0]->t, (int32_t*)(attr.size), attr.dim_num);
#endif
/* Init parameters. */
_create_params( self, args, _ARG_NUM );
/* Pass parameters to node. */
status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM );
_release_params( args, _ARG_NUM );
#if 0
vxReleaseTensor((vx_tensor*)&params[0]);
vxReleaseTensor((vx_tensor*)&params[1]);
#endif
return status;
}
static vsi_nn_op_compute_t op_compute_list[] =
{
cpu_op_compute,
vx_op_compute,
NULL
};
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status;
vsi_nn_kernel_info_t kernel_info;
char *path = NULL;
memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t));
status = VSI_FAILURE;
kernel_info.type = VX_KERNEL_TYPE_CPU;
kernel_info.kernel = vx_kernel_CUSTOM_SOFTMAX_list;
kernel_info.resource_num = 1;
kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *));
kernel_info.resource_name[0] = "vsi_nn_kernel_custom_softmax";
path = getenv("USER_VX_SOURCE_PATH");
if(path)
{
vsi_nn_VxResourceSetPath(path);
}
if( kernel_info.type == VX_KERNEL_TYPE_VX)
{
kernel_info.kernel_index = 1;
kernel_info.init_index = 1;
}
else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/
{
kernel_info.kernel_index = 0;
kernel_info.init_index = 0;
}
self->n = vsi_nn_RegisterClientKernelAndNewNode(
self->graph, &kernel_info);
if (kernel_info.resource_name)
{
free(kernel_info.resource_name);
}
if( NULL == self->n )
{
return VSI_FAILURE;
}
if (NULL != op_compute_list[kernel_info.init_index])
{
status = op_compute_list[kernel_info.init_index](self, inputs, outputs);
}
return status;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
/*TODO: Check input tensor shapes. */
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * node,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
/* TODO: Compute output tensor shape. */
if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
outputs[0]->attr.size[0] = inputs[0]->attr.size[0];
outputs[0]->attr.size[1] = inputs[0]->attr.size[1];
outputs[0]->attr.size[2] = inputs[0]->attr.size[2];
outputs[0]->attr.size[3] = inputs[0]->attr.size[3];
}
return TRUE;
} /* op_setup() */
#ifdef __cplusplus
extern "C" {
#endif
/* Registrar */
DEF_OP_REG
(
/* op_name */ CUSTOM_SOFTMAX,
/* init */ NULL,
/* compute */ op_compute,
/* deinit */ vsi_nn_op_common_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
#ifdef __cplusplus
}
#endif

View File

@ -183,26 +183,31 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
int i;
int32_t i;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if(input_dtype == I8)
if (input_dtype == I8)
{
input_dtype = I32;
}
if (output_dtype == I16)
{
output_dtype = I32;
}
key = HASH_ARGMAX_KEY( axis, input_dtype, output_dtype, image_2d );
for( i = 0; i < _cnt_of_array(kernel_map); i ++ )
for ( i = 0; i < _cnt_of_array(kernel_map); i ++ )
{
if( kernel_map[i].key == key )
if ( kernel_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(kernel_map) )
if ( i < _cnt_of_array(kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = kernel_param_def;
@ -237,7 +242,7 @@ static vsi_nn_kernel_node_t _setup
axis = vsi_nn_kernel_param_get_int32(params, "axis");
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size,
inputs[0]->attr.dim_num )
|| !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num )
@ -250,11 +255,11 @@ static vsi_nn_kernel_node_t _setup
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
status = _query_kernel( inputs, outputs, axis, image_2d, kernel );
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
inputs, 1, outputs, 1 );

View File

@ -183,20 +183,26 @@ static vsi_status _query_kernel
vsi_nn_kernel_dtype_e output_dtype;
vsi_status status = VSI_FAILURE;
uint32_t key;
int i;
int32_t i;
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (output_dtype == I16)
{
output_dtype = I32;
}
key = HASH_ARGMIN_KEY( axis, input_dtype, output_dtype, image_2d );
for( i = 0; i < _cnt_of_array(kernel_map); i ++ )
for ( i = 0; i < _cnt_of_array(kernel_map); i ++ )
{
if( kernel_map[i].key == key )
if ( kernel_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(kernel_map) )
if ( i < _cnt_of_array(kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = kernel_param_def;
@ -231,7 +237,7 @@ static vsi_nn_kernel_node_t _setup
axis = vsi_nn_kernel_param_get_int32(params, "axis");
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size,
inputs[0]->attr.dim_num )
|| !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num )
@ -244,11 +250,11 @@ static vsi_nn_kernel_node_t _setup
image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1);
status = _query_kernel( inputs, outputs, axis, image_2d, kernel );
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM,
inputs, 1, outputs, 1 );

View File

@ -186,7 +186,7 @@ static vsi_status _query_kernel
{
in_dtype = F32;
}
else if ((I8 == in_dtype) || (I16 == in_dtype))
else if ((I8 == in_dtype) || (BOOL8 == in_dtype) || (I16 == in_dtype))
{
in_dtype = I32;
}

View File

@ -289,6 +289,12 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && output_dtype == I8)
{
output_dtype = BOOL8;
}
key = HASH_COMPARISONS_KEY( operation, input0_dtype, input1_dtype, output_dtype, image_2d );
for( i = 0; i < _cnt_of_array(_comparisons_cl_kernel_map); i ++ )

View File

@ -48,6 +48,7 @@ typedef enum
UNARY_NEG,
UNARY_HSIGMOID,
UNARY_MISH,
UNARY_ROUND,
} unary_type_e;
/*
@ -91,7 +92,8 @@ typedef enum
#define ELU_OPERATION elu
#define NEG_OPERATION neg
#define HSIGMOID_OPERATION hard_sigmoid
#define MISH_OPERATION mish
#define MISH_OPERATION mish
#define ROUND_OPERATION round
static const struct {
uint32_t key;
@ -113,6 +115,8 @@ static const struct {
TENSOR_UNARY_KERNELS_FLOAT(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(MISH_OPERATION, UNARY_MISH, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(MISH_OPERATION, UNARY_MISH, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(ROUND_OPERATION, UNARY_ROUND, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(ROUND_OPERATION, UNARY_ROUND, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F16, F16)
@ -128,6 +132,8 @@ static const struct {
TENSOR_UNARY_KERNELS_FLOAT_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(MISH_OPERATION, UNARY_MISH, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(MISH_OPERATION, UNARY_MISH, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(ROUND_OPERATION, UNARY_ROUND, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16)
TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, U8, U8)
TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, U8, U8)
@ -136,6 +142,7 @@ static const struct {
TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, U8, U8)
TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, U8, U8)
TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, U8, U8)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8)
@ -144,6 +151,7 @@ static const struct {
TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8)
TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8)
TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, I32, I32)
@ -157,6 +165,7 @@ static const struct {
#undef NEG_OPERATION
#undef HSIGMOID_OPERATION
#undef MISH_OPERATION
#undef ROUND_OPERATION
/*
* Kernel params
*/
@ -407,5 +416,5 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( elu, UNARY_ELU )
REGISTER_ELTWISE_UNARY_BACKEND_CL( neg, UNARY_NEG )
REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_sigmoid, UNARY_HSIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_CL( mish, UNARY_MISH )
REGISTER_ELTWISE_UNARY_BACKEND_CL( round, UNARY_ROUND )
__END_DECLS

View File

@ -0,0 +1,328 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define HASH_UNARY_KEY(_input_type, _output_type, _image_2d) \
( (_input_type << 12) | (_output_type << 4) | (_image_2d))
#define VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() \
"erf"
#define HASH_UNARY_SH_KERNEL_NAME( SRC_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl.erf_"#SRC_TYPE"to"#DST_TYPE)
#define TENSOR_UNARY_KERNELS(SRC_TYPE, OUT_TYPE) \
{ HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 0), \
HASH_UNARY_SH_KERNEL_NAME(SRC_TYPE, OUT_TYPE), \
VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() },
#define HASH_UNARY_SH_KERNEL_2D_NAME(SRC_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl.erf_"#SRC_TYPE"to"#DST_TYPE"_2D")
#define TENSOR_UNARY_KERNELS_2D(SRC_TYPE, OUT_TYPE) \
{ HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 1), \
HASH_UNARY_SH_KERNEL_2D_NAME(SRC_TYPE, OUT_TYPE), \
VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() },
#define TENSOR_UNARY_KERNELS_FLOAT(SRC_TYPE, OUT_TYPE) \
{ HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 0), \
HASH_UNARY_SH_KERNEL_NAME(F32, F32), \
VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() },
#define TENSOR_UNARY_KERNELS_FLOAT_2D(SRC_TYPE, OUT_TYPE) \
{ HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 1), \
HASH_UNARY_SH_KERNEL_2D_NAME(F32, F32), \
VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() },
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _erf_kernel_map[] =
{
// Register kernel here
TENSOR_UNARY_KERNELS_FLOAT(F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(F16, F16)
TENSOR_UNARY_KERNELS(U8, U8)
TENSOR_UNARY_KERNELS_2D(U8, U8)
};
/*
* Kernel params
*/
static vx_param_description_t _erf_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define SCALAR_INPUT_SCALE (2)
#define SCALAR_INPUT_TAIL (3)
#define SCALAR_OUTPUT_SCALE (4)
#define SCALAR_OUTPUT_ZP (5)
#define _ERF_PARAM_NUM _cnt_of_array( _erf_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_erf_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels can be processed by a single thread
{0, 0, 0}, // localWorkSize: local work-group size in threads
{0, 0, 0} // globalWorkSize: image size in threads
};
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_int_array_t * out_shape = NULL;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
out_shape = attr[1]->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(attr[0]);
SAFE_FREE_TENSOR_ATTR(attr[1]);
#undef SAFE_FREE_TENSOR_ATTR
return status;
} /* _erf_initializer() */
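/* Worked example (editorial, illustrative): with an output shape of {50, 30, 8}
 * and global_scale {1, 1, 1}, the initializer above produces
 *     global_size[0] = gpu_align_p2(50, 4) = 52   (X padded up to a multiple of 4)
 *     global_size[1] = 30
 *     global_size[2] = 8
 * i.e. roughly one work-item per output element, with a small amount of X-axis
 * over-dispatch that the shader is expected to bounds-check. */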
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool image_2d
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _erf_kernel_map;
size_t kernel_map_size = _cnt_of_array( _erf_kernel_map );
vx_param_description_t * param_def = _erf_kernel_param_def;
vx_kernel_initialize_f initializer = _erf_initializer;
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_UNARY_KEY( in_dtype, out_dtype, image_2d );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _erf_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_ERF_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_nn_tensor_t* rs_tensors[2] = { NULL };
int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
int32_t new_rank = 0;
vsi_bool ret = FALSE;
vsi_bool image_2d = FALSE;
float inputScale = inputs[0]->attr.dtype.scale;
float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale;
float outputScale = outputs[0]->attr.dtype.scale;
float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f;
ret = vsi_nn_kernel_optimize_element_shape(
(int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num,
shape, &new_rank );
if ( ret )
{
rs_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], (uint32_t*)shape, new_rank );
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], (uint32_t*)shape, new_rank );
}
if ( !ret || !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[0]->attr.size,
rs_tensors[0]->attr.dim_num ) )
{
goto OnError;
}
outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale;
image_2d = (rs_tensors[0]->attr.dim_num == 2 || rs_tensors[0]->attr.size[2] == 1);
status = _query_kernel( kernel, inputs, outputs, image_2d );
if ( VSI_SUCCESS == status )
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _ERF_PARAM_NUM,
rs_tensors, 1, &rs_tensors[1], 1 );
node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &inputScale );
node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(
graph, F32, &inputTail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &outputScale );
node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &outputZP );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _ERF_PARAM_NUM );
CHECK_STATUS_FAIL_GOTO( status, OnError );
}
}
OnError:
if (rs_tensors[0])
{
vsi_nn_ReleaseTensor( &rs_tensors[0] );
}
if (rs_tensors[1])
{
vsi_nn_ReleaseTensor( &rs_tensors[1] );
}
if (node_params[SCALAR_INPUT_SCALE])
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
}
if (node_params[SCALAR_INPUT_TAIL])
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
}
if (node_params[SCALAR_OUTPUT_SCALE])
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
}
if (node_params[SCALAR_OUTPUT_ZP])
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( erf, _setup )

View File

@ -68,11 +68,15 @@ static const _kernel_map_type _floordiv_kernel_map[] =
// Register kernel here
FLOORDIV_KERNELS( F32, F32, F32 )
FLOORDIV_KERNELS( I32, I32, I32 )
FLOORDIV_KERNELS( I32, I32, U8 )
FLOORDIV_KERNELS( U8, U8, U8 )
FLOORDIV_KERNELS( U8, I32, U8 )
FLOORDIV_KERNELS_2D( F32, F32, F32 )
FLOORDIV_KERNELS_2D( I32, I32, I32 )
FLOORDIV_KERNELS_2D( I32, I32, U8 )
FLOORDIV_KERNELS_2D( U8, U8, U8 )
FLOORDIV_KERNELS_2D( U8, I32, U8 )
};
@ -311,4 +315,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CL( floordiv, _setup )

View File

@ -0,0 +1,760 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum
{
INTERNAL_KERNEL_SUM_SQR,
INTERNAL_KERNEL_MEAN_VARI,
INTERNAL_KERNEL_NORM,
} _internal_kernel_e;
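/* Editorial note (illustrative): group normalization is realized here as a
 * three-pass CL pipeline: SUM_SQR accumulates per-group sum(x) and sum(x*x),
 * MEAN_VARI reduces those partials into per-group mean and variance, and NORM
 * applies what is presumably the usual transform
 *     y = (x - mean) * rsqrt(variance + eps) * gamma + beta
 * with gamma/beta supplied as inputs[1]/inputs[2] in _setup() below. */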
#define KERNEL_SOURCE_1 "group_normalization_u8"
#define KERNEL_SOURCE_2 "group_normalization_f32"
#define KERNEL_SOURCE_3 "group_normalization_i32"
// Add kernel hashtable here
#define HASH_GROUPNORM_SUM_SQR_KERNEL_NAME(SRC0_TYPE) \
CVIVANTE_NAMESPACE("cl.group_norm_sumsqr_"#SRC0_TYPE)
#define HASH_GROUPNORM_SUM_SQR_KERNEL_2D_NAME(SRC0_TYPE) \
CVIVANTE_NAMESPACE("cl.group_norm_sumsqr_"#SRC0_TYPE"_2D")
#define HASH_GROUPNORM_MEAN_VARI_KERNEL_NAME \
CVIVANTE_NAMESPACE("cl.group_norm_meanvari")
#define HASH_GROUPNORM_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl.group_norm_"#SRC0_TYPE"to"#DST_TYPE)
#define HASH_GROUPNORM_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl.group_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D")
// Add kernel hashtable here
// sum sqr
#define HASH_GROUPNORM_SUM_SQR_KEY(_input0_type, _output_type, _reshape_flag) \
((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8))
#define TENSOR_GROUPNORM_SUM_SQR_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_GROUPNORM_SUM_SQR_KEY(IN0_TYPE, OUT_TYPE, 0), \
HASH_GROUPNORM_SUM_SQR_KERNEL_NAME(IN0_TYPE), \
SOURCE },
#define TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_GROUPNORM_SUM_SQR_KEY(IN0_TYPE, OUT_TYPE, 1), \
HASH_GROUPNORM_SUM_SQR_KERNEL_2D_NAME(IN0_TYPE), \
SOURCE },
#define HASH_GROUPNORM_MEAN_VARI_KEY(_input0_type, _output_type) \
((_input0_type << 24) | (_output_type << 16))
#define TENSOR_GROUPNORM_MEAN_VARI_KERNELS(SOURCE) \
{ HASH_GROUPNORM_MEAN_VARI_KEY(F32, F32), \
HASH_GROUPNORM_MEAN_VARI_KERNEL_NAME, \
SOURCE },
// normalization
#define HASH_GROUPNORM_KEY(_input0_type, _output_type, _reshape_flag) \
((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8))
#define TENSOR_GROUPNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_GROUPNORM_KEY(IN0_TYPE, OUT_TYPE, 0), \
HASH_GROUPNORM_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
#define TENSOR_GROUPNORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_GROUPNORM_KEY(IN0_TYPE, OUT_TYPE, 1), \
HASH_GROUPNORM_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _groupnorm_sum_sqr_kernel_map[] =
{
// Register kernel here
TENSOR_GROUPNORM_SUM_SQR_KERNELS( U8, F32, KERNEL_SOURCE_1 )
TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( U8, F32, KERNEL_SOURCE_1 )
TENSOR_GROUPNORM_SUM_SQR_KERNELS( F32, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( F32, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_SUM_SQR_KERNELS( I32, F32, KERNEL_SOURCE_3 )
TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 )
};
static const _kernel_map_type _groupnorm_mean_vari_kernel_map[] =
{
// Register kernel here
TENSOR_GROUPNORM_MEAN_VARI_KERNELS( KERNEL_SOURCE_2 )
};
static const _kernel_map_type _groupnorm_kernel_map[] =
{
// Register kernel here
TENSOR_GROUPNORM_KERNELS( U8, U8, KERNEL_SOURCE_1 )
TENSOR_GROUPNORM_KERNELS_2D( U8, U8, KERNEL_SOURCE_1 )
TENSOR_GROUPNORM_KERNELS( U8, F32, KERNEL_SOURCE_1 )
TENSOR_GROUPNORM_KERNELS_2D( U8, F32, KERNEL_SOURCE_1 )
TENSOR_GROUPNORM_KERNELS( F32, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_KERNELS_2D( F32, F32, KERNEL_SOURCE_2 )
TENSOR_GROUPNORM_KERNELS( I32, I32, KERNEL_SOURCE_3 )
TENSOR_GROUPNORM_KERNELS_2D( I32, I32, KERNEL_SOURCE_3 )
TENSOR_GROUPNORM_KERNELS( I32, F32, KERNEL_SOURCE_3 )
TENSOR_GROUPNORM_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 )
};
/*
* Kernel params
*/
static vx_param_description_t _groupnorm_sum_sqr_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _GROUPNORM_SUM_SQR_PARAM_NUM _cnt_of_array( _groupnorm_sum_sqr_kernel_param_def )
static vx_param_description_t _groupnorm_mean_vari_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _GROUPNORM_MEAN_VARI_PARAM_NUM _cnt_of_array( _groupnorm_mean_vari_kernel_param_def )
static vx_param_description_t _groupnorm_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _GROUPNORM_PARAM_NUM _cnt_of_array( _groupnorm_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_int_array_t * input_shape = NULL;
int32_t width = 0;
int32_t chn = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
input_shape = attr[0]->shape;
width = input_shape->data[0];
chn = attr[1]->shape->data[1];
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.local_size[0] = 16;
gpu_param.local_size[1] = 1;
gpu_param.local_size[2] = 1;
gpu_param.global_size[0] = (width + 15) / 16 * 16;
gpu_param.global_size[1] = chn;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _groupnorm_sum_sqr_initializer() */
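/* Workgroup sizing example (editorial, illustrative): for an optimized input
 * width of 100 the code above requests local_size {16, 1, 1} and
 *     global_size[0] = (100 + 15) / 16 * 16 = 112
 * i.e. 7 workgroups of 16 threads per (group, batch) row; threads beyond the
 * image width are expected to be masked out inside the shader. */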
DEF_KERNEL_INITIALIZER(_groupnorm_mean_vari_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
int32_t chn = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
chn = attr[0]->shape->data[1];
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.local_size[0] = 16;
gpu_param.local_size[1] = 1;
gpu_param.local_size[2] = 1;
gpu_param.global_size[0] = 16;
gpu_param.global_size[1] = chn;
gpu_param.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
} /* _groupnorm_mean_vari_initializer() */
DEF_KERNEL_INITIALIZER(_groupnorm_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_int_array_t * input_shape = NULL;
int32_t width = 0;
int32_t height = 0;
int32_t chn = 0;
int32_t is2D = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &is2D);
CHECK_STATUS_FAIL_GOTO(status, final );
input_shape = attr[0]->shape;
width = input_shape->data[0];
height = input_shape->data[1];
chn = attr[1]->shape->data[1];
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.local_size[0] = 16;
gpu_param.local_size[1] = 1;
gpu_param.local_size[2] = 1;
gpu_param.global_size[0] = (width + 15) / 16 * 16;
gpu_param.global_size[1] = height;
gpu_param.global_size[2] = chn;
if (is2D)
{
gpu_param.global_size[0] = (width + 15) / 16 * 16;
gpu_param.global_size[1] = chn;
gpu_param.global_size[2] = 1;
}
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _groupnorm_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
const uint32_t hashkey,
_internal_kernel_e kernel_id
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
vx_kernel_initialize_f initializer = NULL;
vx_param_description_t * param_def = NULL;
const _kernel_map_type* kernel_map;
size_t kernel_map_size = 0;
size_t param_size = 0;
uint32_t i = 0;
switch( kernel_id )
{
case INTERNAL_KERNEL_SUM_SQR:
initializer = _groupnorm_sum_sqr_initializer;
kernel_map = _groupnorm_sum_sqr_kernel_map;
kernel_map_size = _cnt_of_array( _groupnorm_sum_sqr_kernel_map );
param_def = _groupnorm_sum_sqr_kernel_param_def;
param_size = _GROUPNORM_SUM_SQR_PARAM_NUM;
break;
case INTERNAL_KERNEL_MEAN_VARI:
initializer = _groupnorm_mean_vari_initializer;
kernel_map = _groupnorm_mean_vari_kernel_map;
kernel_map_size = _cnt_of_array( _groupnorm_mean_vari_kernel_map );
param_def = _groupnorm_mean_vari_kernel_param_def;
param_size = _GROUPNORM_MEAN_VARI_PARAM_NUM;
break;
case INTERNAL_KERNEL_NORM:
initializer = _groupnorm_initializer;
kernel_map = _groupnorm_kernel_map;
kernel_map_size = _cnt_of_array( _groupnorm_kernel_map );
param_def = _groupnorm_kernel_param_def;
param_size = _GROUPNORM_PARAM_NUM;
break;
default:
VSI_ASSERT( FALSE );
return VSI_FAILURE;
}
for( i = 0; i < kernel_map_size; i ++ )
{
if ( kernel_map[i].key == hashkey )
{
break;
}
}
if ( i < kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static int32_t _optimize_gn_shape_cl
(
vsi_nn_tensor_t ** inputs,
int32_t group_size,
int32_t group_num,
int32_t* opt_shape,
int32_t* is2D_flg
)
{
vsi_status status = VSI_SUCCESS;
int32_t group_shape[VSI_NN_MAX_DIM_NUM] = {0};
int32_t new_rank = 0;
group_shape[0] = inputs[0]->attr.size[0];
group_shape[1] = inputs[0]->attr.size[1];
group_shape[2] = group_size;
vsi_nn_kernel_optimize_element_shape(group_shape, 3, opt_shape, &new_rank );
if (opt_shape[1] == 1)
{
opt_shape[1] = group_num;
opt_shape[2] = 1;
opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
is2D_flg[0] = 1;
}
else if (new_rank == 2)
{
opt_shape[2] = group_num;
opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
}
else
{
status = VSI_FAILURE;
}
return status;
}
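/* Worked example (editorial, illustrative; assumes the element-shape optimizer
 * flattens contiguous dims): an input of size {16, 16, 32, 1} with
 * group_num = 4 gives group_size = 8, so group_shape = {16, 16, 8} would
 * collapse to {2048, 1}. Because opt_shape[1] == 1, the function returns the
 * 2D layout opt_shape = {2048, 4, 1, 1} (elements-per-group x group_num) with
 * is2D_flg set to 1. */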
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
#define INTERNAL_KERNEL_SIZE (2)
#define SUM_SQR_INDEX (0)
#define MEAN_VARI_INDEX (1)
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t sum_sqr_node_params[_GROUPNORM_SUM_SQR_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_param_t mean_vari_node_params[_GROUPNORM_MEAN_VARI_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_param_t node_params[_GROUPNORM_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t tmp_node = NULL, tmp_node1 = NULL;
vsi_nn_kernel_node_t node = NULL;
vsi_nn_kernel_dtype_e in0_dtype = U8;
vsi_nn_kernel_dtype_e out_dtype = U8;
vsi_nn_tensor_attr_t attr;
vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL };
vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL };
vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
int32_t new_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 };
int32_t is2D_flg = 0;
uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 };
uint32_t hashkey = 0;
int32_t i = 0;
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
int32_t group_num = vsi_nn_kernel_param_get_int32( params, "group_num" );
int32_t group_size = inputs[0]->attr.size[2] / group_num;
int32_t width = inputs[0]->attr.size[0];
int32_t height = inputs[0]->attr.size[1];
int32_t group_stride = 1;
float input_zp = 0;
float input_scale = 1.0f;
int32_t input_fl = 0;
float output_zp = 0;
float output_scale = 1.0f;
int32_t output_fl = 0;
float rSpaceOrg = 1.0f / (width * height);
float group_ratio = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] * group_size);
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _optimize_gn_shape_cl(inputs, group_size, group_num, new_shape, &is2D_flg);
if ( VSI_SUCCESS != status )
{
goto final;
}
rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4);
rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4);
width = new_shape[0];
height = is2D_flg > 0 ? 1 : new_shape[1];
group_stride = ((width + 15) / 16) * 4;
if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
{
input_zp = (float)inputs[0]->attr.dtype.zero_point;
input_scale = inputs[0]->attr.dtype.scale;
}
else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
input_fl = inputs[0]->attr.dtype.fl;
if (input_fl > 0)
{
input_scale = (1.0f / ((float) ((int64_t)1 << input_fl)));
}
else
{
input_scale = ((float) ((int64_t)1 << -input_fl));
}
input_zp = 0.0f;
}
if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
{
output_zp = (float)outputs[0]->attr.dtype.zero_point;
output_scale = 1.0f / outputs[0]->attr.dtype.scale;
}
else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
output_fl = outputs[0]->attr.dtype.fl;
if (output_fl > 0)
{
output_scale = (float)((int64_t)1 << output_fl);
}
else
{
output_scale = (1.0f / (float)((int64_t)1 << -output_fl));
}
output_zp = 0.0f;
}
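/* DFP scale example (editorial, illustrative): a dynamic-fixed-point input with
 * fl = 7 yields input_scale = 1 / (1 << 7) = 0.0078125, while fl = -2 yields
 * input_scale = (1 << 2) = 4.0. For outputs the reciprocal is stored
 * (fl = 7 -> output_scale = 128) so the shader can quantize with a multiply. */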
for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )
{
ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_CL );
// Assign unique_id
ikernels[i]->unique_id = kernel->unique_id;
}
memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) );
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
attr.is_const = FALSE;
attr.vtl = TRUE;
attr.size[0] = ((new_shape[0] + 15) / 16) * 4;
attr.size[1] = group_num;
attr.size[2] = 1;
attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
attr.dim_num = 4;
tensors[SUM_SQR_INDEX] = vsi_nn_CreateTensor( graph, &attr );
attr.size[0] = 4;
tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr );
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (in0_dtype == F16)
{
in0_dtype = F32;
}
if (out_dtype == F16)
{
out_dtype = F32;
}
hashkeys[SUM_SQR_INDEX]= HASH_GROUPNORM_SUM_SQR_KEY( in0_dtype, F32, is2D_flg );
hashkeys[MEAN_VARI_INDEX]= HASH_GROUPNORM_MEAN_VARI_KEY( F32, F32 );
hashkey = HASH_GROUPNORM_KEY( in0_dtype, out_dtype, is2D_flg );
status = _query_kernel( ikernels[SUM_SQR_INDEX], hashkeys[SUM_SQR_INDEX], INTERNAL_KERNEL_SUM_SQR );
if ( VSI_SUCCESS != status )
{
goto final;
}
status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI );
if ( VSI_SUCCESS != status )
{
goto final;
}
status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_NORM );
if ( VSI_SUCCESS != status )
{
goto final;
}
// Sum Sqr
tmp_node = vsi_nn_kernel_create_node( graph, ikernels[SUM_SQR_INDEX] );
if (tmp_node)
{
uint32_t index = 0;
sum_sqr_node_params[index++] = rs_input;
sum_sqr_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t;
sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is2D_flg );
sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp );
sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
status = vsi_nn_kernel_node_pass_param( tmp_node, sum_sqr_node_params,
_GROUPNORM_SUM_SQR_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &sum_sqr_node_params[2] );
vsi_nn_kernel_scalar_release( &sum_sqr_node_params[3] );
vsi_nn_kernel_scalar_release( &sum_sqr_node_params[4] );
vsi_nn_kernel_scalar_release( &sum_sqr_node_params[5] );
vsi_nn_kernel_scalar_release( &sum_sqr_node_params[6] );
vsi_nn_kernel_scalar_release( &sum_sqr_node_params[7] );
vsi_nn_kernel_node_release( &tmp_node );
}
// mean vari
tmp_node1 = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] );
if (tmp_node1)
{
uint32_t index = 0;
mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t;
mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t;
mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &group_ratio );
mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &group_stride );
status = vsi_nn_kernel_node_pass_param( tmp_node1, mean_vari_node_params,
_GROUPNORM_MEAN_VARI_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &mean_vari_node_params[2] );
vsi_nn_kernel_scalar_release( &mean_vari_node_params[3] );
vsi_nn_kernel_scalar_release( &mean_vari_node_params[4] );
vsi_nn_kernel_node_release( &tmp_node1 );
}
// Normalization
node = vsi_nn_kernel_create_node( graph, kernel );
if (node)
{
uint32_t index = 0;
int32_t pStride = 0;
if (!is2D_flg)
{
pStride = inputs[1]->attr.size[0] / new_shape[1];
rSpaceOrg = 1.0f / (new_shape[0] / pStride);
}
node_params[index++] = rs_input;
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t;
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t;
node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t;
node_params[index++] = rs_output;
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is2D_flg );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rSpaceOrg );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pStride );
status = vsi_nn_kernel_node_pass_param( node, node_params,
_GROUPNORM_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
vsi_nn_kernel_scalar_release( &node_params[8] );
vsi_nn_kernel_scalar_release( &node_params[9] );
vsi_nn_kernel_scalar_release( &node_params[10] );
vsi_nn_kernel_scalar_release( &node_params[11] );
vsi_nn_kernel_scalar_release( &node_params[12] );
vsi_nn_kernel_scalar_release( &node_params[13] );
vsi_nn_kernel_scalar_release( &node_params[14] );
}
/* Pass parameters to node. */
final:
if (rs_input)
{
vsi_nn_kernel_tensor_release( &rs_input );
}
if (rs_output)
{
vsi_nn_kernel_tensor_release( &rs_output );
}
for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )
{
if ( ikernels[i] )
{
vsi_nn_kernel_release( &ikernels[i] );
}
if ( tensors[i] )
{
vsi_nn_ReleaseTensor( &tensors[i] );
}
}
#undef INTERNAL_KERNEL_SIZE
#undef SUM_SQR_INDEX
#undef MEAN_VARI_INDEX
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( group_norm, _setup )

View File

@ -176,19 +176,19 @@ static int32_t get_moments_output_reshape_size
}
sizes[3] = out_dims_num > 3 ? output_size[3] : 1;
if(axis_num == 1 && axis[0] == 0)
if (axis_num == 1 && axis[0] == 0)
{
sizes[0] = output_size[1];
sizes[1] = out_dims_num > 2 ? output_size[2] : 1;
out_rs_flg = 1;
}
else if(axis_num == 1 && axis[0] == 1)
else if (axis_num == 1 && axis[0] == 1)
{
sizes[0] = output_size[0];
sizes[1] = out_dims_num > 2 ? output_size[2] : 1;
out_rs_flg = 1;
}
else if(axis_num == 2 && axis[0] == 0 && axis[1] == 1)
else if (axis_num == 2 && axis[0] == 0 && axis[1] == 1)
{
sizes[0] = out_dims_num > 2 ? output_size[2] : 1;
out_rs_flg = 1;
@ -240,25 +240,25 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
if(axis_num == 1 && axis == 0)
if (axis_num == 1 && axis == 0)
{
gpu_param.global_size[0] = gpu_align_p2((height + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = chn;
}
else if(axis_num == 1 && axis == 1)
else if (axis_num == 1 && axis == 1)
{
gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = chn;
}
else if(axis_num == 1 && axis == 2)
else if (axis_num == 1 && axis == 2)
{
gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = height;
}
else if(axis_num == 2)
else if (axis_num == 2)
{
gpu_param.local_size[0] = 16;
gpu_param.local_size[1] = 1;
@ -266,7 +266,7 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
gpu_param.global_size[0] = 16;
gpu_param.global_size[1] = chn;
}
else if(axis_num == 3)
else if (axis_num == 3)
{
gpu_param.local_size[0] = 16;
gpu_param.local_size[1] = 1;
@ -315,13 +315,13 @@ static vsi_status _query_kernel
for( i = 0; i < _cnt_of_array(moments_map); i ++ )
{
if( moments_map[i].key == key )
if ( moments_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(moments_map) )
if ( i < _cnt_of_array(moments_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", moments_map[i].function_name );
kernel->info.parameters = _moments_kernel_param_def;
@ -354,6 +354,7 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_MOMENTS_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t out_shape[VSI_NN_MAX_DIM_NUM] = {0};
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
int32_t out_rs_flg = 0;
int32_t axis_num = 0;
size_t axis_num_temp = 0;
@ -362,6 +363,7 @@ static vsi_nn_kernel_node_t _setup
int32_t first_axis = axis[0];
int32_t i = 0;
vsi_nn_kernel_scalar_t scalar_list[INTERNAL_MOMENTS_SCALAR_NUM] = {NULL};
vsi_nn_kernel_tensor_t reshape_tensors[3] = { NULL };
int32_t width = inputs[0]->attr.size[0];
int32_t height = inputs[0]->attr.size[1];
@ -372,7 +374,7 @@ static vsi_nn_kernel_node_t _setup
axis_num = (int32_t)axis_num_temp;
if(inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
if (inputs[0]->attr.dtype.fl > 0)
{
@ -385,38 +387,52 @@ static vsi_nn_kernel_node_t _setup
input_zp = 0;
}
if(axis_num == 1 && axis[0] == 0)
if (axis_num == 1 && axis[0] == 0)
{
dim_ratio = (float)1.0 / (float)(width);
}
else if(axis_num == 1 && axis[0] == 1)
else if (axis_num == 1 && axis[0] == 1)
{
dim_ratio = (float)1.0 / (float)(height);
}
else if(axis_num == 1 && axis[0] == 2)
else if (axis_num == 1 && axis[0] == 2)
{
dim_ratio = (float)1.0 / (float)(chn);
}
else if(axis_num == 2 && axis[0] == 0 && axis[1] == 1)
else if (axis_num == 2 && axis[0] == 0 && axis[1] == 1)
{
dim_ratio = (float)1.0 / (float)(width * height);
}
else if(axis_num == 3)
else if (axis_num == 3)
{
dim_ratio = (float)1.0 / (float)(width * height * chn);
}
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
if(keep_dim)
if (keep_dim)
{
out_rs_flg = get_moments_output_reshape_size(&outputs[0], out_shape, axis, axis_num);
}
if (inputs[0]->attr.dim_num < 2)
{
shape[0] = inputs[0]->attr.size[0];
shape[1] = 1;
reshape_tensors[0] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 2 );
}
if (outputs[0]->attr.dim_num < 2)
{
shape[0] = outputs[0]->attr.size[0];
shape[1] = 1;
reshape_tensors[1] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 2 );
reshape_tensors[2] = vsi_nn_kernel_tensor_reshape( outputs[1]->t, shape, 2 );
}
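/* Editorial note (illustrative): the CL moments kernels appear to require at
 * least two dimensions, so rank-1 inputs/outputs are padded here with a
 * trailing dimension of 1 before being bound to the node. */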
scalar_list[AXIS] = vsi_nn_kernel_scalar_create( graph, I32, &first_axis );
scalar_list[AXIS_NUM] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num );
scalar_list[ZP] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp );
@ -427,19 +443,31 @@ static vsi_nn_kernel_node_t _setup
scalar_list[DIMRATIO] = vsi_nn_kernel_scalar_create( graph, F32, &dim_ratio );
status = _query_kernel( inputs, outputs, kernel, params, axis, axis_num, 0 );
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 0;
/* Pass parameters to node. */
node_params[index++] = (vsi_nn_kernel_node_param_t)(inputs[0]->t);
if(out_rs_flg)
if (reshape_tensors[0])
{
node_params[index++] = reshape_tensors[0];
}
else
{
node_params[index++] = (vsi_nn_kernel_node_param_t)(inputs[0]->t);
}
if (out_rs_flg)
{
node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, out_shape, 4 );
node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[1]->t, out_shape, 4 );
}
else if (reshape_tensors[1])
{
node_params[index++] = reshape_tensors[1];
node_params[index++] = reshape_tensors[2];
}
else
{
node_params[index++] = (vsi_nn_kernel_node_param_t)(outputs[0]->t);
@ -455,7 +483,7 @@ static vsi_nn_kernel_node_t _setup
node_params[index++] = scalar_list[DIMRATIO];
status = vsi_nn_kernel_node_pass_param( node, node_params, _MOMENTS_PARAM_NUM );
CHECK_STATUS(status);
if(out_rs_flg)
if (out_rs_flg)
{
vsi_nn_kernel_tensor_release( &node_params[1] );
vsi_nn_kernel_tensor_release( &node_params[2] );
@ -465,10 +493,22 @@ static vsi_nn_kernel_node_t _setup
}
}
if (reshape_tensors[0])
{
vsi_nn_kernel_tensor_release( &reshape_tensors[0] );
}
if (reshape_tensors[1])
{
vsi_nn_kernel_tensor_release( &reshape_tensors[1] );
}
if (reshape_tensors[2])
{
vsi_nn_kernel_tensor_release( &reshape_tensors[2] );
}
/* Pass parameters to node. */
for( i = 0; i < INTERNAL_MOMENTS_SCALAR_NUM; i ++ )
{
if(scalar_list[i])
if (scalar_list[i])
{
vsi_nn_kernel_scalar_release( &scalar_list[i] );
}

View File

@ -0,0 +1,332 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum
{
INTERNAL_KERNEL_ONE_HOT,
} _internal_kernel_e;
#define _ONE_HOT_KERNEL_SOURCE "one_hot"
// Add kernel hashtable here
#define HASH_ONE_HOT_SH_KERNEL_NAME(SRC_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl.one_hot_"#SRC_TYPE"to"#DST_TYPE)
#define ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
(( IN_DTYPE << 8 ) | ( OUT_DTYPE ))
#define PACK_ONE_HOT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
{ ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \
HASH_ONE_HOT_SH_KERNEL_NAME( IN_DTYPE, OUT_DTYPE ), \
_ONE_HOT_KERNEL_SOURCE }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _one_hot_kernel_map[] =
{
// Register kernel here
PACK_ONE_HOT_KERNEL_MAP( F32, F32 ),
PACK_ONE_HOT_KERNEL_MAP( I32, I32 ),
PACK_ONE_HOT_KERNEL_MAP( I32, F32 ),
PACK_ONE_HOT_KERNEL_MAP( I32, U8 ),
PACK_ONE_HOT_KERNEL_MAP( U8, U8 ),
};
/*
* Kernel params
*/
static vx_param_description_t _one_hot_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define SCALAR_INPUT_DEPTH (2)
#define SCALAR_INPUT_ON_VALUE (3)
#define SCALAR_INPUT_OFF_VALUE (4)
#define SCALAR_INPUT_SCALE (5)
#define SCALAR_INPUT_TAIL (6)
#define _ONE_HOT_PARAM_NUM _cnt_of_array( _one_hot_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_one_hot_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
2, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels can be processed by a single thread
{0, 0, 0}, // localWorkSize: local work-group size in threads
{0, 0, 0} // globalWorkSize: image size in threads
};
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_int_array_t * in_shape = NULL;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
in_shape = attr[0]->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_size[0] = gpu_align_p2(
(in_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = in_shape->data[1];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(attr[0]);
SAFE_FREE_TENSOR_ATTR(attr[1]);
#undef SAFE_FREE_TENSOR_ATTR
return status;
} /* _one_hot_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _one_hot_kernel_map;
size_t kernel_map_size = _cnt_of_array( _one_hot_kernel_map );
vx_param_description_t * param_def = _one_hot_kernel_param_def;
vx_kernel_initialize_f initializer = _one_hot_initializer;
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (in_dtype == F16)
{
in_dtype = F32;
}
if (out_dtype == F16)
{
out_dtype = F32;
}
else if (out_dtype == I16 || out_dtype == I8)
{
out_dtype = I32;
}
key = ONE_HOT_HASH_KEY( in_dtype, out_dtype );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _one_hot_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_ONE_HOT_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_nn_tensor_t* rs_tensors[2] = { NULL };
int32_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }};
int32_t i = 0;
int32_t num_elements = vsi_nn_vxGetTensorElementNum(&inputs[0]->attr);
int32_t prefix_dim_size = 1;
int32_t suffix_dim_size = 0;
int32_t depth = vsi_nn_kernel_param_get_int32( params, "depth" );
vsi_nn_kernel_dtype_e out_dtype;
uint32_t data[2] = {0};
float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" );
float off_value = vsi_nn_kernel_param_get_float32( params, "off_value" );
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
float inputScale = inputs[0]->attr.dtype.scale;
float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale;
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (out_dtype != F32 && out_dtype != F16)
{
vsi_nn_Float32ToDtype(on_value, (uint8_t*)&data[0], &outputs[0]->attr.dtype);
vsi_nn_Float32ToDtype(off_value, (uint8_t*)&data[1], &outputs[0]->attr.dtype);
}
else
{
data[0] = *(uint32_t*)&on_value;
data[1] = *(uint32_t*)&off_value;
}
axis = axis == -1 ? (int32_t)inputs[0]->attr.dim_num : (int32_t)inputs[0]->attr.dim_num - axis;
for (i = 0; i < axis; i++)
{
prefix_dim_size *= inputs[0]->attr.size[i];
}
suffix_dim_size = num_elements / prefix_dim_size;
shape[0][0] = suffix_dim_size;
shape[0][1] = prefix_dim_size;
shape[1][0] = suffix_dim_size;
shape[1][1] = depth;
shape[1][2] = prefix_dim_size;
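/* Worked example (editorial, illustrative): for an I32 index tensor of size
 * {4, 3} (dim_num = 2) with depth = 5 and axis = -1, the remapping above gives
 * axis = 2, prefix_dim_size = 4 * 3 = 12 and suffix_dim_size = 1, so the input
 * is viewed as {1, 12} and the output as {1, 5, 12}: one depth-length one-hot
 * vector is produced per flattened index element. */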
rs_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], (uint32_t*)shape[0], 2 );
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], (uint32_t*)shape[1], 3 );
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[1]->attr.size,
rs_tensors[1]->attr.dim_num ) )
{
goto final;
}
status = _query_kernel( kernel, inputs, outputs );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _ONE_HOT_PARAM_NUM,
&rs_tensors[0], input_num, &rs_tensors[1], output_num );
node_params[SCALAR_INPUT_DEPTH] = vsi_nn_kernel_scalar_create(
graph, I32, &depth );
node_params[SCALAR_INPUT_ON_VALUE] = vsi_nn_kernel_scalar_create(
graph, U32, &data[0] );
node_params[SCALAR_INPUT_OFF_VALUE] = vsi_nn_kernel_scalar_create(
graph, U32, &data[1] );
node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &inputScale );
node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(
graph, F32, &inputTail );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _ONE_HOT_PARAM_NUM );
CHECK_STATUS_FAIL_GOTO( status, final );
}
}
final:
if (rs_tensors[0])
{
vsi_nn_ReleaseTensor( &rs_tensors[0] );
}
if (rs_tensors[1])
{
vsi_nn_ReleaseTensor( &rs_tensors[1] );
}
for (i = SCALAR_INPUT_DEPTH; i < _ONE_HOT_PARAM_NUM; i++)
{
if (node_params[i])
{
vsi_nn_kernel_scalar_release( &node_params[i] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( one_hot, _setup )

View File

@ -178,11 +178,19 @@ static vsi_status _query_kernel
{
in_dtype = F32;
}
else if (I16 == in_dtype && inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE)
{
in_dtype = I32;
}
if (F16 == out_dtype)
{
out_dtype = F32;
}
else if (I16 == out_dtype && outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE)
{
out_dtype = I32;
}
key = HASH_REDUCEMAX_HASH_KEY( axis, in_dtype, out_dtype, image_2d );

View File

@ -0,0 +1,407 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "repeat"
// Add kernel hashtable here
#define HASH_REPEAT_KERNEL_NAME(SRC0_TYPE, AXIS) \
CVIVANTE_NAMESPACE("cl.repeat_"#SRC0_TYPE"_axis"#AXIS)
#define HASH_REPEAT_KERNEL_1D_NAME(SRC0_TYPE) \
CVIVANTE_NAMESPACE("cl.repeat_"#SRC0_TYPE"_1D")
// Add kernel hashtable here
#define HASH_REPEAT_KEY(_input0_type, _output_type, _is1d, _axis) \
((_input0_type << 24) | (_output_type << 16) | (_is1d << 8) | _axis)
#define TENSOR_REPEAT_KERNELS(IN0_TYPE, OUT_TYPE, AXIS, SOURCE) \
{ HASH_REPEAT_KEY(IN0_TYPE, OUT_TYPE, 0, AXIS), \
HASH_REPEAT_KERNEL_NAME(IN0_TYPE, AXIS), \
SOURCE },
#define TENSOR_REPEAT_1D_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_REPEAT_KEY(IN0_TYPE, OUT_TYPE, 1, 0), \
HASH_REPEAT_KERNEL_1D_NAME(IN0_TYPE), \
SOURCE },
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _repeat_kernel_map[] =
{
// Register kernel here
TENSOR_REPEAT_KERNELS( I32, I32, 0, KERNEL_SOURCE_1 )
TENSOR_REPEAT_KERNELS( I32, I32, 1, KERNEL_SOURCE_1 )
TENSOR_REPEAT_KERNELS( I32, I32, 2, KERNEL_SOURCE_1 )
TENSOR_REPEAT_KERNELS( F32, F32, 0, KERNEL_SOURCE_1 )
TENSOR_REPEAT_KERNELS( F32, F32, 1, KERNEL_SOURCE_1 )
TENSOR_REPEAT_KERNELS( F32, F32, 2, KERNEL_SOURCE_1 )
TENSOR_REPEAT_1D_KERNELS( I32, I32, KERNEL_SOURCE_1 )
TENSOR_REPEAT_1D_KERNELS( F32, F32, KERNEL_SOURCE_1 )
};
/*
* Kernel params
*/
static vx_param_description_t _repeat_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _REPEAT_PARAM_NUM _cnt_of_array( _repeat_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_repeat_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_int_array_t * input_shape = NULL;
int32_t height = 0, width = 0, chn = 0;
int32_t is1d = 0;
int32_t axis = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &axis);
CHECK_STATUS_FAIL_GOTO(status, final );
input_shape = attr[0]->shape;
width = input_shape->data[0];
height = input_shape->data[1];
if (height == 1 && input_shape->size == 2)
{
is1d = 1;
}
chn = input_shape->size > 2 ? input_shape->data[2] : 1;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = width;
gpu_param.global_size[1] = height;
gpu_param.global_size[2] = chn;
if (is1d || axis == 1)
{
gpu_param.global_size[0] = 1;
}
else if (axis == 0)
{
gpu_param.global_size[1] = 1;
}
else if (axis == 2)
{
gpu_param.global_size[2] = 1;
}
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
} /* _repeat_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t axis
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
int32_t is1d = inputs[0]->attr.dim_num == 1 ? 1 : 0;
uint32_t key = 0;
int i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (input0_dtype == F16)
{
input0_dtype = F32;
}
if (output_dtype == F16)
{
output_dtype = F32;
}
key = HASH_REPEAT_KEY( input0_dtype, output_dtype, is1d, axis );
for( i = 0; i < _cnt_of_array(_repeat_kernel_map); i ++ )
{
if ( _repeat_kernel_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(_repeat_kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _repeat_kernel_map[i].function_name );
kernel->info.parameters = _repeat_kernel_param_def;
kernel->info.numParams = _REPEAT_PARAM_NUM;
kernel->info.initialize = _repeat_initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
_repeat_kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
_repeat_kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static int32_t _optimize_repeat_shape
(
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
int32_t* axis,
int32_t* opt_shape_in,
int32_t* opt_shape_out,
int32_t* new_rank
)
{
vsi_status status = VSI_SUCCESS;
if (inputs[0]->attr.dim_num == 1)
{
opt_shape_in[0] = inputs[0]->attr.size[0];
opt_shape_in[1] = 1;
opt_shape_out[0] = outputs[0]->attr.size[0];
opt_shape_out[1] = 1;
new_rank[0] = 2;
new_rank[1] = 2;
}
else if (axis[0] == 3)
{
vsi_nn_kernel_optimize_element_shape( (int32_t*)inputs[0]->attr.size, 3, opt_shape_in, new_rank );
if (opt_shape_in[1] == 1)
{
opt_shape_in[1] = inputs[0]->attr.size[3];
opt_shape_out[0] = opt_shape_in[0];
opt_shape_out[1] = outputs[0]->attr.size[3];
axis[0] = 0;
new_rank[0] = 2;
new_rank[1] = 2;
}
else if (new_rank[0] == 2)
{
opt_shape_in[2] = inputs[0]->attr.size[3];
opt_shape_out[0] = opt_shape_in[0];
opt_shape_out[1] = opt_shape_in[1];
opt_shape_out[2] = outputs[0]->attr.size[3];
axis[0] = 2;
new_rank[0] = 3;
new_rank[1] = 3;
}
else
{
status = VSI_FAILURE;
}
}
return status;
}
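/* Worked example (editorial, illustrative; assumes the element-shape optimizer
 * flattens contiguous dims): an input of size {6, 1, 1, 9} with axis = 3 would
 * collapse {6, 1, 1} to {6, 1}; since opt_shape_in[1] == 1 the shapes become
 * input {6, 9} and output {6, outputs[0]->attr.size[3]}, and the repeat axis
 * is remapped to 0 over a plain 2D tensor. */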
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_REPEAT_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
vsi_nn_kernel_tensor_t rs_input = NULL, rs_input1 = NULL, rs_output = NULL;
int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }};
int32_t new_rank[2] = {0, 0};
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
int32_t width = inputs[0]->attr.size[0];
int32_t height = inputs[0]->attr.dim_num > 1 ? inputs[0]->attr.size[1] : 1;
int32_t channel = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1;
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
if (axis > 2 || outputs[0]->attr.dim_num == 1)
{
status = _optimize_repeat_shape(inputs, outputs, &axis, new_shape[0], new_shape[1], new_rank);
if ( VSI_SUCCESS != status )
{
goto final;
}
rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], new_rank[0]);
rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], new_rank[1]);
width = new_shape[0][0];
height = new_shape[0][1];
channel = new_rank[0] > 2 ? new_shape[0][2]: 1;
}
if (inputs[1]->attr.dim_num == 1)
{
new_shape[0][0] = inputs[1]->attr.size[0];
new_shape[0][1] = 1;
rs_input1 = vsi_nn_kernel_tensor_reshape(inputs[1]->t, new_shape[0], 2);
}
status = _query_kernel( kernel, inputs, outputs, axis );
if ( VSI_SUCCESS != status )
{
goto final;
}
node = vsi_nn_kernel_create_node( graph, kernel );
if (node)
{
uint32_t index = 0;
if (rs_input)
{
node_params[index++] = rs_input;
}
else
{
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t;
}
if (rs_input1)
{
node_params[index++] = rs_input1;
}
else
{
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t;
}
if (rs_output)
{
node_params[index++] = rs_output;
}
else
{
node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t;
}
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &channel );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params,
_REPEAT_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
}
final:
if (rs_input)
{
vsi_nn_kernel_tensor_release( &rs_input );
}
if (rs_input1)
{
vsi_nn_kernel_tensor_release( &rs_input1 );
}
if (rs_output)
{
vsi_nn_kernel_tensor_release( &rs_output );
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( repeat, _setup )

View File

@ -0,0 +1,354 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "math.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "sequence_mask"
#define HASH_SEQUENCE_MASK_KEY(_input0_type, _output_type, _image_2d) \
((_input0_type << 24) | (_output_type << 8) | (_image_2d))
#define HASH_SEQUENCE_MASK_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl.sequence_mask_"#SRC0_TYPE"to"#DST_TYPE)
#define HASH_SEQUENCE_MASK_SH_2DKERNEL_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("cl.sequence_mask_"#SRC0_TYPE"to"#DST_TYPE"_2D")
#define TENSOR_SEQUENCE_MASK_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_SEQUENCE_MASK_KEY(IN0_TYPE, OUT_TYPE, 0), \
HASH_SEQUENCE_MASK_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
#define TENSOR_SEQUENCE_MASK_2D_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_SEQUENCE_MASK_KEY(IN0_TYPE, OUT_TYPE, 1), \
HASH_SEQUENCE_MASK_SH_2DKERNEL_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
const char* source_name;
} kernel_map[] =
{
TENSOR_SEQUENCE_MASK_KERNELS(I32, U8, KERNEL_SOURCE_1)
TENSOR_SEQUENCE_MASK_KERNELS(I32, I32, KERNEL_SOURCE_1)
TENSOR_SEQUENCE_MASK_KERNELS(I32, F32, KERNEL_SOURCE_1)
TENSOR_SEQUENCE_MASK_2D_KERNELS(I32, U8, KERNEL_SOURCE_1)
TENSOR_SEQUENCE_MASK_2D_KERNELS(I32, I32, KERNEL_SOURCE_1)
TENSOR_SEQUENCE_MASK_2D_KERNELS(I32, F32, KERNEL_SOURCE_1)
};
/*
* Kernel params
*/
static vx_param_description_t kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_sequence_mask_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
vsi_int_array_t * out_shape = NULL;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
out_shape = attr[0]->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1];
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
} /* _sequence_mask_initializer() */
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel,
int32_t is2Dflg
)
{
vsi_nn_kernel_dtype_e input0_dtype = I32;
vsi_nn_kernel_dtype_e output_dtype = U8;
vsi_status status = VSI_FAILURE;
uint32_t key = 0;
int i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (output_dtype == BOOL8)
{
output_dtype= U8;
}
key = HASH_SEQUENCE_MASK_KEY( input0_dtype, output_dtype, is2Dflg );
for( i = 0; i < _cnt_of_array(kernel_map); i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = kernel_param_def;
kernel->info.numParams = _cnt_of_array( kernel_param_def );
kernel->info.initialize = _sequence_mask_initializer;
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static int32_t _optimize_mask_shape
(
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
int32_t max_len,
int32_t* opt_shape_in,
int32_t* opt_shape_out,
int32_t* is2Dflg
)
{
vsi_status status = VSI_SUCCESS;
int32_t in_shape[VSI_NN_MAX_DIM_NUM] = {0};
int32_t new_rank = 0;
uint32_t i = 0;
for(i = 0; i < inputs[0]->attr.dim_num; i++)
{
in_shape[i] = inputs[0]->attr.size[i];
}
vsi_nn_kernel_optimize_element_shape( in_shape, inputs[0]->attr.dim_num, opt_shape_in, &new_rank );
if (new_rank > 2)
{
return VSI_FAILURE;
}
opt_shape_out[0] = max_len;
for(i = 0; i < (uint32_t)new_rank; i++)
{
opt_shape_out[i + 1] = opt_shape_in[i];
}
if (opt_shape_out[2] == 1)
{
is2Dflg[0] = 1;
}
return status;
}
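/*
 * Note: sequence_mask appears to follow the usual semantics (as in
 * TensorFlow's tf.sequence_mask): for each length value L in the input, the
 * output row holds max_len elements that are 1 for positions < L and 0
 * otherwise, produced in the output's quantized domain (see outputVal1 and
 * output_zp in _setup below). The shape folding above makes max_len the
 * fastest-changing output dimension and flattens the lengths to at most
 * rank 2.
 */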
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL};
vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }};
int32_t max_len = vsi_nn_kernel_param_get_int32( params, "max_len" );
vsi_nn_kernel_node_t node = NULL;
int32_t is2Dflg = 0;
float input_zp = 0;
float input_scale = 1.0f;
int32_t output_zp = 0;
float output_scale = 1.0f;
float input_zpScale = 0;
float outputVal1 = 1.0f;
int32_t input_fl = 0;
int32_t output_fl = 0;
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _optimize_mask_shape(inputs, outputs, max_len, new_shape[0], new_shape[1], &is2Dflg);
if ( VSI_SUCCESS != status )
{
goto final;
}
rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], 2);
rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], 4);
if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
{
input_zp = (float)inputs[0]->attr.dtype.zero_point;
input_scale = inputs[0]->attr.dtype.scale;
}
else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
input_fl = inputs[0]->attr.dtype.fl;
if (input_fl > 0)
{
input_scale = (1.0f / ((float) ((int64_t)1 << input_fl)));
}
else
{
input_scale = ((float) ((int64_t)1 << -input_fl));
}
input_zp = 0.0f;
}
if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
{
output_zp = outputs[0]->attr.dtype.zero_point;
output_scale = 1.0f / outputs[0]->attr.dtype.scale;
}
else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP)
{
output_fl = outputs[0]->attr.dtype.fl;
if (output_fl > 0)
{
output_scale = (float)((int64_t)1 << output_fl);
}
else
{
output_scale = (1.0f / (float)((int64_t)1 << -output_fl));
}
output_zp = 0;
}
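/*
 * Example: for dynamic fixed point data the scale is a power of two derived
 * from the fractional length, real = q * 2^-fl. With fl = 7 the input scale
 * becomes 1/128, and with fl = -2 it becomes 4; the output side keeps the
 * reciprocal so the kernel can multiply instead of divide.
 */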
input_zpScale = input_scale * input_zp;
outputVal1 = output_scale + (float)output_zp;
status = _query_kernel( inputs, outputs, kernel, is2Dflg );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 0;
node_params[index++] = rs_input;
node_params[index++] = rs_output;
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &max_len );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zpScale );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputVal1 );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &output_zp );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
}
}
final:
if (rs_input)
{
vsi_nn_kernel_tensor_release( &rs_input );
}
if (rs_output)
{
vsi_nn_kernel_tensor_release( &rs_output );
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( sequence_mask, _setup )

View File

@ -0,0 +1,308 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum
{
INTERNAL_KERNEL_SLICE,
} _internal_kernel_e;
#define _SLICE_KERNEL_SOURCE "slice"
#define SLICE_SH_KERNEL_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
CVIVANTE_NAMESPACE("cl.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE)
// Add kernel hashtable here
#define SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE , _IMAGE_2D) \
(( IN1_DTYPE << 18 ) | ( IN0_DTYPE << 10 ) | ( OUT_DTYPE << 2 ) | (_IMAGE_2D))
#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \
{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 ), \
SLICE_SH_KERNEL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE }
#define SLICE_SH_KERNEL_2D_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
CVIVANTE_NAMESPACE("cl.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_2D")
#define PACK_KERNEL_MAP_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \
{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 ), \
SLICE_SH_KERNEL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _slice_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( F32, I32, F32, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP( I32, I32, I32, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP( U8, I32, U8, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_2D( F32, I32, F32, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_2D( I32, I32, I32, _SLICE_KERNEL_SOURCE ),
PACK_KERNEL_MAP_2D( U8, I32, U8, _SLICE_KERNEL_SOURCE ),
};
#define _INPUT_NUM (2)
#define _OUTPUT_NUM (1)
#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
/*
* Kernel params
*/
static vx_param_description_t _slice_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _SLICE_PARAM_NUM _cnt_of_array( _slice_kernel_param_def )
#define SCALAR_INPUT_SCALE (3)
#define SCALAR_INPUT_TAIL (4)
#define SCALAR_OUTPUT_SCALE (5)
#define SCALAR_OUTPUT_ZP (6)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_slice_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_int_array_t * out_shape = NULL;
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
out_shape = output_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3;
gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(output_attr);
return status;
} /* _slice_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
vsi_bool image_2d
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in0_dtype;
vsi_nn_kernel_dtype_e in1_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _slice_kernel_map;
size_t kernel_map_size = _cnt_of_array( _slice_kernel_map );
vx_param_description_t * param_def = _slice_kernel_param_def;
size_t param_def_size = _cnt_of_array( _slice_kernel_param_def );
vx_kernel_initialize_f initializer = _slice_initializer;
uint32_t key;
uint32_t i;
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (F16 == in0_dtype)
{
in0_dtype = F32;
}
if (F16 == out_dtype)
{
out_dtype = F32;
}
key = SLICE_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_SLICE_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
vsi_bool image_2d = FALSE;
uint32_t rank[_IO_NUM] = {0};
int32_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }};
vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL };
int32_t i = 0;
int32_t input_batch = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
int32_t output_batch = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1;
float inputScale = inputs[0]->attr.dtype.scale;
float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale;
float outputScale = outputs[0]->attr.dtype.scale;
float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f;
outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale;
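/*
 * The scalars just computed evidently drive a simple affine requantize in the
 * CL kernel: real = q_in * inputScale - inputTail (inputTail folds in
 * zero_point * scale), then q_out = real * outputScale + outputZP, where the
 * extra 0.5f added to outputZP presumably rounds to nearest when the kernel
 * truncates on conversion.
 */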
vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num,
shapes[0], &rank[0]);
vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num,
shapes[1], &rank[1]);
vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num,
shapes[2], &rank[2]);
for (i = 0; i < _INPUT_NUM; i++)
{
reshape_tensors[i] = vsi_nn_reshape_tensor( graph,
inputs[i], (uint32_t*)shapes[i], rank[i] );
}
reshape_tensors[_INPUT_NUM] = vsi_nn_reshape_tensor( graph,
outputs[0], (uint32_t*)shapes[_INPUT_NUM], rank[_INPUT_NUM] );
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size,
inputs[0]->attr.dim_num ) || input_batch != output_batch )
{
return NULL;
}
image_2d = (rank[0] < 3 || shapes[0][2] == 1);
status = _query_kernel( kernel, inputs, outputs, image_2d );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _SLICE_PARAM_NUM,
reshape_tensors, input_num, &reshape_tensors[_INPUT_NUM], output_num );
node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &inputScale );
node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(
graph, F32, &inputTail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create(
graph, F32, &outputScale );
node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create(
graph, F32, &outputZP );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _SLICE_PARAM_NUM );
VSI_ASSERT( status == VSI_SUCCESS );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( slice, _setup )

View File

@ -36,7 +36,7 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
#define _CPU_ARG_NUM (1)

View File

@ -36,7 +36,7 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@ -0,0 +1,279 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (4)
#define _OUTPUT_NUM (1)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.axis_aligned_bbox_transform")
typedef struct vsi_nn_box_encoding_corner_t
{
float x1, y1, x2, y2;
}vsi_nn_box_encoding_corner;
typedef struct vsi_nn_box_encoding_center_t
{
float w, h, x, y;
}vsi_nn_box_encoding_center;
/*
* Kernel params
*/
static vx_param_description_t _axis_aligned_bbox_transform_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
};
#define _AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM _cnt_of_array( _axis_aligned_bbox_transform_kernel_param_def )
static void _to_box_encoding_corner
(
vsi_nn_box_encoding_center* ctr,
vsi_nn_box_encoding_corner* cnr
)
{
cnr->x1 = ctr->x - ctr->w / 2;
cnr->y1 = ctr->y - ctr->h / 2;
cnr->x2 = ctr->x + ctr->w / 2;
cnr->y2 = ctr->y + ctr->h / 2;
}
static void _to_box_encoding_center
(
vsi_nn_box_encoding_corner* cnr,
vsi_nn_box_encoding_center* ctr
)
{
ctr->w = cnr->x2 - cnr->x1;
ctr->h = cnr->y2 - cnr->y1;
ctr->x = (cnr->x1 + cnr->x2) / 2;
ctr->y = (cnr->y1 + cnr->y2) / 2;
}
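/*
 * Worked example of the two helpers above: a corner box
 * (x1, y1, x2, y2) = (2, 2, 6, 10) converts to the center encoding
 * (w, h, x, y) = (4, 8, 4, 6), and converting back reproduces the corners.
 */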
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
size_t out_elements[_OUTPUT_NUM] = {0};
size_t out_bytes[_OUTPUT_NUM] = {0};
uint32_t i;
const uint32_t roiLength = 4;
const uint32_t imageLength = 2;
uint32_t numClasses = 0;
uint32_t numRois = 0;
uint32_t j;
uint32_t roiIndex;
/* prepare data */
for (i = 0; i < _INPUT_NUM; i ++)
{
input[i] = (vsi_nn_kernel_tensor_t)param[i];
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
out_bytes[i] = out_elements[i] * sizeof(float);
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
memset( f32_out_buffer[i], 0, out_bytes[i] );
}
numClasses = in_attr[1]->shape->data[0] / roiLength;
numRois = in_attr[0]->shape->data[1];
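/*
 * Expected layouts, inferred from the indexing below: input[0] holds one
 * corner-encoded roi (x1, y1, x2, y2) per row, input[1] holds numClasses
 * delta quadruples (dx, dy, dw, dh) per roi, input[2] maps each roi to its
 * batch, and input[3] stores (imageHeight, imageWidth) per batch; decoded
 * boxes are clipped to the image bounds before being written out.
 */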
for (roiIndex = 0; roiIndex < numRois; roiIndex++)
{
uint32_t batchIndex = (uint32_t)f32_in_buffer[2][roiIndex];
float imageHeight = f32_in_buffer[3][batchIndex * imageLength];
float imageWidth = f32_in_buffer[3][batchIndex * imageLength + 1];
vsi_nn_box_encoding_corner roi_cnr;
vsi_nn_box_encoding_center roiBefore;
roi_cnr.x1 = f32_in_buffer[0][roiIndex * roiLength];
roi_cnr.y1 = f32_in_buffer[0][roiIndex * roiLength + 1];
roi_cnr.x2 = f32_in_buffer[0][roiIndex * roiLength + 2];
roi_cnr.y2 = f32_in_buffer[0][roiIndex * roiLength + 3];
_to_box_encoding_center(&roi_cnr, &roiBefore);
for (j = 0; j < numClasses; j++)
{
vsi_nn_box_encoding_center roi_ctr;
vsi_nn_box_encoding_corner roiAfter;
vsi_nn_box_encoding_corner clipped;
uint32_t index = (roiIndex * numClasses + j) * roiLength;
roi_ctr.w = (float)(exp(f32_in_buffer[1][index + 2]) * roiBefore.w);
roi_ctr.h = (float)(exp(f32_in_buffer[1][index + 3]) * roiBefore.h);
roi_ctr.x = roiBefore.x + f32_in_buffer[1][index] * roiBefore.w;
roi_ctr.y = roiBefore.y + f32_in_buffer[1][index + 1] * roiBefore.h;
_to_box_encoding_corner(&roi_ctr, &roiAfter);
clipped.x1 = vsi_nn_min(vsi_nn_max(roiAfter.x1, 0.0f), imageWidth);
clipped.y1 = vsi_nn_min(vsi_nn_max(roiAfter.y1, 0.0f), imageHeight);
clipped.x2 = vsi_nn_min(vsi_nn_max(roiAfter.x2, 0.0f), imageWidth);
clipped.y2 = vsi_nn_min(vsi_nn_max(roiAfter.y2, 0.0f), imageHeight);
f32_out_buffer[0][index] = clipped.x1;
f32_out_buffer[0][index + 1] = clipped.y1;
f32_out_buffer[0][index + 2] = clipped.x2;
f32_out_buffer[0][index + 3] = clipped.y2;
}
}
/* save data */
for(i = 0; i < _OUTPUT_NUM; i++)
{
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
f32_out_buffer[i], out_elements[i] );
CHECK_STATUS_FAIL_GOTO( status, final );
}
final:
for (i = 0; i < _INPUT_NUM; i++)
{
if (f32_in_buffer[i])
{
free(f32_in_buffer[i]);
f32_in_buffer[i] = NULL;
}
if (in_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
}
}
for (i = 0; i < _OUTPUT_NUM; i++)
{
if (f32_out_buffer[i])
{
free(f32_out_buffer[i]);
f32_out_buffer[i] = NULL;
}
if (out_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
}
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _axis_aligned_bbox_transform_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _axis_aligned_bbox_transform_kernel_param_def );
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM,
inputs, input_num, outputs, output_num );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( axis_aligned_bbox_transform, _setup )

View File

@ -34,7 +34,7 @@
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS

View File

@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -164,8 +164,8 @@ DEF_KERNEL_EXECUTOR(_comparisons_exec)
buffer[2][i] = (float)data;
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
buffer[1], out_elements );
status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2],
buffer[2], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:

View File

@ -0,0 +1,264 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.conv1d_ovxlib")
/*
* Kernel params
*/
static vx_param_description_t _conv1d_ovxlib_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CONV1D_OVXLIB_PARAM_NUM _cnt_of_array( _conv1d_ovxlib_kernel_param_def )
#define _IO_COUNT (4)
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
int i = 0;
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t tensors[_IO_COUNT] = { NULL };
vsi_nn_kernel_tensor_attr_t* attr[_IO_COUNT] = { NULL };
float* buffer[_IO_COUNT] = { NULL };
int32_t stride = 0;
int32_t pad_front = 0;
int32_t pad_end = 0;
int32_t dilation = 0;
int32_t overflow_policy = 0;
int32_t rounding_policy = 0;
int32_t down_scale_size_rounding = 0;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
tensors[2] = (vsi_nn_kernel_tensor_t)param[2];
tensors[3] = (vsi_nn_kernel_tensor_t)param[3];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] );
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final );
buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final );
buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE );
buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &stride);
CHECK_STATUS_FAIL_GOTO(status, final);
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &pad_front);
CHECK_STATUS_FAIL_GOTO(status, final);
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &pad_end);
CHECK_STATUS_FAIL_GOTO(status, final);
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &dilation);
CHECK_STATUS_FAIL_GOTO(status, final);
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &overflow_policy);
CHECK_STATUS_FAIL_GOTO(status, final);
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &rounding_policy);
CHECK_STATUS_FAIL_GOTO(status, final);
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &down_scale_size_rounding);
CHECK_STATUS_FAIL_GOTO(status, final);
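/*
 * Reference float 1-D convolution. Assuming input shape
 * (length, channels, batch) and weight shape (kernel_size, in_channels,
 * out_channels), fastest dimension first as the indexing below implies:
 *   out[b][oc][oh] = bias[oc] +
 *       sum_ic sum_k in[b][ic][oh * stride + k * dilation] * w[oc][ic][k]
 * pad_front/pad_end, overflow_policy, rounding_policy and
 * down_scale_size_rounding are only read here and not applied by this loop,
 * so it presumes padding has already been accounted for.
 */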
{
int32_t batch = attr[0]->shape->data[2];
int32_t input_channel = attr[0]->shape->data[1];
int32_t input_height = attr[0]->shape->data[0];
int32_t kernel_size = attr[1]->shape->data[0];
int32_t output_channel = attr[1]->shape->data[2];
int32_t output_height = attr[3]->shape->data[0];
int32_t batch_index = 0;
int32_t input_channel_index = 0;
int32_t output_channel_index = 0;
int32_t output_h_index = 0;
for(batch_index = 0; batch_index < batch; batch_index++)
{
float* per_batch_input = buffer[0] + batch_index * input_channel * input_height;
float* per_batch_output = buffer[3] + batch_index * output_channel * output_height;
for(output_channel_index = 0; output_channel_index < output_channel; output_channel_index++)
{
float* filter = buffer[1] + output_channel_index * input_channel * kernel_size;
for(output_h_index = 0; output_h_index < output_height; output_h_index++)
{
float output_value = 0.;
float* current_value_ptr = per_batch_input + output_h_index * stride;
for(input_channel_index = 0; input_channel_index < input_channel; input_channel_index++)
{
int k = 0;
int32_t index = 0;
for(k = 0; k < kernel_size; k++)
{
float w = *(filter + input_channel_index * kernel_size + k);
float v = *(current_value_ptr + input_channel_index * input_height + index);
output_value += w * v;
index += dilation;
}
}
if(buffer[2])
{
output_value += buffer[2][output_channel_index];
}
*(per_batch_output + output_channel_index * output_height + output_h_index) = output_value;
}
}
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3],
buffer[3], batch * output_channel * output_height );
CHECK_STATUS_FAIL_GOTO( status, final );
}
final:
for( i = 0; i < _IO_COUNT; i ++ )
{
if( buffer[i] )
{
free( buffer[i] );
}
vsi_nn_kernel_tensor_attr_release( &attr[i] );
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _conv1d_ovxlib_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _conv1d_ovxlib_kernel_param_def );
return VSI_SUCCESS;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CONV1D_OVXLIB_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
int j = 0;
int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" );
int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" );
int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" );
int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" );
int32_t overflow_policy = vsi_nn_kernel_param_get_int32( params, "overflow_policy" );
int32_t rounding_policy = vsi_nn_kernel_param_get_int32( params, "rounding_policy" );
int32_t down_scale_size_rounding = vsi_nn_kernel_param_get_int32( params, "down_scale_size_rounding" );
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _CONV1D_OVXLIB_PARAM_NUM,
inputs, input_num, outputs, output_num );
j = (int)(input_num + output_num);
node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &stride );
node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &pad_front );
node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &pad_end );
node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &dilation );
node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &overflow_policy );
node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &rounding_policy );
node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &down_scale_size_rounding );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CONV1D_OVXLIB_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[--j] );
vsi_nn_kernel_scalar_release( &node_params[--j] );
vsi_nn_kernel_scalar_release( &node_params[--j] );
vsi_nn_kernel_scalar_release( &node_params[--j] );
vsi_nn_kernel_scalar_release( &node_params[--j] );
vsi_nn_kernel_scalar_release( &node_params[--j] );
vsi_nn_kernel_scalar_release( &node_params[--j] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( conv1d_ovxlib, _setup )

View File

@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@ -46,6 +46,7 @@ typedef enum
UNARY_NEG,
UNARY_HSIGMOID,
UNARY_MISH,
UNARY_ROUND,
} unary_type_e;
@ -101,6 +102,13 @@ static float mish_eval(float data)
return data;
}
static float round_eval(float data)
{
data = (float)(vsi_rtne(data));
return data;
}
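/* vsi_rtne appears to round half to even, e.g. 2.5f -> 2.0f and 3.5f -> 4.0f. */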
DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
(
vsi_nn_kernel_node_t node,
@ -165,6 +173,9 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
case UNARY_MISH:
data = mish_eval(data);
break;
case UNARY_ROUND:
data = round_eval(data);
break;
default:
break;
}
@ -298,3 +309,4 @@ REGISTER_ELTWISE_UNARY_BACKEND_CPU( elu, UNARY_ELU )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( neg, UNARY_NEG )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_sigmoid, UNARY_HSIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( mish, UNARY_MISH )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( round, UNARY_ROUND )

View File

@ -0,0 +1,229 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.erf")
/*
* Kernel params
*/
static vx_param_description_t _erf_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _ERF_PARAM_NUM _cnt_of_array( _erf_kernel_param_def )
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
size_t out_elements[_OUTPUT_NUM] = {0};
size_t out_bytes[_OUTPUT_NUM] = {0};
size_t i = 0;
/* prepare data */
for (i = 0; i < _INPUT_NUM; i ++)
{
input[i] = (vsi_nn_kernel_tensor_t)param[i];
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
out_bytes[i] = out_elements[i] * sizeof(float);
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
memset( f32_out_buffer[i], 0, out_bytes[i] );
}
#define ERF_PI 3.141592653589793
for (i = 0; i < out_elements[0]; i ++)
{
/* erf(x) = 2 / sqrt(pi) * sum_{n>=0} [ (-1)^n * x^(2n + 1) / (n! * (2n + 1)) ] */
float x = f32_in_buffer[0][i];
float res = 0;
float tmp = x;
float factorial = 1; /*n!*/
float x_pow = x;
int32_t one = 1;
int32_t n = 1;
while (vsi_abs(tmp) > 1e-5)
{
res += tmp;
factorial *= n;
one *= -1;
x_pow *= x * x;
tmp = one / factorial * x_pow / ( 2 * n + 1);
n ++;
}
res *= 2.0f / (float)sqrt(ERF_PI);
f32_out_buffer[0][i] = res;
}
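/*
 * The loop above sums the Maclaurin series term by term until the next term
 * drops below 1e-5; e.g. erf(0.5) evaluates to about 0.5205 and erf(1.0) to
 * about 0.8427. Convergence slows for large |x|, where erf saturates towards
 * +/-1 anyway.
 */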
/* save data */
for(i = 0; i < _OUTPUT_NUM; i++)
{
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
f32_out_buffer[i], out_elements[i] );
CHECK_STATUS_FAIL_GOTO( status, final );
}
final:
for (i = 0; i < _INPUT_NUM; i++)
{
if (f32_in_buffer[i])
{
free(f32_in_buffer[i]);
f32_in_buffer[i] = NULL;
}
if (in_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
}
}
for (i = 0; i < _OUTPUT_NUM; i++)
{
if (f32_out_buffer[i])
{
free(f32_out_buffer[i]);
f32_out_buffer[i] = NULL;
}
if (out_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
}
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _erf_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _erf_kernel_param_def );
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_ERF_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
status = _query_kernel( kernel, inputs, outputs);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _ERF_PARAM_NUM,
inputs, input_num, outputs, output_num );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _ERF_PARAM_NUM );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( erf, _setup )

View File

@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@ -0,0 +1,315 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _CPU_ARG_NUM (2)
#define _CPU_INPUT_NUM (3)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.group_norm")
DEF_KERNEL_EXECUTOR(_group_norm_exec)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
float * buffer[_CPU_IO_NUM] = { NULL };
size_t out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
uint32_t i = 0;
int32_t spaceOrg = 0;
float eps = .0f;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
tensors[2] = (vsi_nn_kernel_tensor_t)param[2];
tensors[3] = (vsi_nn_kernel_tensor_t)param[3];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] );
CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] );
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &eps);
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &spaceOrg);
CHECK_STATUS_FAIL_GOTO(status, final );
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final );
buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[2], "Create input2 buffer fail.", final );
buffer[3] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final );
memset( buffer[3], 0, out_elements * sizeof(float) );
{
uint32_t b = 0, c = 0;
uint32_t height = attr[0]->shape->data[1];
uint32_t width = attr[0]->shape->data[0];
uint32_t ch = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1;
uint32_t bh = attr[0]->shape->size > 3 ? attr[0]->shape->data[3] : 1;
uint32_t spatial = height * width;
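/*
 * With the folded view this kernel receives (W, H and group_size collapsed
 * into the first two dims), `spatial` counts every element of one group, so
 * the loops below implement plain group normalization per (batch, group):
 *   y = (x - mean) / sqrt(var + eps) * scale + bias
 * where mean and var are taken over the group and scale/bias are looked up
 * per original channel via spaceOrg.
 */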
for (b = 0; b < bh; b++)
{
for (c = 0; c < ch; c++)
{
uint32_t page = c * spatial + b * (spatial * ch);
uint32_t paraIdx = c * attr[1]->shape->data[0];
float sum = .0f;
float sumsq = .0f;
float mean = .0f;
float vari = .0f;
float data = 0;
for (i = 0; i < spatial; i++)
{
uint32_t index = page + i;
sum += buffer[0][index];
}
mean = sum / spatial;
for (i = 0; i < spatial; i++)
{
uint32_t index = page + i;
data = buffer[0][index] - mean;
sumsq += data * data;
}
vari = sumsq / spatial;
vari = (float)(1.0 / sqrtf(vari + eps));
for (i = 0; i < spatial; i++)
{
float normVal = 0;
uint32_t index = page + i;
uint32_t tmpIdx = paraIdx + i / spaceOrg;
float scaleVal = buffer[2][tmpIdx];
float biasVal = buffer[1][tmpIdx];
data = buffer[0][index] - mean;
normVal = data * vari * scaleVal + biasVal;
buffer[3][index] = normVal;
}
}
}
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3],
buffer[3], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
if ( buffer[i] )
{
free( buffer[i] );
}
}
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
}
return status;
} /* _group_norm_exec() */
/*
* Kernel params
*/
static vx_param_description_t _group_normalization_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _GROUP_NORMALIZATION_PARAM_NUM _cnt_of_array( _group_normalization_kernel_param_def )
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_group_norm_exec,
_group_normalization_kernel_param_def,
_cnt_of_array( _group_normalization_kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
return VSI_SUCCESS;
} /* _query_kernel() */
static int32_t _optimize_gn_shape_cpu
(
vsi_nn_tensor_t ** inputs,
int32_t group_size,
int32_t group_num,
int32_t* opt_shape
)
{
vsi_status status = VSI_SUCCESS;
int32_t group_shape[VSI_NN_MAX_DIM_NUM] = {0};
int32_t new_rank = 0;
group_shape[0] = inputs[0]->attr.size[0];
group_shape[1] = inputs[0]->attr.size[1];
group_shape[2] = group_size;
vsi_nn_kernel_optimize_element_shape(group_shape, 3, opt_shape, &new_rank );
if (new_rank == 2)
{
opt_shape[2] = group_num;
opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
}
else
{
status = VSI_FAILURE;
}
return status;
}
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
int32_t new_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 };
int32_t group_num = vsi_nn_kernel_param_get_int32( params, "group_num" );
int32_t group_size = inputs[0]->attr.size[2] / group_num;
int32_t spaceOrg = inputs[0]->attr.size[0] * inputs[0]->attr.size[1];
status = _optimize_gn_shape_cpu(inputs, group_size, group_num, new_shape);
if ( VSI_SUCCESS != status )
{
goto final;
}
rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4);
rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4);
status = _query_kernel( inputs, outputs, kernel );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
uint32_t index = 0;
/* Set inputs and outputs */
backend_params[index++] = rs_input;
backend_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t;
backend_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t;
backend_params[index++] = rs_output;
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps );
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &spaceOrg );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
CHECK_STATUS( status );
vsi_nn_kernel_scalar_release( &backend_params[4] );
vsi_nn_kernel_scalar_release( &backend_params[5] );
}
else
{
status = VSI_FAILURE;
}
}
final:
if (rs_input)
{
vsi_nn_kernel_tensor_release( &rs_input );
}
if (rs_output)
{
vsi_nn_kernel_tensor_release( &rs_output );
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( group_norm, _setup )

View File

@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
@@ -143,8 +143,8 @@ DEF_KERNEL_EXECUTOR(_layer_norm_exec)
{
int idx = (outer * axisSize + i) * innerSize + inner;
float data = buffer[0][idx] - mean;
float scaleVal = buffer[2][idx];
float biasVal = buffer[1][idx];
float scaleVal = buffer[2][i];
float biasVal = buffer[1][i];
float normVal = data * vari * scaleVal + biasVal;
buffer[3][idx] = normVal;
}

View File

@@ -36,7 +36,7 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
#define _CPU_ARG_NUM (2)

View File

@@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@@ -36,7 +36,7 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@@ -0,0 +1,441 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (2)
#define _OUTPUT_NUM (3)
#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.nms")
/*
* Kernel params
*/
static vx_param_description_t _nms_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define SCALAR_INPUT_MAX_SIZE (5)
#define SCALAR_INPUT_IOU_THRES (6)
#define SCALAR_INPUT_SCORE_THRES (7)
#define SCALAR_INPUT_SOFT_NMS_SIGMA (8)
#define _NMS_PARAM_NUM _cnt_of_array( _nms_kernel_param_def )
typedef struct Candidate_s
{
int index;
float score;
int suppress_begin_index;
}Candidate;
static void _swap_element
(
Candidate* list,
uint32_t first,
uint32_t second
)
{
Candidate temp;
memcpy(&temp, &list[first], sizeof(Candidate));
memcpy(&list[first], &list[second], sizeof(Candidate));
memcpy(&list[second], &temp, sizeof(Candidate));
}
static uint32_t _max_element
(
Candidate* list,
uint32_t len
)
{
uint32_t i;
uint32_t max_index = 0;
float max_val = list[0].score;
for ( i = 1; i < len; i++ )
{
float val = list[i].score;
if ( max_val < val )
{
max_val = val;
max_index = i;
}
}
return max_index;
}
typedef struct box_corner_encoding_s
{
float y1;
float x1;
float y2;
float x2;
}box_corner_encoding;
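/* Plain IoU: intersection area over the union of the two corner boxes; the min/max calls handle unordered corners, and degenerate boxes return 0. */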
static float _computeIntersectionOverUnion
(
const float* boxes,
const int32_t i,
const int32_t j
)
{
box_corner_encoding box_i = ((box_corner_encoding *)boxes)[i];
box_corner_encoding box_j = ((box_corner_encoding *)boxes)[j];
const float box_i_y_min = vsi_nn_min(box_i.y1, box_i.y2);
const float box_i_y_max = vsi_nn_max(box_i.y1, box_i.y2);
const float box_i_x_min = vsi_nn_min(box_i.x1, box_i.x2);
const float box_i_x_max = vsi_nn_max(box_i.x1, box_i.x2);
const float box_j_y_min = vsi_nn_min(box_j.y1, box_j.y2);
const float box_j_y_max = vsi_nn_max(box_j.y1, box_j.y2);
const float box_j_x_min = vsi_nn_min(box_j.x1, box_j.x2);
const float box_j_x_max = vsi_nn_max(box_j.x1, box_j.x2);
const float area_i =
(box_i_y_max - box_i_y_min) * (box_i_x_max - box_i_x_min);
const float area_j =
(box_j_y_max - box_j_y_min) * (box_j_x_max - box_j_x_min);
const float intersection_ymax = vsi_nn_min(box_i_y_max, box_j_y_max);
const float intersection_xmax = vsi_nn_min(box_i_x_max, box_j_x_max);
const float intersection_ymin = vsi_nn_max(box_i_y_min, box_j_y_min);
const float intersection_xmin = vsi_nn_max(box_i_x_min, box_j_x_min);
const float intersection_area =
vsi_nn_max(intersection_ymax - intersection_ymin, 0.0f) *
vsi_nn_max(intersection_xmax - intersection_xmin, 0.0f);
if (area_i <= 0 || area_j <= 0)
{
return 0.0f;
}
return intersection_area / (area_i + area_j - intersection_area);
}
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VX_SUCCESS;
vsi_nn_kernel_tensor_t tensors[_INPUT_NUM] = { NULL };
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float * buffer[_INPUT_NUM] = { NULL };
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
size_t stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}};
size_t out_elements[_OUTPUT_NUM] = {0};
vsi_nn_kernel_tensor_attr_t * attr[_INPUT_NUM] = { NULL };
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL};
int32_t i = 0;
int32_t num_boxes = 0;
float* boxes = NULL;
float* scores = NULL;
float* selected_indices = NULL;
float* selected_scores = NULL;
float* num_selected_indices = NULL;
Candidate * candidate = NULL;
int32_t select_size = 0;
int32_t max_output_size = 0;
int32_t select_start = 0;
int32_t select_len = 0;
float iou_threshold = 0.f;
float score_threshold = 0.f;
float soft_nms_sigma = 0.f;
float scale = 0;
int32_t num_outputs = 0;
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_MAX_SIZE],
&max_output_size);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_IOU_THRES],
&iou_threshold);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_SCORE_THRES],
&score_threshold);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_SOFT_NMS_SIGMA],
&soft_nms_sigma);
CHECK_STATUS_FAIL_GOTO(status, final );
for ( i = 0; i < _INPUT_NUM; i++)
{
tensors[i] = (vsi_nn_kernel_tensor_t)param[i];
attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] );
vsi_nn_kernel_tensor_attr_get_stride( attr[i], stride_size[i] );
buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[i], "Create input buffer fail.", final );
}
for ( i = 0; i < _OUTPUT_NUM; i++)
{
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
f32_out_buffer[i] = (float *)malloc( out_elements[i] * sizeof(float) );
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
memset( f32_out_buffer[i], 0, out_elements[i] * sizeof(float) );
}
num_boxes = attr[0]->shape->data[1];
boxes = buffer[0];
scores = buffer[1];
selected_indices = f32_out_buffer[0];
selected_scores = f32_out_buffer[1];
num_selected_indices = f32_out_buffer[2];
candidate = (Candidate*)malloc(num_boxes * sizeof(Candidate));
CHECK_PTR_FAIL_GOTO( candidate, "Create select buffer fail.", final );
memset(candidate, 0, num_boxes * sizeof(Candidate));
for (i = 0; i < num_boxes; ++i)
{
if (scores[i] > score_threshold)
{
candidate[select_size].index = i;
candidate[select_size].score = scores[i];
candidate[select_size].suppress_begin_index = 0;
select_size++;
}
}
num_outputs = vsi_nn_min(select_size, max_output_size);
if (num_outputs == 0)
{
num_selected_indices[0] = 0;
}
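// Gaussian soft-NMS: with sigma > 0, overlapping candidates are re-weighted by exp(-iou^2 / (2 * sigma)) instead of being hard-suppressed.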
if (soft_nms_sigma > 0.0f)
{
scale = -0.5f / soft_nms_sigma;
}
select_len = 0;
while (select_len < num_outputs && select_start < select_size)
{
int32_t j = 0;
float original_score = 0;
vsi_bool should_hard_suppress = FALSE;
// find max score and swap to the front.
int32_t max_index = _max_element( &candidate[select_start], select_size - select_start);
if (max_index != 0)
{
_swap_element(&(candidate[select_start]), max_index, 0);
}
original_score = candidate[select_start].score;
// Check IoU against the previously selected boxes; hard-suppress (discard) the candidate if needed.
for ( j = select_len - 1; j >= candidate[select_start].suppress_begin_index; j-- )
{
int32_t idx = (int32_t)selected_indices[j];
float iou = _computeIntersectionOverUnion(boxes, candidate[select_start].index, idx);
// First decide whether to perform hard suppression.
if (iou >= iou_threshold)
{
should_hard_suppress = TRUE;
break;
}
// Suppress score if NMS sigma > 0.
if (soft_nms_sigma > 0.0)
{
candidate[select_start].score =
candidate[select_start].score * (float)exp(scale * iou * iou);
}
if (candidate[select_start].score <= score_threshold)
break;
}
candidate[select_start].suppress_begin_index = select_len;
if (!should_hard_suppress)
{
if (candidate[select_start].score == original_score)
{
// Suppression has not occurred, so select next_candidate.
selected_indices[select_len] = (float)candidate[select_start].index;
selected_scores[select_len] = candidate[select_start].score;
++ select_len;
}
if ( candidate[select_start].score > score_threshold)
{
// Soft suppression might have occurred and current score is still
// greater than score_threshold; add next_candidate back onto priority
// queue.
candidate[select_start].suppress_begin_index = select_len;
}
}
select_start ++;
}
num_selected_indices[0] = (float)select_len;
for ( i = select_len; i < max_output_size; i++)
{
selected_indices[i] = 0;
selected_scores[i] = 0;
}
/* save data */
for ( i = 0; i < _OUTPUT_NUM; i++ )
{
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
f32_out_buffer[i], out_elements[i] );
CHECK_STATUS_FAIL_GOTO( status, final );
}
final:
vsi_nn_safe_free(candidate);
for( i = 0; i < _INPUT_NUM; i ++ )
{
if ( buffer[i] )
{
free( buffer[i] );
}
vsi_nn_kernel_tensor_attr_release( &attr[i] );
}
for ( i = 0; i < _OUTPUT_NUM; i++ )
{
if (f32_out_buffer[i])
{
free(f32_out_buffer[i]);
f32_out_buffer[i] = NULL;
}
if (out_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
}
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _nms_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _nms_kernel_param_def );
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_NMS_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
int32_t max_output_size = vsi_nn_kernel_param_get_int32(params, "max_output_size");
float iou_threshold = vsi_nn_kernel_param_get_float32(params, "iou_threshold");
float score_threshold = vsi_nn_kernel_param_get_float32(params, "score_threshold");
float soft_nms_sigma = vsi_nn_kernel_param_get_float32(params, "soft_nms_sigma");
status = _query_kernel( kernel, inputs, outputs );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _NMS_PARAM_NUM,
inputs, input_num, outputs, output_num );
/* Pass parameters to node. */
node_params[SCALAR_INPUT_MAX_SIZE] = vsi_nn_kernel_scalar_create(
graph, I32, &max_output_size );
node_params[SCALAR_INPUT_IOU_THRES] = vsi_nn_kernel_scalar_create(
graph, F32, &iou_threshold );
node_params[SCALAR_INPUT_SCORE_THRES] = vsi_nn_kernel_scalar_create(
graph, F32, &score_threshold );
node_params[SCALAR_INPUT_SOFT_NMS_SIGMA] = vsi_nn_kernel_scalar_create(
graph, F32, &soft_nms_sigma );
status = vsi_nn_kernel_node_pass_param( node, node_params, _NMS_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MAX_SIZE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_IOU_THRES] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCORE_THRES] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SOFT_NMS_SIGMA] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( nms, _setup )

View File

@@ -0,0 +1,252 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.one_hot")
/*
* Kernel params
*/
static vx_param_description_t _one_hot_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define INPUT_SCALAR_DEPTH (2)
#define INPUT_SCALAR_ON_VALUE (3)
#define INPUT_SCALAR_OFF_VALUE (4)
#define INPUT_SCALAR_AXIS (5)
#define _ONE_HOT_PARAM_NUM _cnt_of_array( _one_hot_kernel_param_def )
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t tensors[_IO_NUM] = { NULL };
float * buffer[_IO_NUM] = { NULL };
size_t out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_IO_NUM] = { NULL };
int32_t i = 0;
int32_t j = 0;
int32_t k = 0;
int32_t index = 0;
int32_t depth = 0;
float on_value = 0;
float off_value = 0;
int32_t axis = 0;
int32_t prefix_dim_size = 1;
int32_t suffix_dim_size = 0;
int32_t num_elements = 0;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &depth);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[3], &on_value);
status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &off_value);
status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis);
CHECK_STATUS_FAIL_GOTO(status, final );
num_elements = (int32_t)vsi_nn_kernel_tensor_attr_get_size( attr[0] );
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
buffer[1] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final );
memset( buffer[1], 0, out_elements * sizeof(float) );
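// Shapes are stored innermost-first, so translate the caller's axis (-1 selects the innermost dimension) before splitting the input into prefix/suffix blocks.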
axis = axis == -1 ? (int32_t)attr[0]->shape->size : (int32_t)attr[0]->shape->size - axis;
for (i = 0; i < axis; i++)
{
prefix_dim_size *= attr[0]->shape->data[i];
}
suffix_dim_size = num_elements / prefix_dim_size;
for (i = 0; i < prefix_dim_size; i++)
{
for (j = 0; j < depth; j++)
{
for (k = 0; k < suffix_dim_size; k++)
{
int32_t value = (int32_t)buffer[0][i * suffix_dim_size + k];
buffer[1][index ++] = value == j ? on_value : off_value;
}
}
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
buffer[1], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(attr[0]);
SAFE_FREE_TENSOR_ATTR(attr[1]);
#undef SAFE_FREE_TENSOR_ATTR
for ( i = 0; i < _IO_NUM; i ++ )
{
if ( buffer[i] )
{
free( buffer[i] );
buffer[i] = NULL;
}
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _one_hot_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _one_hot_kernel_param_def );
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_ONE_HOT_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t depth = vsi_nn_kernel_param_get_int32( params, "depth" );
float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" );
float off_value = vsi_nn_kernel_param_get_float32( params, "off_value" );
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _ONE_HOT_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[INPUT_SCALAR_DEPTH] = vsi_nn_kernel_scalar_create(
graph, I32, &depth );
node_params[INPUT_SCALAR_ON_VALUE] = vsi_nn_kernel_scalar_create(
graph, F32, &on_value );
node_params[INPUT_SCALAR_OFF_VALUE] = vsi_nn_kernel_scalar_create(
graph, F32, &off_value );
node_params[INPUT_SCALAR_AXIS] = vsi_nn_kernel_scalar_create(
graph, I32, &axis );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _ONE_HOT_PARAM_NUM );
CHECK_STATUS_FAIL_GOTO( status, OnError );
}
}
OnError:
if (node_params[INPUT_SCALAR_DEPTH])
{
vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_DEPTH] );
}
if (node_params[INPUT_SCALAR_ON_VALUE])
{
vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_ON_VALUE] );
}
if (node_params[INPUT_SCALAR_OFF_VALUE])
{
vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_OFF_VALUE] );
}
if (node_params[INPUT_SCALAR_AXIS])
{
vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_AXIS] );
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( one_hot, _setup )

View File

@@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@@ -38,7 +38,7 @@
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@@ -0,0 +1,286 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _CPU_ARG_NUM (1)
#define _CPU_INPUT_NUM (2)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.repeat")
DEF_KERNEL_EXECUTOR(_repeat_exec)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
float * buffer[_CPU_IO_NUM] = { NULL };
size_t out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
int32_t i = 0, j = 0, b = 0, c = 0;
int32_t axis = 0;
int32_t outerSize = 1;
int32_t outIdx = 0;
int32_t width = 0, height = 0, channel = 0, batch = 0;
int32_t spatial = 0, vol = 0;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
tensors[2] = (vsi_nn_kernel_tensor_t)param[2];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis);
CHECK_STATUS_FAIL_GOTO(status, final );
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final );
buffer[2] = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final );
memset( buffer[2], 0, out_elements * sizeof(float) );
width = attr[0]->shape->data[0];
height = attr[0]->shape->data[1];
channel = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1;
batch = attr[0]->shape->size > 3 ? attr[0]->shape->data[3] : 1;
spatial = width * height;
vol = spatial * channel;
for(i = 1; i < (int32_t)attr[0]->shape->size; i++)
{
outerSize *= attr[0]->shape->data[i];
}
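/* The per-element repeat counts come from input[1]: indexed by row for axis 0 (or by element for a 1-D input), by column for axis 1, and by channel for axis 2. */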
if (axis == 0 && outerSize == 1)
{
for(i = 0; i < width; i++)
{
float data = buffer[0][i];
int32_t len = (int32_t)buffer[1][i];
for(j = 0; j < len; j++)
{
buffer[2][outIdx++] = data;
}
}
}
else if (axis == 0)
{
for(b = 0; b < batch; b++)
{
for(c = 0; c < channel; c++)
{
for(i = 0; i < height; i++)
{
int32_t len = (int32_t)buffer[1][i];
int32_t offset = i * width + c * spatial + b * vol;
for(j = 0; j < len; j++)
{
memcpy(buffer[2] + outIdx, buffer[0] + offset, sizeof(float) * width);
outIdx += width;
}
}
}
}
}
else if (axis == 1)
{
for(b = 0; b < batch; b++)
{
for(c = 0; c < channel; c++)
{
for(i = 0; i < height; i++)
{
int32_t offset = i * width + c * spatial + b * vol;
for(j = 0; j < width; j++)
{
int32_t len = (int32_t)buffer[1][j];
float data = buffer[0][offset + j];
int32_t k = 0;
for(k = 0; k < len; k++)
{
buffer[2][outIdx++] = data;
}
}
}
}
}
}
else if (axis == 2)
{
for(b = 0; b < batch; b++)
{
for(c = 0; c < channel; c++)
{
int32_t len = (int32_t)buffer[1][c];
int32_t offset = c * spatial + b * vol;
for(j = 0; j < len; j++)
{
memcpy(buffer[2] + outIdx, buffer[0] + offset, sizeof(float) * spatial);
outIdx += spatial;
}
}
}
}
else
{
VSILOGE("axis is not supported");
status = VSI_FAILURE;
goto final;
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2],
buffer[2], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
if( buffer[i] )
{
free( buffer[i] );
}
}
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
}
return status;
} /* _repeat_exec() */
/*
* Kernel params
*/
static vx_param_description_t _repeat_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _REPEAT_PARAM_NUM _cnt_of_array( _repeat_kernel_param_def )
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_repeat_exec,
_repeat_kernel_param_def,
_cnt_of_array( _repeat_kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
return VSI_SUCCESS;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
status = _query_kernel( inputs, outputs, kernel );
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM );
backend_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &axis );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
CHECK_STATUS( status );
vsi_nn_kernel_scalar_release( &backend_params[3] );
}
else
{
status = VSI_FAILURE;
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( repeat, _setup )

View File

@@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@@ -0,0 +1,248 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS
#define _CPU_ARG_NUM (1)
#define _CPU_INPUT_NUM (1)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("sequence_mask_sw")
DEF_KERNEL_EXECUTOR(_sequence_mask_exec)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VX_SUCCESS;
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
float * buffer_in = NULL;
float * buffer = NULL;
size_t out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
uint32_t i = 0;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
buffer_in = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer_in, "Create input0 buffer fail.", final );
buffer = (float *)malloc( out_elements * sizeof(float) );
CHECK_PTR_FAIL_GOTO( buffer, "Create output buffer fail.", final );
memset( buffer, 0, out_elements * sizeof(float) );
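/* Each output row is a binary mask: the first min(length[j], max_len) entries are set to 1, the rest stay 0. */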
{
uint32_t j = 0;
uint32_t height = attr[1]->shape->data[1];
uint32_t width = attr[1]->shape->data[0];
for(j = 0; j < height; j++)
{
uint32_t idx_in = (uint32_t)buffer_in[j];
uint32_t out_offset = j * width;
idx_in = idx_in > width ? width : idx_in;
for(i = 0; i < idx_in; i++)
{
buffer[out_offset + i] = 1;
}
}
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
buffer, out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
if (buffer_in)
{
free( buffer_in );
}
if (buffer)
{
free( buffer );
}
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
vsi_nn_kernel_tensor_attr_release( &attr[i] );
}
return status;
} /* _sequence_mask_exec() */
static vx_param_description_t kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
static const vx_kernel_description_t _kernel_info =
{
KERNEL_ID_PLACEHOLDER,
_KERNEL_NAME,
_sequence_mask_exec,
kernel_param_def,
_cnt_of_array( kernel_param_def ),
vsi_nn_KernelValidator,
NULL,
NULL,
vsi_nn_KernelInitializer,
vsi_nn_KernelDeinitializer
};
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel
)
{
memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) );
return VSI_SUCCESS;
} /* _query_kernel() */
static int32_t _optimize_mask_shape
(
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
int32_t max_len,
int32_t* opt_shape_in,
int32_t* opt_shape_out
)
{
vsi_status status = VSI_SUCCESS;
int32_t out_size = 1;
uint32_t i = 0;
opt_shape_in[0] = 1;
opt_shape_in[1] = 1;
for(i = 0; i < inputs[0]->attr.dim_num; i++)
{
opt_shape_in[0] *= inputs[0]->attr.size[i];
}
for(i = 0; i < outputs[0]->attr.dim_num; i++)
{
out_size *= outputs[0]->attr.size[i];
}
opt_shape_out[0] = max_len;
opt_shape_out[1] = out_size / max_len;
if (out_size % max_len != 0)
{
return VSI_FAILURE;
}
return status;
}
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_SUCCESS;
vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL;
int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }};
int32_t max_len = vsi_nn_kernel_param_get_int32( params, "max_len" );
status = _optimize_mask_shape(inputs, outputs, max_len, new_shape[0], new_shape[1]);
if ( VSI_SUCCESS != status )
{
goto final;
}
rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], 2);
rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], 2);
status = _query_kernel( inputs, outputs, kernel );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
uint32_t index = 0;
/* Pass parameters to node. */
backend_params[index++] = rs_input;
backend_params[index++] = rs_output;
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &max_len );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &backend_params[2] );
}
else
{
status = VSI_FAILURE;
}
}
final:
if (rs_input)
{
vsi_nn_kernel_tensor_release( &rs_input );
}
if (rs_output)
{
vsi_nn_kernel_tensor_release( &rs_output );
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( sequence_mask, _setup )

View File

@@ -0,0 +1,246 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (2)
#define _OUTPUT_NUM (1)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.slice")
/*
* Kernel params
*/
static vx_param_description_t _slice_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _SLICE_PARAM_NUM _cnt_of_array( _slice_kernel_param_def )
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
size_t out_elements[_OUTPUT_NUM] = {0};
size_t out_bytes[_OUTPUT_NUM] = {0};
int32_t rank = 0;
int32_t i = 0;
int32_t in_w = 0;
int32_t in_h = 0;
int32_t in_c = 0;
int32_t in_b = 0;
int32_t start[4] = {0};
int32_t stop[4] = {0};
int32_t in_size[4] = {1, 1, 1, 1};
int32_t out_size[4] = {1, 1, 1, 1};
float *input_ptr = NULL;
float *output_ptr = NULL;
int32_t dstIdx = 0;
/* prepare data */
for (i = 0; i < _INPUT_NUM; i ++)
{
input[i] = (vsi_nn_kernel_tensor_t)param[i];
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
out_bytes[i] = out_elements[i] * sizeof(float);
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
memset( f32_out_buffer[i], 0, out_bytes[i] );
}
rank = (int32_t)out_attr[0]->shape->size;
for (i = 0; i < rank; i++)
{
in_size[i] = in_attr[0]->shape->data[i];
out_size[i] = out_attr[0]->shape->data[i];
}
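/* The second input carries the begin coordinates; the slice extent along each dimension is taken from the output shape. */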
start[0] = (int32_t)f32_in_buffer[1][0];
stop[0] = start[0] + out_attr[0]->shape->data[0];
start[1] = rank < 2 ? 0 : (int32_t)f32_in_buffer[1][1];
stop[1] = rank < 2 ? 1 : start[1] + out_size[1];
start[2] = rank < 3 ? 0 : (int32_t)f32_in_buffer[1][2];
stop[2] = rank < 3 ? 1 : start[2] + out_size[2];
start[3] = rank < 4 ? 0 : (int32_t)f32_in_buffer[1][3];
stop[3] = rank < 4 ? 1 : start[3] + out_size[3];
input_ptr = f32_in_buffer[0];
output_ptr = f32_out_buffer[0];
for (in_b = start[3]; in_b < stop[3]; ++in_b)
{
for (in_c = start[2]; in_c < stop[2]; ++in_c)
{
for (in_h = start[1]; in_h < stop[1]; ++in_h)
{
for (in_w = start[0]; in_w < stop[0]; ++in_w)
{
int32_t srcIdx = ((in_b * in_size[2] + in_c) * in_size[1] + in_h) * in_size[0] + in_w;
output_ptr[dstIdx ++] = input_ptr[srcIdx];
}
}
}
}
/* save data */
for(i = 0; i < _OUTPUT_NUM; i++)
{
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
f32_out_buffer[i], out_elements[i] );
CHECK_STATUS_FAIL_GOTO( status, final );
}
final:
for (i = 0; i < _INPUT_NUM; i++)
{
if (f32_in_buffer[i])
{
free(f32_in_buffer[i]);
f32_in_buffer[i] = NULL;
}
if (in_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
}
}
for (i = 0; i < _OUTPUT_NUM; i++)
{
if (f32_out_buffer[i])
{
free(f32_out_buffer[i]);
f32_out_buffer[i] = NULL;
}
if (out_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
}
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _slice_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _slice_kernel_param_def );
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_SLICE_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _SLICE_PARAM_NUM,
inputs, input_num, outputs, output_num );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _SLICE_PARAM_NUM );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( slice, _setup )

View File

@@ -35,7 +35,7 @@
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@@ -32,7 +32,7 @@
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "client/vsi_nn_vxkernel.h"
#include "libnnext/vsi_nn_vxkernel.h"
__BEGIN_DECLS

View File

@@ -0,0 +1,297 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (2)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.topk")
/*
* Kernel params
*/
static vx_param_description_t _topk_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
// Add kernel parameters here
};
#define _TOPK_PARAM_NUM _cnt_of_array( _topk_kernel_param_def )
static uint32_t _max_comp_func(void* data, int32_t left, int32_t right)
{
float* fdata = (float*)data;
if (fdata[left] >= fdata[right])
{
return TRUE;
}
else
{
return FALSE;
}
}
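/* Quickselect-style top-k: repartition (descending) until the pivot lands at index k, then sort the first k entries so values and indices come out largest-first. */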
static void _find_top_k_1d
(
float* input,
uint32_t input_len,
uint32_t k,
float* value,
uint32_t* indices
)
{
int32_t low = 0;
int32_t high = input_len - 1;
int32_t j;
for (j = 0; j < (int32_t)input_len; j++)
{
indices[j] = j;
}
j = vsi_nn_partition(input, low, high, _max_comp_func, FALSE, indices);
// partial sort: repartition until the pivot index equals k
while (j != (int32_t)k)
{
if ((int32_t)k > j)
{
low = j + 1;
}
else
{
high = j;
}
j = vsi_nn_partition(input, low, high, _max_comp_func, FALSE, indices);
}
// fully sort the selected top-k entries
vsi_nn_partition(input, 0, k - 1, _max_comp_func, TRUE, indices);
for (j = 0; j < (int32_t)k; j++)
{
value[j] = input[indices[j]];
}
}
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL};
float *f32_in_buffer[_INPUT_NUM] = {NULL};
float *f32_out_buffer[_OUTPUT_NUM] = {NULL};
vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM];
vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM];
size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}};
size_t out_elements[_OUTPUT_NUM] = {0};
size_t out_bytes[_OUTPUT_NUM] = {0};
uint32_t i = 0;
int32_t j = 0;
int32_t top_k = 0;
uint32_t block_num = 0;
uint32_t block_size = 0;
uint32_t * indices_ptr = NULL;
/* prepare data */
for (i = 0; i < _INPUT_NUM; i ++)
{
input[i] = (vsi_nn_kernel_tensor_t)param[i];
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input buffer fail.", final );
}
for (i = 0; i < _OUTPUT_NUM; i ++)
{
output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM];
out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] );
vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] );
out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] );
out_bytes[i] = out_elements[i] * sizeof(float);
f32_out_buffer[i] = (float *)malloc( out_bytes[i] );
CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final );
memset( f32_out_buffer[i], 0, out_bytes[i] );
}
status = vsi_nn_kernel_scalar_read_int32( param[3], &top_k );
CHECK_STATUS_FAIL_GOTO(status, final );
block_num = in_attr[0]->shape->data[1];
block_size = in_attr[0]->shape->data[0];
indices_ptr = (uint32_t*)malloc(block_size * sizeof(uint32_t));
CHECK_PTR_FAIL_GOTO( indices_ptr, "Create indices buffer fail.", final );
for(i = 0; i < block_num; i++)
{
uint32_t in_index = i * block_size;
uint32_t out_index = i * top_k;
_find_top_k_1d(&(f32_in_buffer[0][in_index]),
block_size, top_k, &(f32_out_buffer[0][out_index]), indices_ptr);
for (j = 0; j < top_k; j++)
{
f32_out_buffer[1][out_index + j] = (float)indices_ptr[j];
}
}
// Handle the 1D input
if (!block_num)
{
_find_top_k_1d(&(f32_in_buffer[0][0]),
block_size, top_k, &(f32_out_buffer[0][0]), indices_ptr);
for (j = 0; j < top_k; j++)
{
f32_out_buffer[1][j] = (float)indices_ptr[j];
}
}
/* save data */
for(i = 0; i < _OUTPUT_NUM; i++)
{
status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i],
f32_out_buffer[i], out_elements[i] );
CHECK_STATUS_FAIL_GOTO( status, final );
}
final:
vsi_nn_safe_free(indices_ptr);
for (i = 0; i < _INPUT_NUM; i++)
{
if (f32_in_buffer[i])
{
free(f32_in_buffer[i]);
f32_in_buffer[i] = NULL;
}
if (in_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &in_attr[i] );
}
}
for (i = 0; i < _OUTPUT_NUM; i++)
{
if (f32_out_buffer[i])
{
free(f32_out_buffer[i]);
f32_out_buffer[i] = NULL;
}
if (out_attr[i])
{
vsi_nn_kernel_tensor_attr_release( &out_attr[i] );
}
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _topk_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _topk_kernel_param_def );
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_TOPK_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k");
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _TOPK_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &top_k );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _TOPK_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[3] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( topk, _setup )

View File

@@ -44,23 +44,26 @@ typedef enum _internal_img_dim_e
IMAGE_2D,
} internal_img_dim_e;
#define _BATCH_NORM_KERNEL_SOURCE "batchnorm_single"
#define SOURCE0 "batchnorm_single"
#define SOURCE1 "batchnorm_single_f32"
#define STR(a) #a
// Add kernel hashtable here
#define BATCH_NORM_HASH_KEY(IN_DTYPE, OUT_DTYPE, BRDCST, _image_2d) \
( ( IN_DTYPE << 16 ) | ( OUT_DTYPE << 3) | ( BRDCST << 1) | (_image_2d) )
#define BATCH_NORM_HASH_KEY(IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, _image_2d) \
( ( IN_DTYPE << 24 ) | ( GAMMA_TYPE << 16 ) | ( OUT_DTYPE << 3) | ( BRDCST << 1) | (_image_2d) )
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, BRDCST) \
{ BATCH_NORM_HASH_KEY( IN_DTYPE, OUT_DTYPE, BRDCST, IMAGE), \
CVIVANTE_NAMESPACE("evis.batch_norm_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_brdcst"#BRDCST), \
_BATCH_NORM_KERNEL_SOURCE}
#define PACK_KERNEL_MAP( IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, source) \
{ BATCH_NORM_HASH_KEY( IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, IMAGE), \
CVIVANTE_NAMESPACE("evis.batch_norm_"STR(IN_DTYPE)"_F16_F16_" \
STR(GAMMA_TYPE)"_F32to"STR(OUT_DTYPE)"_brdcst"#BRDCST), \
source}
#define PACK_KERNEL_MAP_2D( IN_DTYPE, OUT_DTYPE, BRDCST) \
{ BATCH_NORM_HASH_KEY( IN_DTYPE, OUT_DTYPE, BRDCST, IMAGE_2D), \
CVIVANTE_NAMESPACE("evis.batch_norm_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_brdcst"#BRDCST"_2D"), \
_BATCH_NORM_KERNEL_SOURCE}
#define PACK_KERNEL_MAP_2D( IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, source) \
{ BATCH_NORM_HASH_KEY( IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, IMAGE_2D), \
CVIVANTE_NAMESPACE("evis.batch_norm_"STR(IN_DTYPE)"_F16_F16_" \
STR(GAMMA_TYPE)"_F32to"STR(OUT_DTYPE)"_brdcst"#BRDCST"_2D"), \
source}
typedef struct
{
@@ -71,47 +74,89 @@ typedef struct
static const _kernel_map_type _batch_norm_kernel_map[] =
{
PACK_KERNEL_MAP(F16, F16, 0),
PACK_KERNEL_MAP(F16, I16, 0),
PACK_KERNEL_MAP(F16, U8, 0),
PACK_KERNEL_MAP(F16, I8, 0),
PACK_KERNEL_MAP(U8, U8, 0),
PACK_KERNEL_MAP(U8, F16, 0),
PACK_KERNEL_MAP(I8, I8, 0),
PACK_KERNEL_MAP(I8, F16, 0),
PACK_KERNEL_MAP(I16, I16, 0),
PACK_KERNEL_MAP(I16, F16, 0),
PACK_KERNEL_MAP(F16, F16, 1),
PACK_KERNEL_MAP(F16, I16, 1),
PACK_KERNEL_MAP(F16, U8, 1),
PACK_KERNEL_MAP(F16, I8, 1),
PACK_KERNEL_MAP(U8, U8, 1),
PACK_KERNEL_MAP(U8, F16, 1),
PACK_KERNEL_MAP(I8, I8, 1),
PACK_KERNEL_MAP(I8, F16, 1),
PACK_KERNEL_MAP(I16, I16, 1),
PACK_KERNEL_MAP(I16, F16, 1),
PACK_KERNEL_MAP(F16, F16, F16, 0, SOURCE0),
PACK_KERNEL_MAP(F16, F16, I16, 0, SOURCE0),
PACK_KERNEL_MAP(F16, F16, U8, 0, SOURCE0),
PACK_KERNEL_MAP(F16, F16, I8, 0, SOURCE0),
PACK_KERNEL_MAP(U8, F16, U8, 0, SOURCE0),
PACK_KERNEL_MAP(U8, F16, F16, 0, SOURCE0),
PACK_KERNEL_MAP(I8, F16, I8, 0, SOURCE0),
PACK_KERNEL_MAP(I8, F16, F16, 0, SOURCE0),
PACK_KERNEL_MAP(I16, F16, I16, 0, SOURCE0),
PACK_KERNEL_MAP(I16, F16, F16, 0, SOURCE0),
PACK_KERNEL_MAP(F16, F16, F16, 1, SOURCE0),
PACK_KERNEL_MAP(F16, F16, I16, 1, SOURCE0),
PACK_KERNEL_MAP(F16, F16, U8, 1, SOURCE0),
PACK_KERNEL_MAP(F16, F16, I8, 1, SOURCE0),
PACK_KERNEL_MAP(U8, F16, U8, 1, SOURCE0),
PACK_KERNEL_MAP(U8, F16, F16, 1, SOURCE0),
PACK_KERNEL_MAP(I8, F16, I8, 1, SOURCE0),
PACK_KERNEL_MAP(I8, F16, F16, 1, SOURCE0),
PACK_KERNEL_MAP(I16, F16, I16, 1, SOURCE0),
PACK_KERNEL_MAP(I16, F16, F16, 1, SOURCE0),
PACK_KERNEL_MAP_2D(F16, F16, 0),
PACK_KERNEL_MAP_2D(F16, I16, 0),
PACK_KERNEL_MAP_2D(F16, U8 , 0),
PACK_KERNEL_MAP_2D(F16, I8 , 0),
PACK_KERNEL_MAP_2D(U8, U8 , 0),
PACK_KERNEL_MAP_2D(U8, F16, 0),
PACK_KERNEL_MAP_2D(I8, I8, 0),
PACK_KERNEL_MAP_2D(I8, F16, 0),
PACK_KERNEL_MAP_2D(I16, I16, 0),
PACK_KERNEL_MAP_2D(I16, F16, 0),
PACK_KERNEL_MAP_2D(F16, F16, 1),
PACK_KERNEL_MAP_2D(F16, I16, 1),
PACK_KERNEL_MAP_2D(F16, U8 , 1),
PACK_KERNEL_MAP_2D(F16, I8 , 1),
PACK_KERNEL_MAP_2D(U8, U8 , 1),
PACK_KERNEL_MAP_2D(U8, F16, 1),
PACK_KERNEL_MAP_2D(I8, I8, 1),
PACK_KERNEL_MAP_2D(I8, F16, 1),
PACK_KERNEL_MAP_2D(I16, I16, 1),
PACK_KERNEL_MAP_2D(I16, F16, 1),
PACK_KERNEL_MAP(F16, F32, F16, 0, SOURCE1),
PACK_KERNEL_MAP(F16, F32, I16, 0, SOURCE1),
PACK_KERNEL_MAP(F16, F32, U8, 0, SOURCE1),
PACK_KERNEL_MAP(F16, F32, I8, 0, SOURCE1),
PACK_KERNEL_MAP(U8, F32, U8, 0, SOURCE1),
PACK_KERNEL_MAP(U8, F32, F16, 0, SOURCE1),
PACK_KERNEL_MAP(I8, F32, I8, 0, SOURCE1),
PACK_KERNEL_MAP(I8, F32, F16, 0, SOURCE1),
PACK_KERNEL_MAP(I16, F32, I16, 0, SOURCE1),
PACK_KERNEL_MAP(I16, F32, F16, 0, SOURCE1),
PACK_KERNEL_MAP(F16, F32, F16, 1, SOURCE1),
PACK_KERNEL_MAP(F16, F32, I16, 1, SOURCE1),
PACK_KERNEL_MAP(F16, F32, U8, 1, SOURCE1),
PACK_KERNEL_MAP(F16, F32, I8, 1, SOURCE1),
PACK_KERNEL_MAP(U8, F32, U8, 1, SOURCE1),
PACK_KERNEL_MAP(U8, F32, F16, 1, SOURCE1),
PACK_KERNEL_MAP(I8, F32, I8, 1, SOURCE1),
PACK_KERNEL_MAP(I8, F32, F16, 1, SOURCE1),
PACK_KERNEL_MAP(I16, F32, I16, 1, SOURCE1),
PACK_KERNEL_MAP(I16, F32, F16, 1, SOURCE1),
PACK_KERNEL_MAP_2D(F16, F16, F16, 0, SOURCE0),
PACK_KERNEL_MAP_2D(F16, F16, I16, 0, SOURCE0),
PACK_KERNEL_MAP_2D(F16, F16, U8, 0, SOURCE0),
PACK_KERNEL_MAP_2D(F16, F16, I8, 0, SOURCE0),
PACK_KERNEL_MAP_2D(U8, F16, U8, 0, SOURCE0),
PACK_KERNEL_MAP_2D(U8, F16, F16, 0, SOURCE0),
PACK_KERNEL_MAP_2D(I8, F16, I8, 0, SOURCE0),
PACK_KERNEL_MAP_2D(I8, F16, F16, 0, SOURCE0),
PACK_KERNEL_MAP_2D(I16, F16, I16, 0, SOURCE0),
PACK_KERNEL_MAP_2D(I16, F16, F16, 0, SOURCE0),
PACK_KERNEL_MAP_2D(F16, F16, F16, 1, SOURCE0),
PACK_KERNEL_MAP_2D(F16, F16, I16, 1, SOURCE0),
PACK_KERNEL_MAP_2D(F16, F16, U8, 1, SOURCE0),
PACK_KERNEL_MAP_2D(F16, F16, I8, 1, SOURCE0),
PACK_KERNEL_MAP_2D(U8, F16, U8, 1, SOURCE0),
PACK_KERNEL_MAP_2D(U8, F16, F16, 1, SOURCE0),
PACK_KERNEL_MAP_2D(I8, F16, I8, 1, SOURCE0),
PACK_KERNEL_MAP_2D(I8, F16, F16, 1, SOURCE0),
PACK_KERNEL_MAP_2D(I16, F16, I16, 1, SOURCE0),
PACK_KERNEL_MAP_2D(I16, F16, F16, 1, SOURCE0),
PACK_KERNEL_MAP_2D(F16, F32, F16, 0, SOURCE1),
PACK_KERNEL_MAP_2D(F16, F32, I16, 0, SOURCE1),
PACK_KERNEL_MAP_2D(F16, F32, U8, 0, SOURCE1),
PACK_KERNEL_MAP_2D(F16, F32, I8, 0, SOURCE1),
PACK_KERNEL_MAP_2D(U8, F32, U8, 0, SOURCE1),
PACK_KERNEL_MAP_2D(U8, F32, F16, 0, SOURCE1),
PACK_KERNEL_MAP_2D(I8, F32, I8, 0, SOURCE1),
PACK_KERNEL_MAP_2D(I8, F32, F16, 0, SOURCE1),
PACK_KERNEL_MAP_2D(I16, F32, I16, 0, SOURCE1),
PACK_KERNEL_MAP_2D(I16, F32, F16, 0, SOURCE1),
PACK_KERNEL_MAP_2D(F16, F32, F16, 1, SOURCE1),
PACK_KERNEL_MAP_2D(F16, F32, I16, 1, SOURCE1),
PACK_KERNEL_MAP_2D(F16, F32, U8, 1, SOURCE1),
PACK_KERNEL_MAP_2D(F16, F32, I8, 1, SOURCE1),
PACK_KERNEL_MAP_2D(U8, F32, U8, 1, SOURCE1),
PACK_KERNEL_MAP_2D(U8, F32, F16, 1, SOURCE1),
PACK_KERNEL_MAP_2D(I8, F32, I8, 1, SOURCE1),
PACK_KERNEL_MAP_2D(I8, F32, F16, 1, SOURCE1),
PACK_KERNEL_MAP_2D(I16, F32, I16, 1, SOURCE1),
PACK_KERNEL_MAP_2D(I16, F32, F16, 1, SOURCE1),
};
/*
@@ -329,6 +374,7 @@ static vsi_status _query_kernel
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e gamma_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _batch_norm_kernel_map;
size_t kernel_map_size = _cnt_of_array( _batch_norm_kernel_map );
@@ -340,6 +386,7 @@ static vsi_status _query_kernel
uint32_t brdcst = 0;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
gamma_dtype = vsi_nn_kernel_map_dtype( inputs[3]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (inputs[BATCHNORM_INPUT]->attr.size[0] != 1 && inputs[BATCHNORM_INPUT_BETA]->attr.size[0] == 1)
@@ -347,7 +394,7 @@ static vsi_status _query_kernel
brdcst = 1;
}
key = BATCH_NORM_HASH_KEY(in_dtype, out_dtype, brdcst, image_2d);
key = BATCH_NORM_HASH_KEY(in_dtype, gamma_dtype, out_dtype, brdcst, image_2d);
for( i = 0; i < kernel_map_size; i ++ )
{
@@ -397,7 +444,6 @@ static vsi_nn_kernel_node_t _setup
if ( (inputs[1]->attr.is_const && inputs[2]->attr.is_const)
|| (inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16)
|| (inputs[2]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16)
|| (inputs[3]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16)
|| (inputs[4]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32) )
{
return NULL;

View File

@@ -241,6 +241,7 @@ static vsi_status _query_kernel
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in_dtype = in_dtype == BOOL8 ? I8 : in_dtype;
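/* BOOL8 is stored as an 8-bit value, so it is hashed as I8 and reuses the I8 kernels. */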
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = CAST_HASH_KEY( in_dtype, out_dtype, image_2d );

View File

@@ -455,6 +455,7 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
output_dtype = output_dtype == I8 ? BOOL8 : output_dtype;
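/* Comparison results are logically boolean; an I8 output tensor is hashed as
 * BOOL8 so the lookup lands on the boolean-output kernel entries. */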
key = HASH_COMPARISONS_KEY( operation, input0_dtype, input1_dtype, output_dtype, image_2d );
for( i = 0; i < _cnt_of_array(_comparisons_evis_kernel_map); i ++ )

View File

@@ -0,0 +1,702 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum
{
NORMAL = 0,
K3_S1,
K3_S1_D2_D4,
K1024_SMALL,
K1024_LARGE,
} _internal_kernel_e;
#define _CONV1D_OVXLIB_KERNEL_SOURCE "conv1d_ovxlib"
#define _CONV1D_OVXLIB_KERNEL_SOURCE_K1024 "conv1d_ovxlib_k1024"
#define STR(a) #a
// Add kernel hashtable here
#define CONV1D_OVXLIB_HASH_KEY( IN_DTYPE, W_DTYPE, B_DTYPE, OUT_DTYPE, KERNEL_TYPE ) \
(( KERNEL_TYPE << 24 ) | ( IN_DTYPE << 18 ) | ( W_DTYPE << 12 ) | ( B_DTYPE << 6 ) | ( OUT_DTYPE ))
#define PACK_KERNEL_MAP( IN_DTYPE, W_DTYPE, B_DTYPE, OUT_DTYPE, KERNEL_TYPE, SOURCE ) \
{ CONV1D_OVXLIB_HASH_KEY( IN_DTYPE, W_DTYPE, B_DTYPE, OUT_DTYPE, KERNEL_TYPE ), \
CVIVANTE_NAMESPACE(\
"evis.conv1d_"STR(IN_DTYPE)STR(W_DTYPE)STR(B_DTYPE)"to"STR(OUT_DTYPE)"_"STR(KERNEL_TYPE)), \
SOURCE }
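/* Example expansion: PACK_KERNEL_MAP( U8, U8, I32, U8, K3_S1, _CONV1D_OVXLIB_KERNEL_SOURCE )
 * yields { CONV1D_OVXLIB_HASH_KEY( U8, U8, I32, U8, K3_S1 ),
 *          CVIVANTE_NAMESPACE("evis.conv1d_U8U8I32toU8_K3_S1"),
 *          "conv1d_ovxlib" }. */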
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _conv1d_ovxlib_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( U8, U8, I32, U8, K3_S1, _CONV1D_OVXLIB_KERNEL_SOURCE ),
PACK_KERNEL_MAP( U8, U8, I32, U8, K3_S1_D2_D4, _CONV1D_OVXLIB_KERNEL_SOURCE ),
PACK_KERNEL_MAP( U8, U8, I32, U8, K1024_SMALL, _CONV1D_OVXLIB_KERNEL_SOURCE_K1024 ),
PACK_KERNEL_MAP( U8, U8, I32, U8, K1024_LARGE, _CONV1D_OVXLIB_KERNEL_SOURCE_K1024 ),
};
/*
* Kernel params
*/
static vx_param_description_t _conv1d_ovxlib_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CONV1D_OVXLIB_PARAM_NUM _cnt_of_array( _conv1d_ovxlib_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_conv1d_ovxlib_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_nn_kernel_tensor_attr_t * weights_attr = NULL;
vsi_nn_kernel_tensor_attr_t * output_attr = NULL;
vsi_int_array_t * in_shape = NULL;
vsi_int_array_t * out_shape = NULL;
vsi_int_array_t * weight_shape = NULL;
float scaleIn = 1.0f;
float scaleOut = 1.0f;
float scaleWights = 1.0f;
int32_t input_ZP = 0;
int32_t weight_ZP = 0;
float output_ZP = 0;
int32_t stride = 1;
int32_t dilation = 0;
int32_t input_height = 0;
int32_t input_width = 0;
int32_t output_width = 0;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
weights_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( weights_attr, "Create tensor attr buffer fail.", final );
output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final );
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &(stride));
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &(dilation));
in_shape = input_attr->shape;
out_shape = output_attr->shape;
weight_shape = weights_attr->shape;
if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
{
input_ZP = input_attr->asymm.zero_point;
scaleIn = input_attr->asymm.scale;
}
if ( VSI_NN_KERNEL_QUANT_ASYMM == weights_attr->quant )
{
weight_ZP = weights_attr->asymm.zero_point;
scaleWights = weights_attr->asymm.scale;
}
if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant )
{
output_ZP = (float)output_attr->asymm.zero_point;
scaleOut = output_attr->asymm.scale;
}
scaleOut = (scaleIn * scaleWights) / scaleOut;
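/* scaleOut now holds (input scale * weight scale) / output scale, the
 * multiplier that maps the raw integer accumulation into output quantized
 * units (q_out = acc * scaleOut + output_ZP). E.g. with input scale 0.5,
 * weight scale 0.02 and output scale 0.1 (example values), scaleOut = 0.1. */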
input_height = in_shape->data[1];
input_width = in_shape->data[0];
output_width = out_shape->data[0];
if ((U8 == input_attr->dtype) && (U8 == weights_attr->dtype) && (U8 == output_attr->dtype))
{
gpu_dp_inst_t uniSumOrderUchar_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x0c080400, 0x0c080400, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
if ( (3 == weight_shape->data[0]) && (1 == stride) )
{
gpu_dp_inst_t uniConv1DK3_Lo0_4x4 = {{
0x69696969, // TCfg
0x44444444, // ASelt
0x41014000, 0x43034202, // ABin
0x55555555, // BSelt
0x55405540, 0x55405540, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConv1DK3_Lo1_4x4 = {{
0x69696969, // TCfg
0x44444444, // ASelt
0x41114010, 0x43134212, // ABin
0x55555555, // BSelt
0x55415541, 0x55415541, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConv1DK3_Lo2_4x4 = {{
0x69696969, // TCfg
0x44444444, // ASelt
0x41214020, 0x43234222, // ABin
0x55555555, // BSelt
0x55425542, 0x55425542, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConv1DK3_Hi0_4x4 = {{
0x69696969, // TCfg
0x44444444, // ASelt
0x45054404, 0x47074606, // ABin
0x55555555, // BSelt
0x55405540, 0x55405540, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConv1DK3_Hi1_4x4 = {{
0x69696969, // TCfg
0x44444444, // ASelt
0x45154414, 0x47174616, // ABin
0x55555555, // BSelt
0x55415541, 0x55415541, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConv1DK3_Hi2_4x4 = {{
0x69696969, // TCfg
0x44444444, // ASelt
0x45254424, 0x47274626, // ABin
0x55555555, // BSelt
0x55425542, 0x55425542, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniDataConvK3_2x8 = {{
0x00111111, // TCfg
0x00110000, // ASelt
0x03020100, 0x00000504, // ABin
0x00222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
uint32_t conv1dK3D2_Lo1[4] = {0x43134212, 0x45154414, 0x55415541, 0x55415541};
uint32_t conv1dK3D2_Lo2[4] = {0x45254424, 0x47274626, 0x55425542, 0x55425542};
uint32_t conv1dK3D2_Hi1[4] = {0x47174616, 0x49194818, 0x55415541, 0x55415541};
uint32_t conv1dK3D2_Hi2[4] = {0x49294828, 0x4b2b4a2a, 0x55425542, 0x55425542};
uint32_t conv1dK3D4_Lo1[4] = {0x45154414, 0x47174616, 0x55415541, 0x55415541};
uint32_t conv1dK3D4_Lo2[4] = {0x49294828, 0x4b2b4a2a, 0x55425542, 0x55425542};
uint32_t conv1dK3D4_Hi1[4] = {0x49194818, 0x4b1b4a1a, 0x55415541, 0x55415541};
uint32_t conv1dK3D4_Hi2[4] = {0x4d2d4c2c, 0x4f2f4e2e, 0x55425542, 0x55425542};
if (2 == dilation)
{
uniConv1DK3_Lo1_4x4.data[2] = conv1dK3D2_Lo1[0];
uniConv1DK3_Lo1_4x4.data[3] = conv1dK3D2_Lo1[1];
uniConv1DK3_Lo1_4x4.data[5] = conv1dK3D2_Lo1[2];
uniConv1DK3_Lo1_4x4.data[6] = conv1dK3D2_Lo1[3];
uniConv1DK3_Lo2_4x4.data[2] = conv1dK3D2_Lo2[0];
uniConv1DK3_Lo2_4x4.data[3] = conv1dK3D2_Lo2[1];
uniConv1DK3_Lo2_4x4.data[5] = conv1dK3D2_Lo2[2];
uniConv1DK3_Lo2_4x4.data[6] = conv1dK3D2_Lo2[3];
uniConv1DK3_Hi1_4x4.data[2] = conv1dK3D2_Hi1[0];
uniConv1DK3_Hi1_4x4.data[3] = conv1dK3D2_Hi1[1];
uniConv1DK3_Hi1_4x4.data[5] = conv1dK3D2_Hi1[2];
uniConv1DK3_Hi1_4x4.data[6] = conv1dK3D2_Hi1[3];
uniConv1DK3_Hi2_4x4.data[2] = conv1dK3D2_Hi2[0];
uniConv1DK3_Hi2_4x4.data[3] = conv1dK3D2_Hi2[1];
uniConv1DK3_Hi2_4x4.data[5] = conv1dK3D2_Hi2[2];
uniConv1DK3_Hi2_4x4.data[6] = conv1dK3D2_Hi2[3];
}
else if (4 == dilation)
{
uniConv1DK3_Lo1_4x4.data[2] = conv1dK3D4_Lo1[0];
uniConv1DK3_Lo1_4x4.data[3] = conv1dK3D4_Lo1[1];
uniConv1DK3_Lo1_4x4.data[5] = conv1dK3D4_Lo1[2];
uniConv1DK3_Lo1_4x4.data[6] = conv1dK3D4_Lo1[3];
uniConv1DK3_Lo2_4x4.data[2] = conv1dK3D4_Lo2[0];
uniConv1DK3_Lo2_4x4.data[3] = conv1dK3D4_Lo2[1];
uniConv1DK3_Lo2_4x4.data[5] = conv1dK3D4_Lo2[2];
uniConv1DK3_Lo2_4x4.data[6] = conv1dK3D4_Lo2[3];
uniConv1DK3_Hi1_4x4.data[2] = conv1dK3D4_Hi1[0];
uniConv1DK3_Hi1_4x4.data[3] = conv1dK3D4_Hi1[1];
uniConv1DK3_Hi1_4x4.data[5] = conv1dK3D4_Hi1[2];
uniConv1DK3_Hi1_4x4.data[6] = conv1dK3D4_Hi1[3];
uniConv1DK3_Hi2_4x4.data[2] = conv1dK3D4_Hi2[0];
uniConv1DK3_Hi2_4x4.data[3] = conv1dK3D4_Hi2[1];
uniConv1DK3_Hi2_4x4.data[5] = conv1dK3D4_Hi2[2];
uniConv1DK3_Hi2_4x4.data[6] = conv1dK3D4_Hi2[3];
}
status = vsi_nn_kernel_gpu_add_param( node,
"uniConv1DK3_Lo0_4x4", &uniConv1DK3_Lo0_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConv1DK3_Hi0_4x4", &uniConv1DK3_Hi0_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConv1DK3_Lo1_4x4", &uniConv1DK3_Lo1_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConv1DK3_Lo2_4x4", &uniConv1DK3_Lo2_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConv1DK3_Hi1_4x4", &uniConv1DK3_Hi1_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConv1DK3_Hi2_4x4", &uniConv1DK3_Hi2_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniDataConvK3_2x8", &uniDataConvK3_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumOrderUchar_2x8", &uniSumOrderUchar_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "input_ZP", &input_ZP);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if ( (1024 == weight_shape->data[0]) && (1 == stride) )
{
gpu_dp_inst_t uniU8SubZp_lo_2x8= {{
0x99999999, // TCfg
0x44444444, // ASelt
0x03020100, 0x07060504, // ABin
0xaaaaaaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001,
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniU8SubZp_hi_2x8= {{
0x99999999, // TCfg
0x44444444, // ASelt
0x0b0a0908, 0x0f0e0d0c, // ABin
0xaaaaaaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001,
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniU8Conv1d_part0_8x2= {{
0x55555555, // TCfg
0x00000000, // ASelt
0x76543210, 0x87654321, // ABin
0x55555555, // BSelt
0x76543210, 0x76543210, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniU8Conv1d_part1_8x2= {{
0x55555555, // TCfg
0x00000000, // ASelt
0x98765432, 0xa9876543, // ABin
0x55555555, // BSelt
0x76543210, 0x76543210, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniU8Conv1d_part2_8x2= {{
0x55555555, // TCfg
0x00000000, // ASelt
0xba987654, 0xcba98765, // ABin
0x55555555, // BSelt
0x76543210, 0x76543210, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniU8Conv1d_part3_8x2= {{
0x55555555, // TCfg
0x00000000, // ASelt
0xdcba9876, 0xedcba987, // ABin
0x55555555, // BSelt
0x76543210, 0x76543210, // BBin
0x00000400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
int32_t kernel_cnt_x16 = (weight_shape->data[0] + 15) / 16;
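/* Ceil division: the taps are consumed 16 at a time, so a 1024-tap kernel
 * gives kernel_cnt_x16 = (1024 + 15) / 16 = 64 inner iterations. */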
status = vsi_nn_kernel_gpu_add_param( node,
"kernel_cnt_x16", &kernel_cnt_x16 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8SubZp_lo_2x8", &uniU8SubZp_lo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8SubZp_hi_2x8", &uniU8SubZp_hi_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8Conv1d_part0_8x2", &uniU8Conv1d_part0_8x2 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8Conv1d_part1_8x2", &uniU8Conv1d_part1_8x2 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8Conv1d_part2_8x2", &uniU8Conv1d_part2_8x2 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8Conv1d_part3_8x2", &uniU8Conv1d_part3_8x2 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniSumOrderUchar_2x8", &uniSumOrderUchar_2x8 );
if (input_width >= GPU_TENSOR_MAX_WIDTH)
{
status |= vsi_nn_kernel_gpu_add_param( node, "input_width", &input_width);
status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &output_width);
}
CHECK_STATUS_FAIL_GOTO(status, final );
}
status = vsi_nn_kernel_gpu_add_param( node, "weight_ZP", &weight_ZP);
status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &output_ZP);
status |= vsi_nn_kernel_gpu_add_param( node, "scaleOut", &scaleOut);
status |= vsi_nn_kernel_gpu_add_param( node, "input_height", &input_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.dim = 2;
gpu_param.global_size[0] = (
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0]);
gpu_param.global_size[1] = (
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
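/* Each work-item covers 8 output columns, so the launch grid is
 * ceil(out_width / 8) work-items along x and one per row along y;
 * e.g. an output width of 100 dispatches (100 + 7) / 8 = 13 work-items. */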
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(input_attr);
SAFE_FREE_TENSOR_ATTR(weights_attr);
SAFE_FREE_TENSOR_ATTR(output_attr);
#undef SAFE_FREE_TENSOR_ATTR
return status;
} /* _conv1d_ovxlib_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
_internal_kernel_e kernel_type
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e w_dtype;
vsi_nn_kernel_dtype_e b_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _conv1d_ovxlib_kernel_map;
size_t kernel_map_size = _cnt_of_array( _conv1d_ovxlib_kernel_map );
vx_param_description_t * param_def = _conv1d_ovxlib_kernel_param_def;
size_t param_def_size = _cnt_of_array( _conv1d_ovxlib_kernel_param_def );
vx_kernel_initialize_f initializer = _conv1d_ovxlib_initializer;
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
w_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
b_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = CONV1D_OVXLIB_HASH_KEY( in_dtype, w_dtype, b_dtype, out_dtype, kernel_type );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = (uint32_t)param_def_size;
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
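/* Folds the input zero point into the bias: for each output channel k,
 *   new_bias[k] = bias[k] - input_zp * sum_j (W[k][j] - weight_zp),
 * i.e. the input-independent term of sum_j (x[j] - input_zp) * (W[k][j] - weight_zp),
 * so the shader does not have to re-derive it per output element. */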
static vsi_nn_tensor_t* _create_new_bias_tensor
(
vsi_nn_graph_t *graph,
vsi_nn_tensor_t *input,
vsi_nn_tensor_t *weight,
vsi_nn_tensor_t *bias
)
{
vsi_nn_tensor_t * new_bias = NULL;
vsi_nn_tensor_attr_t attr;
int32_t *new_bias_data_ptr = NULL;
uint8_t *weight_data = NULL;
int32_t *bias_data = NULL;
uint32_t i, j;
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
weight_data = vsi_nn_ConvertTensorToData(graph, weight);
if (bias == NULL)
{
memcpy(&attr, &weight->attr, sizeof(vsi_nn_tensor_attr_t));
attr.dim_num = 2;
attr.size[0] = weight->attr.size[2];
attr.size[1] = 1;
if (weight->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC)
{
attr.dtype.scale = input->attr.dtype.scale * weight->attr.dtype.scale;
attr.dtype.zero_point = 0;
attr.dtype.vx_type = VSI_NN_TYPE_INT32;
}
}
else
{
memcpy(&attr, &bias->attr, sizeof(vsi_nn_tensor_attr_t));
if (attr.dim_num == 1)
{
attr.size[1] = 1;
attr.dim_num = 2;
}
bias_data = (int32_t *)vsi_nn_ConvertTensorToData(graph, bias);
}
new_bias_data_ptr = (int32_t *)malloc(attr.size[0] * sizeof(int32_t));
if (NULL == new_bias_data_ptr)
{
vsi_nn_safe_free( bias_data );
vsi_nn_safe_free( weight_data );
return NULL;
}
memset((void *)new_bias_data_ptr, 0, sizeof(int32_t) * attr.size[0]);
if (input->attr.dtype.zero_point != 0)
{
for (i = 0; i < weight->attr.size[2]; i++)
{
uint8_t *weight_ptr = weight_data + i * weight->attr.size[0] * weight->attr.size[1];
for (j = 0; j < weight->attr.size[0] * weight->attr.size[1]; j++)
{
new_bias_data_ptr[i] += -((int32_t)weight_ptr[j] - weight->attr.dtype.zero_point) \
* input->attr.dtype.zero_point;
}
}
}
if (bias_data != NULL)
{
for (i = 0; i < attr.size[0]; i++)
{
new_bias_data_ptr[i] += bias_data[i];
}
}
new_bias = vsi_nn_CreateTensorFromData(graph, (uint8_t *)new_bias_data_ptr, &attr);
vsi_nn_safe_free( new_bias_data_ptr );
vsi_nn_safe_free( bias_data );
vsi_nn_safe_free( weight_data );
return new_bias;
}
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CONV1D_OVXLIB_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
int32_t j = 0;
_internal_kernel_e kernel_type = NORMAL;
int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" );
int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" );
int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" );
int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" );
int32_t overflow_policy = vsi_nn_kernel_param_get_int32( params, "overflow_policy" );
vsi_nn_tensor_t *in_tensors[3] = {NULL};
vsi_nn_tensor_t *new_bias = NULL;
if (VX_CONVERT_POLICY_SATURATE == overflow_policy)
{
overflow_policy = 1;
}
else
{
overflow_policy = 0;
}
if ( 1 == stride )
{
if ( 3 == inputs[1]->attr.size[0] )
{
if (2 == dilation || 4 == dilation)
{
kernel_type = K3_S1_D2_D4;
}
else
{
kernel_type = K3_S1;
}
}
else if ( 1024 == inputs[1]->attr.size[0] )
{
if (inputs[0]->attr.size[0] < 65535)
{
kernel_type = K1024_SMALL;
}
else if (0 == pad_front && 0 == pad_end)
{
kernel_type = K1024_LARGE;
}
else
{
return NULL;
}
}
else
{
return NULL;
}
}
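/* Only stride-1 cases get a specialised shader: 3-tap kernels (optionally
 * dilated by 2 or 4) and 1024-tap kernels. The 1024-tap path is split on the
 * input width because very wide rows (>= 65535, cf. GPU_TENSOR_MAX_WIDTH in
 * the initializer) need the large variant, which only supports zero padding;
 * any other configuration returns NULL and no EVIS node is created. */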
if (1024 == inputs[1]->attr.size[0])
{
new_bias = _create_new_bias_tensor(graph, inputs[0], inputs[1], inputs[2]);
in_tensors[0] = inputs[0];
in_tensors[1] = inputs[1];
in_tensors[2] = new_bias;
}
else
{
in_tensors[0] = inputs[0];
in_tensors[1] = inputs[1];
in_tensors[2] = inputs[2];
}
status = _query_kernel( kernel, inputs, outputs, kernel_type );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
if( pad_front != 0 || pad_end != 0)
{
// Set default border mode.
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U8 = (uint8_t)(inputs[0]->attr.dtype.zero_point);
status |= vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
}
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _CONV1D_OVXLIB_PARAM_NUM,
in_tensors, input_num, outputs, output_num );
j = (int32_t)(input_num + output_num);
node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &stride );
node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &pad_front );
node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &pad_end );
node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &dilation );
node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &overflow_policy );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CONV1D_OVXLIB_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[--j] );
vsi_nn_kernel_scalar_release( &node_params[--j] );
vsi_nn_kernel_scalar_release( &node_params[--j] );
vsi_nn_kernel_scalar_release( &node_params[--j] );
vsi_nn_kernel_scalar_release( &node_params[--j] );
}
}
if (new_bias)
{
vsi_nn_ReleaseTensor(&new_bias);
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( conv1d_ovxlib, _setup )

View File

@@ -42,28 +42,44 @@ __BEGIN_DECLS
/*
* Define kernel meta.
*/
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOU8 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toU8")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOI8 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toI8")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOI16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toI16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toF16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toF16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toF16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI8 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI8")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI16 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toF16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOU8 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toU8")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOU8 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toU8")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOI8 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toI8")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOI16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toI16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toF16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toF16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toF16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI8 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI8")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI16 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toF16")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOU8 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toU8")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOU8_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toU8_blk2")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOI8_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toI8_blk2")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOI16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toI16_blk2")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOF16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toF16_blk2")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOF16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toF16_blk2")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOF16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toF16_blk2")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI8_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI8_blk2")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI16_blk2")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOF16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toF16_blk2")
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOU8_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toU8_blk2")
#define KERNEL_SOURCE_1 "depth2space_crd"
// Add kernel hashtable here
#define HASH_DEPTH2SPACE_CRD_KEY(_input0_type, _output_type, _quant_type) \
((_input0_type << 24) | (_output_type << 16) | (_quant_type << 8))
#define HASH_DEPTH2SPACE_CRD_KEY(_input0_type, _output_type, _blk_size) \
((_input0_type << 24) | (_output_type << 16) | (_blk_size << 8))
#define TENSOR_DEPTH2SPACE_CRD_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_DEPTH2SPACE_CRD_KEY(IN0_TYPE, OUT_TYPE, 0), \
VX_KERNEL_NAME_DEPTH2SPACE_CRD_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_DEPTH2SPACE_CRD_KEY(IN0_TYPE, OUT_TYPE, 1), \
VX_KERNEL_NAME_DEPTH2SPACE_CRD_##IN0_TYPE##TO##OUT_TYPE##_BLK2, \
SOURCE },
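/* Example: TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(U8, U8, KERNEL_SOURCE_1) registers
 * { HASH_DEPTH2SPACE_CRD_KEY(U8, U8, 1),
 *   CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toU8_blk2"),
 *   "depth2space_crd" },
 * the block-size-2 specialisation selected when blk_flg is 1. */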
static const struct {
uint32_t key;
char* function_name;
@@ -80,6 +96,17 @@ static const struct {
TENSOR_DEPTH2SPACE_CRD_KERNELS(F16, I16, KERNEL_SOURCE_1)
TENSOR_DEPTH2SPACE_CRD_KERNELS(U8, F16, KERNEL_SOURCE_1)
TENSOR_DEPTH2SPACE_CRD_KERNELS(F16, U8, KERNEL_SOURCE_1)
TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(U8, U8, KERNEL_SOURCE_1)
TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(I8, I8, KERNEL_SOURCE_1)
TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(I16, I16, KERNEL_SOURCE_1)
TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(F16, F16, KERNEL_SOURCE_1)
TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(I8, F16, KERNEL_SOURCE_1)
TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(I16, F16, KERNEL_SOURCE_1)
TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(F16, I8, KERNEL_SOURCE_1)
TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(F16, I16, KERNEL_SOURCE_1)
TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(U8, F16, KERNEL_SOURCE_1)
TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(F16, U8, KERNEL_SOURCE_1)
};
/*
@@ -118,9 +145,10 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
int32_t output_height = 0;
int32_t output_chn = 0;
int32_t src0ZP = 0;
float src0Scale = 0;
float src0Scale = 1.0f;
int32_t dstZP = 0;
float dstScale = 0;
float dstScale = 1.0f;
int32_t block_size = 0;
uint32_t pack_key = 0;
@@ -128,12 +156,15 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &block_size);
CHECK_STATUS_FAIL_GOTO(status, OnError );
src0ZP = attr[0]->asymm.zero_point;
src0Scale = attr[0]->asymm.scale;
dstZP = attr[1]->asymm.zero_point;
dstScale = attr[1]->asymm.scale;
if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
src0ZP = attr[0]->asymm.zero_point;
src0Scale = attr[0]->asymm.scale;
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[0]->dfp.fl > 0)
{
@@ -143,27 +174,35 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
{
src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
src0ZP = 0;
}
else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
src0Scale = 1;
src0ZP = 0;
}
if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
dstZP = attr[1]->asymm.zero_point;
dstScale = attr[1]->asymm.scale;
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
if (attr[1]->dfp.fl > 0)
{
dstScale = (float)((int64_t)1 << attr[1]->dfp.fl);
dstScale = (1.0f / (float)((int64_t)1 << attr[1]->dfp.fl));
}
else
{
dstScale = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl));
dstScale = (float)((int64_t)1 << -attr[1]->dfp.fl);
}
dstScale = 1.0f/dstScale;
dstZP = 0;
}
else if( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE )
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE )
{
dstScale = 1;
dstZP = 0;
}
output_dims = (uint32_t)attr[1]->shape->size;
@@ -179,6 +218,17 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
shaderParam.global_size[1] = output_height;
shaderParam.global_size[2] = output_chn;
if (block_size == 2)
{
shaderParam.global_scale[0] = 16;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((output_width + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = output_height;
shaderParam.global_size[2] = output_chn;
}
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
@@ -202,6 +252,43 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniU8MulAndPostShift_ExLo_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x19111810, 0x1b131a12, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00005600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniU8MulAndPostShift_ExHi_2x8 = {{
0xdddddddd, // TCfg
0x44444444, // ASelt
0x1d151c14, 0x1f171e16, // ABin
0x11111111, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniDepth2SpaceF16Blk2_lo_2x8 = {{
0x11111111, // TCfg
0x10101010, // ASelt
0x01010000, 0x03030202, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniDepth2SpaceF16Blk2_hi_2x8 = {{
0x11111111, // TCfg
0x10101010, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
switch( pack_key )
{
case _PACK_SELECT_KEY( U8, F16):
@@ -213,14 +300,25 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
case _PACK_SELECT_KEY( U8, U8):
case _PACK_SELECT_KEY( I8, I8):
case _PACK_SELECT_KEY( I16, I16):
case _PACK_SELECT_KEY( F16, F16):
{
gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift);
multAndoutZP0[0] = (uint32_t)(M0);
multAndoutZP0[1] = (uint32_t)((dstZP << postShift) - src0ZP * M0);
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift );
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_ExLo_2x8, postShift );
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_ExHi_2x8, postShift );
status = vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift_ExLo_2x8", &uniU8MulAndPostShift_ExLo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift_ExHi_2x8", &uniU8MulAndPostShift_ExHi_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniDepth2SpaceF16Blk2_lo_2x8", &uniDepth2SpaceF16Blk2_lo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniDepth2SpaceF16Blk2_hi_2x8", &uniDepth2SpaceF16Blk2_hi_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
@@ -256,7 +354,8 @@ static vsi_status _query_kernel
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel,
const vsi_nn_kernel_param_t * params
const vsi_nn_kernel_param_t * params,
int32_t blk_flg
)
{
vsi_status status = VSI_FAILURE;
@@ -268,16 +367,16 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_DEPTH2SPACE_CRD_KEY( input0_dtype, output_dtype, 0 );
key = HASH_DEPTH2SPACE_CRD_KEY( input0_dtype, output_dtype, blk_flg );
for( i = 0; i < _cnt_of_array(depth2space_crd_map); i ++ )
{
if( depth2space_crd_map[i].key == key )
if ( depth2space_crd_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(depth2space_crd_map) )
if ( i < _cnt_of_array(depth2space_crd_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", depth2space_crd_map[i].function_name );
kernel->info.parameters = _depth2space_crd_kernel_param_def;
@@ -310,18 +409,19 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t tmp_params[_DEPTH2SPACE_CRD_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t blk_flg = block_size == 2 ? 1 : 0;
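/* blk_flg routes block_size == 2 to the _blk2 kernel variants registered
 * above; every other block size keeps the generic kernels. */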
if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params );
if( VSI_SUCCESS == status)
status = _query_kernel( inputs, outputs, kernel, params, blk_flg);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
vsi_nn_kernel_node_pack_io( tmp_params, _DEPTH2SPACE_CRD_PARAM_NUM, inputs, 1, outputs, 1 );
tmp_params[2] = vsi_nn_kernel_scalar_create( graph, I32, &block_size );

View File

@@ -717,12 +717,13 @@ static vsi_nn_kernel_node_t _setup
int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" );
int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" );
int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" );
int32_t batch = inputs[0]->attr.size[2];
_internal_kernel_size_e ks = KN;
if (!((VSI_NN_TYPE_UINT8 == inputs[0]->attr.dtype.vx_type)
if ( (!((VSI_NN_TYPE_UINT8 == inputs[0]->attr.dtype.vx_type)
&& (VSI_NN_TYPE_UINT8 == inputs[1]->attr.dtype.vx_type)
&& (NULL == inputs[2] || VSI_NN_TYPE_INT32 == inputs[2]->attr.dtype.vx_type)
&& (VSI_NN_TYPE_UINT8 == outputs[0]->attr.dtype.vx_type)))
&& (VSI_NN_TYPE_UINT8 == outputs[0]->attr.dtype.vx_type))) || batch > 1)
{
return NULL;
}
@@ -769,18 +770,27 @@ static vsi_nn_kernel_node_t _setup
status = _query_kernel( kernel, temp_tensor, outputs, dilation, ks);
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
if( pad_front != 0 && pad_end != 0)
if ( pad_front != 0 && pad_end != 0)
{
// Set default border mode.
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U8 = 0;
border.constant_value.U16 = 0;
if (VSI_NN_TYPE_UINT8 == inputs[0]->attr.dtype.vx_type &&
VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC == inputs[0]->attr.dtype.qnt_type)
{
border.constant_value.U8 = (uint8_t)inputs[0]->attr.dtype.zero_point;
}
else
{
border.constant_value.U8 = 0;
border.constant_value.U16 = 0;
}
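/* For asymmetric U8 input the padded samples must hold the zero point so
 * that they dequantize to 0.0f; padding with a literal 0 would feed the
 * value -zero_point * scale into the convolution. */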
status |= vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
}

View File

@@ -48,6 +48,7 @@ typedef enum
UNARY_NEG,
UNARY_HSIGMOID,
UNARY_MISH,
UNARY_ROUND,
} unary_type_e;
/*
@@ -82,6 +83,7 @@ typedef enum
#define NEG_OPERATION neg
#define HSIGMOID_OPERATION hard_sigmoid
#define MISH_OPERATION mish
#define ROUND_OPERATION round
static const struct {
uint32_t key;
@@ -248,6 +250,30 @@ static const struct {
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, I8 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, F16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, BF16, BF16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, F16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, I16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, U8 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, I8 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I16, I16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I16, F16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, U8, U8 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, U8, F16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I8, I8 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I8, F16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, BF16, BF16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, U8 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I8 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, I16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, F16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, F16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, I8 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, F16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16 , KERNEL_SOURCE_2D)
};
#undef SIN_OPERATION
@@ -257,6 +283,7 @@ static const struct {
#undef NEG_OPERATION
#undef HSIGMOID_OPERATION
#undef MISH_OPERATION
#undef ROUND_OPERATION
/*
* Kernel params
@@ -375,6 +402,7 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
case _PACK_SELECT_KEY( UNARY_NEG, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_HSIGMOID, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_MISH, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_ROUND, BF16, BF16 ):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
@@ -653,6 +681,7 @@ REGISTER_ELTWISE_UNARY_BACKEND_EVIS( elu, UNARY_ELU )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( neg, UNARY_NEG )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_sigmoid, UNARY_HSIGMOID )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( mish, UNARY_MISH )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( round, UNARY_ROUND )
__END_DECLS

View File

@@ -0,0 +1,428 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define HASH_UNARY_KEY(_input_type, _output_type, _image_2d) \
( (_input_type << 12) | (_output_type << 4) | (_image_2d))
#define KERNEL_SOURCE "erf",
#define HASH_UNARY_SH_KERNEL_NAME( SRC_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.erf_"#SRC_TYPE"to"#DST_TYPE)
#define TENSOR_UNARY_KERNELS(SRC_TYPE, OUT_TYPE) \
{ HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 0), \
HASH_UNARY_SH_KERNEL_NAME(SRC_TYPE, OUT_TYPE), \
KERNEL_SOURCE },
#define HASH_UNARY_SH_KERNEL_2D_NAME(SRC_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.erf_"#SRC_TYPE"to"#DST_TYPE"_2D")
#define TENSOR_UNARY_KERNELS_2D(SRC_TYPE, OUT_TYPE) \
{ HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 1), \
HASH_UNARY_SH_KERNEL_2D_NAME(SRC_TYPE, OUT_TYPE), \
KERNEL_SOURCE },
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _erf_kernel_map[] =
{
// Register kernel here
TENSOR_UNARY_KERNELS(F16, F16 )
TENSOR_UNARY_KERNELS(F16, I16 )
TENSOR_UNARY_KERNELS(F16, U8 )
TENSOR_UNARY_KERNELS(F16, I8 )
TENSOR_UNARY_KERNELS(I16, I16 )
TENSOR_UNARY_KERNELS(I16, F16 )
TENSOR_UNARY_KERNELS(U8, U8 )
TENSOR_UNARY_KERNELS(U8, F16 )
TENSOR_UNARY_KERNELS(I8, I8 )
TENSOR_UNARY_KERNELS(I8, F16 )
TENSOR_UNARY_KERNELS(BF16, BF16)
TENSOR_UNARY_KERNELS_2D(F16, F16 )
TENSOR_UNARY_KERNELS_2D(F16, I16 )
TENSOR_UNARY_KERNELS_2D(F16, U8 )
TENSOR_UNARY_KERNELS_2D(F16, I8 )
TENSOR_UNARY_KERNELS_2D(I16, I16 )
TENSOR_UNARY_KERNELS_2D(I16, F16 )
TENSOR_UNARY_KERNELS_2D(U8, U8 )
TENSOR_UNARY_KERNELS_2D(U8, F16 )
TENSOR_UNARY_KERNELS_2D(I8, I8 )
TENSOR_UNARY_KERNELS_2D(I8, F16 )
TENSOR_UNARY_KERNELS_2D(BF16, BF16)
};
/*
* Kernel params
*/
static vx_param_description_t _erf_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
};
#define _ERF_PARAM_NUM _cnt_of_array( _erf_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_erf_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL };
vsi_int_array_t * out_shape = NULL;
float inputScale = 1.0f;
float inputTail = 0;
float outputScale = 1.0f;
float outputZP = 0;
uint32_t pack_key;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
out_shape = attr[1]->shape;
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[0]->dfp.fl;
if (fl > 0)
{
inputScale = 1.0f / (float) ((int64_t)1 << fl);
}
else
{
inputScale = (float)((int64_t)1 << -fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
inputScale = attr[0]->asymm.scale;
inputTail = 0 - attr[0]->asymm.zero_point * inputScale;
}
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
int32_t fl = attr[1]->dfp.fl;
if (fl > 0)
{
outputScale = (float)((int64_t)1 << fl);
}
else
{
outputScale = (float)1.0f / (float) ((int64_t)1 << -fl);
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
outputScale = (float)1.0f / attr[1]->asymm.scale;
outputZP = (float)attr[1]->asymm.zero_point;
}
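/* These factors give an affine mapping in both directions:
 * x = q_in * inputScale + inputTail dequantizes the input, and
 * q_out = erf(x) * outputScale + outputZP requantizes the result
 * (outputScale already holds the reciprocal of the output tensor's scale). */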
#define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \
( ( IN_TYPE << 16) | ( OUT_TYPE << 8))
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype );
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
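/* Four elements per work-item along x, rounded up to a multiple of 4 by
 * gpu_align_p2; e.g. a row of 10 elements launches
 * gpu_align_p2((10 + 3) / 4, 4) = gpu_align_p2(3, 4) = 4 work-items. */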
switch ( pack_key )
{
case _PACK_SELECT_KEY( BF16, BF16 ):
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
default:
{
gpu_dp_inst_t uniExtractHalf8_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x06040200, 0x06040200, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00,
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractInteger_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniDatatoFp32Part0_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000100, // AccumType, ConstantType, and PostShift
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param( node,
"uniDatatoFp32Part0_4x4", &uniDatatoFp32Part0_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"inputScale", &inputScale );
status |= vsi_nn_kernel_gpu_add_param( node,
"inputTail", &inputTail );
status |= vsi_nn_kernel_gpu_add_param( node,
"outputScale", &outputScale );
status |= vsi_nn_kernel_gpu_add_param( node,
"outputZP", &outputZP );
if (attr[1]->dtype == F16)
{
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtract8Data_2x8", &uniExtractHalf8_2x8 );
}
else
{
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtract8Data_2x8", &uniExtractInteger_2x8 );
}
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
}
#undef _PACK_SELECT_KEY
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(attr[0]);
SAFE_FREE_TENSOR_ATTR(attr[1]);
#undef SAFE_FREE_TENSOR_ATTR
return status;
} /* _erf_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool image_2d
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _erf_kernel_map;
size_t kernel_map_size = _cnt_of_array( _erf_kernel_map );
vx_param_description_t * param_def = _erf_kernel_param_def;
vx_kernel_initialize_f initializer = _erf_initializer;
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_UNARY_KEY( in_dtype, out_dtype, image_2d );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _erf_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_ERF_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_nn_tensor_t* rs_tensors[2] = { NULL };
int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
int32_t new_rank = 0;
vsi_bool image_2d = FALSE;
vsi_bool ret = FALSE;
ret = vsi_nn_kernel_optimize_element_shape(
(int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num,
shape, &new_rank );
if ( ret )
{
rs_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], (uint32_t*)shape, new_rank );
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], (uint32_t*)shape, new_rank );
}
if ( NULL == rs_tensors[0] || NULL == rs_tensors[1] )
{
goto OnError;
}
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[0]->attr.size,
rs_tensors[0]->attr.dim_num ) )
{
goto OnError;
}
image_2d = (rs_tensors[0]->attr.dim_num == 2 || rs_tensors[0]->attr.size[2] == 1);
status = _query_kernel( kernel, inputs, outputs, image_2d );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _ERF_PARAM_NUM,
rs_tensors, 1, &rs_tensors[1], 1 );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _ERF_PARAM_NUM );
}
}
OnError:
if (rs_tensors[0])
{
vsi_nn_ReleaseTensor( &rs_tensors[0] );
}
if (rs_tensors[1])
{
vsi_nn_ReleaseTensor( &rs_tensors[1] );
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( erf, _setup )

View File

@@ -64,39 +64,60 @@ __BEGIN_DECLS
#define VX_KERNEL_NAME_GATHER_AXIS0_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_U8toF16_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_F16toU8_axis0")
#define VX_KERNEL_NAME_GATHER_ARRAY_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_U8toU8_array")
#define VX_KERNEL_NAME_GATHER_ARRAY_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_I8toI8_array")
#define VX_KERNEL_NAME_GATHER_ARRAY_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_I16toI16_array")
#define VX_KERNEL_NAME_GATHER_ARRAY_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_F16toF16_array")
#define VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_U8toU8_axis0_array")
#define VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_I8toI8_axis0_array")
#define VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_I16toI16_axis0_array")
#define VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_F16toF16_axis0_array")
#define KERNEL_SOURCE_1 "gather"
#define KERNEL_SOURCE_2 "gather_mix"
#define KERNEL_SOURCE_3 "gather_array"
// Add kernel hashtable here
#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _is_axis0) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_is_axis0))
#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _is_axis0, _is_max) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_is_axis0 << 4) | (_is_max))
#define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0), \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0), \
VX_KERNEL_NAME_GATHER_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_GATHER_AXIS0_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1), \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 0), \
VX_KERNEL_NAME_GATHER_AXIS0_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_GATHER_ARRAY_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 1), \
VX_KERNEL_NAME_GATHER_ARRAY_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_GATHER_AXIS0_ARRAY_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 1), \
VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
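/* Key layout (derived from HASH_GATHER_KEY above): input dtype in bits 31..24,
 * index dtype in bits 23..16, output dtype in bits 15..8, the axis0 flag in
 * bit 4 and the new "array" flag in bit 0. For example,
 * HASH_GATHER_KEY( U8, I32, U8, 0, 1 ) maps to
 * VX_KERNEL_NAME_GATHER_ARRAY_U8TOU8 built from the "gather_array" source. */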
static const struct {
uint32_t key;
char* function_name;
const char* source_name;
} gather_map[] =
{
TENSOR_GATHER_KERNELS(U8, I32, U8, KERNEL_SOURCE_1)
TENSOR_GATHER_KERNELS(I8, I32, I8, KERNEL_SOURCE_1)
TENSOR_GATHER_KERNELS(I16, I32, I16, KERNEL_SOURCE_1)
TENSOR_GATHER_KERNELS(F16, I32, F16, KERNEL_SOURCE_1)
TENSOR_GATHER_KERNELS(I8, I32, F16, KERNEL_SOURCE_2)
TENSOR_GATHER_KERNELS(I16, I32, F16, KERNEL_SOURCE_2)
TENSOR_GATHER_KERNELS(F16, I32, I8, KERNEL_SOURCE_2)
TENSOR_GATHER_KERNELS(F16, I32, I16, KERNEL_SOURCE_2)
TENSOR_GATHER_KERNELS(U8, I32, F16, KERNEL_SOURCE_2)
TENSOR_GATHER_KERNELS(F16, I32, U8, KERNEL_SOURCE_2)
TENSOR_GATHER_KERNELS(U8, I32, U8, KERNEL_SOURCE_1)
TENSOR_GATHER_KERNELS(I8, I32, I8, KERNEL_SOURCE_1)
TENSOR_GATHER_KERNELS(I16, I32, I16, KERNEL_SOURCE_1)
TENSOR_GATHER_KERNELS(F16, I32, F16, KERNEL_SOURCE_1)
TENSOR_GATHER_KERNELS(I8, I32, F16, KERNEL_SOURCE_2)
TENSOR_GATHER_KERNELS(I16, I32, F16, KERNEL_SOURCE_2)
TENSOR_GATHER_KERNELS(F16, I32, I8, KERNEL_SOURCE_2)
TENSOR_GATHER_KERNELS(F16, I32, I16, KERNEL_SOURCE_2)
TENSOR_GATHER_KERNELS(U8, I32, F16, KERNEL_SOURCE_2)
TENSOR_GATHER_KERNELS(F16, I32, U8, KERNEL_SOURCE_2)
TENSOR_GATHER_AXIS0_KERNELS(U8, I32, U8, KERNEL_SOURCE_1)
TENSOR_GATHER_AXIS0_KERNELS(I8, I32, I8, KERNEL_SOURCE_1)
TENSOR_GATHER_AXIS0_KERNELS(I16, I32, I16, KERNEL_SOURCE_1)
@ -107,6 +128,14 @@ static const struct {
TENSOR_GATHER_AXIS0_KERNELS(F16, I32, I16, KERNEL_SOURCE_2)
TENSOR_GATHER_AXIS0_KERNELS(U8, I32, F16, KERNEL_SOURCE_2)
TENSOR_GATHER_AXIS0_KERNELS(F16, I32, U8, KERNEL_SOURCE_2)
TENSOR_GATHER_ARRAY_KERNELS(U8, I32, U8, KERNEL_SOURCE_3)
TENSOR_GATHER_ARRAY_KERNELS(I8, I32, I8, KERNEL_SOURCE_3)
TENSOR_GATHER_ARRAY_KERNELS(I16, I32, I16, KERNEL_SOURCE_3)
TENSOR_GATHER_ARRAY_KERNELS(F16, I32, F16, KERNEL_SOURCE_3)
TENSOR_GATHER_AXIS0_ARRAY_KERNELS(U8, I32, U8, KERNEL_SOURCE_3)
TENSOR_GATHER_AXIS0_ARRAY_KERNELS(I8, I32, I8, KERNEL_SOURCE_3)
TENSOR_GATHER_AXIS0_ARRAY_KERNELS(I16, I32, I16, KERNEL_SOURCE_3)
TENSOR_GATHER_AXIS0_ARRAY_KERNELS(F16, I32, F16, KERNEL_SOURCE_3)
};
/*
@ -129,7 +158,8 @@ static vsi_status get_gather_tensor_reshape_size
vsi_nn_tensor_t ** inputs,
int32_t sizes[VSI_NN_MAX_DIM_NUM],
uint32_t block_size,
uint32_t idxFlg
uint32_t idxFlg,
int32_t* arrayFlg
)
{
vsi_status status = VSI_FAILURE;
@ -157,12 +187,13 @@ static vsi_status get_gather_tensor_reshape_size
}
else
{
if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
if ((elementCnt / block_size) > VSI_NN_MAX_IMAGE_WIDTH)
{
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
status = VSI_SUCCESS;
arrayFlg[0] = 1;
}
status = VSI_SUCCESS;
}
#undef VSI_NN_MAX_IMAGE_WIDTH
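/* When elementCnt / block_size no longer fits the 2D image row limit
 * (VSI_NN_MAX_IMAGE_WIDTH), the reshape keeps the same 2D sizes but raises
 * arrayFlg so that _query_kernel falls back to the "*_array" kernels, which
 * presumably address the tensor as a linear buffer instead of an image. */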
@ -535,10 +566,11 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( I16, I16):
case _PACK_SELECT_KEY( I8, I8):
case _PACK_SELECT_KEY( U8, U8):
case _PACK_SELECT_KEY( F16, F16):
case _PACK_SELECT_KEY( I16, I16):
case _PACK_SELECT_KEY( I8, I8):
case _PACK_SELECT_KEY( U8, U8):
case _PACK_SELECT_KEY( F16, F16):
case _PACK_SELECT_KEY( BF16, BF16):
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniExtraCopyDpKeepinEvis_2x8", &uniExtraCopyDpKeepinEvis_2x8 );
@ -583,7 +615,8 @@ static vsi_status _query_kernel
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel,
const vsi_nn_kernel_param_t * params,
int32_t axis
int32_t axis,
int32_t is_array
)
{
vsi_status status = VSI_FAILURE;
@ -595,7 +628,16 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis );
if (input0_dtype == BF16)
{
input0_dtype = F16;
}
if (output_dtype == BF16)
{
output_dtype = F16;
}
key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis, is_array);
for( i = 0; i < _cnt_of_array(gather_map); i ++ )
{
@ -640,6 +682,7 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_t * kernel
)
{
#define VSI_NN_MAX_BLOCK_SIZE (65536)
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t tmp_params[_GATHER_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
@ -649,21 +692,23 @@ static vsi_nn_kernel_node_t _setup
int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" );
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
int32_t axis0_flg = 0;
int32_t is_array = block_size > VSI_NN_MAX_BLOCK_SIZE ? 1 : 0;
if (axis == 0)
{
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, 0);
status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1);
status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], 0);
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, 0, &is_array);
status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1, &is_array);
status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], 0, &is_array);
axis0_flg = 1;
}
else
{
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0);
status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1);
status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0);
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0, &is_array);
status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1, &is_array);
status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &is_array);
axis0_flg = 0;
}
#undef VSI_NN_MAX_BLOCK_SIZE
if (status != VSI_SUCCESS)
{
return NULL;
@ -675,7 +720,7 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params, axis0_flg);
status = _query_kernel( inputs, outputs, kernel, params, axis0_flg, is_array);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );

View File

@ -387,6 +387,15 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (input0_dtype == BF16)
{
input0_dtype = F16;
}
if (output_dtype == BF16)
{
output_dtype = F16;
}
if(coord_dim == 1)
{
coord_type = _1D;

File diff suppressed because it is too large

View File

@ -53,6 +53,10 @@ typedef enum
#define KERNEL_SOURCE_2 "instance_normalization_u8"
#define KERNEL_SOURCE_3 "instance_normalization_i16"
#define KERNEL_SOURCE_4 "instance_normalization_f16"
#define KERNEL_SOURCE_5 "instance_normalization_u8_f16"
#define KERNEL_SOURCE_6 "instance_normalization_scale_f32"
#define KERNEL_SOURCE_7 "instance_normalization_scale_f32_f16"
#define KERNEL_SOURCE_8 "instance_normalization_scale_f32_bf16"
#define HASH_INSTANCENORM_MEAN_VARI_SH_KERNEL_NAME(SRC0_TYPE) \
CVIVANTE_NAMESPACE("evis.instance_norm_meanvari_"#SRC0_TYPE)
@ -66,6 +70,12 @@ typedef enum
#define HASH_INSTANCENORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D")
#define HASH_INSTANCENORM_SCALE_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"F32to"#DST_TYPE)
#define HASH_INSTANCENORM_SCALE_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"F32to"#DST_TYPE"_2D")
// Add kernel hashtable here
// mean vari
#define HASH_INSTANCENORM_MEAN_VARI_KEY(_input0_type, _output_type, _reshape_flag) \
@ -82,19 +92,29 @@ typedef enum
SOURCE },
// normalization
#define HASH_INSTANCENORM_KEY(_input0_type, _output_type, _reshape_flag) \
((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8))
#define HASH_INSTANCENORM_KEY(_input0_type, _input1_type, _output_type, _reshape_flag) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_reshape_flag << 4))
#define TENSOR_INSTANCENORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_INSTANCENORM_KEY(IN0_TYPE, OUT_TYPE, 0), \
{ HASH_INSTANCENORM_KEY(IN0_TYPE, F16, OUT_TYPE, 0), \
HASH_INSTANCENORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
#define TENSOR_INSTANCENORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_INSTANCENORM_KEY(IN0_TYPE, OUT_TYPE, 1), \
{ HASH_INSTANCENORM_KEY(IN0_TYPE, F16, OUT_TYPE, 1), \
HASH_INSTANCENORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
#define TENSOR_INSTANCENORM_SCALE_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_INSTANCENORM_KEY(IN0_TYPE, F32, OUT_TYPE, 0), \
HASH_INSTANCENORM_SCALE_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
#define TENSOR_INSTANCENORM_SCALE_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_INSTANCENORM_KEY(IN0_TYPE, F32, OUT_TYPE, 1), \
HASH_INSTANCENORM_SCALE_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
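/* Key layout (from HASH_INSTANCENORM_KEY): input dtype in bits 31..24, scale
 * (gamma) dtype in bits 23..16, output dtype in bits 15..8 and the reshape
 * flag in bit 4. An F32 gamma therefore selects the new
 * instance_norm_*F32to* shaders, while an F16 gamma keeps the original ones. */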
typedef struct
{
uint32_t key;
@ -113,6 +133,8 @@ static const _kernel_map_type _instancenorm_mean_vari_kernel_map[] =
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( I16, F32, KERNEL_SOURCE_3 )
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( F16, F32, KERNEL_SOURCE_4 )
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( F16, F32, KERNEL_SOURCE_4 )
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( BF16, F32, KERNEL_SOURCE_8 )
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( BF16, F32, KERNEL_SOURCE_8 )
};
static const _kernel_map_type _instancenorm_kernel_map[] =
@ -125,8 +147,8 @@ static const _kernel_map_type _instancenorm_kernel_map[] =
TENSOR_INSTANCENORM_KERNELS( U8, U8, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_KERNELS_2D( U8, U8, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_KERNELS( U8, F16, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_2 )
TENSOR_INSTANCENORM_KERNELS( U8, F16, KERNEL_SOURCE_5 )
TENSOR_INSTANCENORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_5 )
TENSOR_INSTANCENORM_KERNELS( I16, I16, KERNEL_SOURCE_3 )
TENSOR_INSTANCENORM_KERNELS_2D( I16, I16, KERNEL_SOURCE_3 )
@ -135,6 +157,21 @@ static const _kernel_map_type _instancenorm_kernel_map[] =
TENSOR_INSTANCENORM_KERNELS( F16, F16, KERNEL_SOURCE_4 )
TENSOR_INSTANCENORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_4 )
TENSOR_INSTANCENORM_SCALE_KERNELS( U8, U8, KERNEL_SOURCE_6 )
TENSOR_INSTANCENORM_SCALE_KERNELS_2D( U8, U8, KERNEL_SOURCE_6 )
TENSOR_INSTANCENORM_SCALE_KERNELS( I8, I8, KERNEL_SOURCE_6 )
TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I8, I8, KERNEL_SOURCE_6 )
TENSOR_INSTANCENORM_SCALE_KERNELS( I16, I16, KERNEL_SOURCE_6 )
TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I16, I16, KERNEL_SOURCE_6 )
TENSOR_INSTANCENORM_SCALE_KERNELS( F16, F16, KERNEL_SOURCE_7 )
TENSOR_INSTANCENORM_SCALE_KERNELS_2D( F16, F16, KERNEL_SOURCE_7 )
TENSOR_INSTANCENORM_SCALE_KERNELS( BF16, BF16, KERNEL_SOURCE_8 )
TENSOR_INSTANCENORM_SCALE_KERNELS_2D( BF16, BF16, KERNEL_SOURCE_8 )
};
/*
@ -254,7 +291,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
{
shaderParam.global_size[0] = (width + 255) / 256 * 16;
}
else if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == BF16)
{
shaderParam.global_size[0] = (width + 127) / 128 * 16;
}
@ -350,6 +387,32 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer)
status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
else if (attr[0]->dtype == BF16)
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
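/* BF16 is the upper half of an IEEE-754 float32, so converting to F32 only
 * needs each 16-bit value moved into the high 16 bits of a 32-bit lane. The
 * Part0/Part1 instructions above appear to expand the low and high four lanes
 * of a BF16 vector respectively; the sum and sum-of-squares math then runs in
 * float32 just as in the F16 path. */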
status = vsi_nn_kernel_gpu_add_param(node, "width", &width);
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
@ -385,15 +448,13 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
{0, 0, 0}, // localWorkSize: local group size in thread
{0, 0, 0}}; // globalWorkSize: image size in thread
vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL};
vsi_nn_kernel_tensor_attr_t* attr[4] = {NULL, NULL};
vsi_int_array_t * input_shape = NULL;
float scaleIn = 1.0f;
float scaleOut = 1.0f;
float reScaleOut_u8 = 1.0f;
float scale_inOut = 1.0f;
int32_t output_zp = 0;
int32_t input_zp = 0;
float in_scale_fl = 1, out_scale_fl = 1, inOut_fl_scale = 1;
float dimRatio = 0;
vx_uint32 group_num = 0;
vx_int32 height = 0, width = 0, chn = 0;
@ -401,10 +462,12 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError );
attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] );
attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] );
CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError );
attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] );
CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &rsFlg);
CHECK_STATUS_FAIL_GOTO(status, OnError );
@ -420,43 +483,39 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
{
if (attr[0]->dfp.fl > 0)
{
in_scale_fl = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl)));
}
else
{
in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl));
}
input_zp = 0;
}
if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
if (attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM)
{
output_zp = attr[2]->asymm.zero_point;
scaleOut = attr[2]->asymm.scale;
reScaleOut_u8 = 1 / scaleOut;
output_zp = attr[3]->asymm.zero_point;
scaleOut = attr[3]->asymm.scale;
scaleOut = 1 / scaleOut;
}
else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP)
else if (attr[3]->quant == VSI_NN_KERNEL_QUANT_DFP)
{
if (attr[2]->dfp.fl > 0)
if (attr[3]->dfp.fl > 0)
{
out_scale_fl = (float)((int64_t)1 << attr[2]->dfp.fl);
scaleOut = (float)((int64_t)1 << attr[3]->dfp.fl);
}
else
{
out_scale_fl = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl));
scaleOut = (1.0f / (float)((int64_t)1 << -attr[3]->dfp.fl));
}
output_zp = 0;
}
if ((attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP)
&& (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP))
{
inOut_fl_scale = in_scale_fl * out_scale_fl;
}
scale_inOut = scaleIn * scaleOut;
width = input_shape->data[0];
height = input_shape->data[1];
chn = attr[1]->shape->data[1];
chn = attr[2]->shape->data[1];
if (rsFlg)
{
height = height / chn;
@ -467,7 +526,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
group_num = (width + 255) / 256;
shaderParam.global_scale[0] = 16;
if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == BF16)
{
shaderParam.global_scale[0] = 8;
group_num = (width + 127) / 128;
@ -630,23 +689,52 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant
}, GPU_DP_TYPE_16 };
uint32_t pack_key = 0;
#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \
(IN0_TYPE | (OUT_TYPE << 8))
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype );
uint32_t pack_key = 0;
#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \
(IN0_TYPE | (IN1_TYPE << 8) | (OUT_TYPE << 16))
pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, attr[3]->dtype );
status = vsi_nn_kernel_gpu_add_param(node, "height", &height);
status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio);
status |= vsi_nn_kernel_gpu_add_param(node, "group_num", &group_num);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", &uniConvertHalfToFp16_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
switch( pack_key )
{
case _PACK_SELECT_KEY( I8, I8 ):
case _PACK_SELECT_KEY( I8, F16 ):
case _PACK_SELECT_KEY( I8, F16, I8 ):
case _PACK_SELECT_KEY( I8, F16, F16 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8",
&uniConvertInt32toUint8_2x8);
@ -658,15 +746,42 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
&uniConvertTrdUint8Fp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFthInt8Fp32_4x4",
&uniConvertFthUint8Fp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8",
&uniConvertHalfToFp16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
&UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &scaleIn);
status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &out_scale_fl);
status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &inOut_fl_scale);
status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &scaleOut);
status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &scale_inOut);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U8, U8 ):
case _PACK_SELECT_KEY( U8, F16 ):
case _PACK_SELECT_KEY( U8, F16, U8 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8",
&uniConvertInt32toUint8_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4",
&uniConvert1stUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4",
&uniConvert2ndUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4",
&uniConvert3rdUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4",
&uniConvert4thUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
&UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut);
status |= vsi_nn_kernel_gpu_add_param(node, "scale_inOut", &scale_inOut);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U8, F32, U8 ):
case _PACK_SELECT_KEY( I8, F32, I8 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8",
&uniConvertInt32toUint8_2x8);
@ -679,37 +794,85 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer)
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4",
&uniConvert4thUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &reScaleOut_u8);
scale_inOut = reScaleOut_u8 * scaleIn;
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut);
status |= vsi_nn_kernel_gpu_add_param(node, "scale_inOut", &scale_inOut);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( I16, I16 ):
case _PACK_SELECT_KEY( I16, F16 ):
case _PACK_SELECT_KEY( U8, F16, F16 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4",
&uniConvert1stUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4",
&uniConvert2ndUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4",
&uniConvert3rdUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4",
&uniConvert4thUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8",
&uniConvertHalfToFp16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
&UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( I16, F16, I16 ):
case _PACK_SELECT_KEY( I16, F16, F16 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Fst_4x4",
&uniConvertInt16Fp32Fst_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Secd_4x4",
&uniConvertInt16Fp32Secd_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl);
status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &scaleIn);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toInt16_2x8",
&uniConvertInt32toInt16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &out_scale_fl);
status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &inOut_fl_scale);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8",
&uniConvertHalfToFp16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
&UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &scaleOut);
status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &scale_inOut);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( F16, F16 ):
case _PACK_SELECT_KEY( I16, F32, I16 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Fst_4x4",
&uniConvertInt16Fp32Fst_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Secd_4x4",
&uniConvertInt16Fp32Secd_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toInt16_2x8",
&uniConvertInt32toInt16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &scaleOut);
status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &scale_inOut);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( F16, F16, F16 ):
case _PACK_SELECT_KEY( F16, F32, F16 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt16Fp32_4x4",
&uniConvertEndInt16Fp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8",
&uniConvertHalfToFp16_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
&UniFP16toFP32Lo4_dp4x4);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, F32, BF16 ):
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
@ -736,6 +899,11 @@ OnError:
vsi_nn_kernel_tensor_attr_release( &attr[2] );
attr[2] = NULL;
}
if (attr[3])
{
vsi_nn_kernel_tensor_attr_release( &attr[3] );
attr[3] = NULL;
}
return status;
}
@ -826,11 +994,13 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_t tmp_node = NULL;
vsi_nn_kernel_node_t node = NULL;
vsi_nn_kernel_dtype_e in0_dtype = U8;
vsi_nn_kernel_dtype_e in1_dtype = F16;
vsi_nn_kernel_dtype_e out_dtype = U8;
vsi_nn_tensor_attr_t attr;
vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL };
vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL };
vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL;
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 };
uint32_t hashkey = 0;
int32_t i = 0;
@ -851,29 +1021,12 @@ static vsi_nn_kernel_node_t _setup
ikernels[i]->unique_id = kernel->unique_id;
}
memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) );
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
attr.is_const = FALSE;
attr.vtl = TRUE;
attr.size[0] = ((inputs[0]->attr.size[0] + 255) / 256) * 4;
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16
|| inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16)
{
attr.size[0] = ((inputs[0]->attr.size[0] + 127) / 128) * 4;
}
attr.size[1] = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1;
attr.size[2] = 1;
attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
attr.dim_num = 4;
tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr );
in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
in1_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
hashkeys[MEAN_VARI_INDEX]= HASH_INSTANCENORM_MEAN_VARI_KEY( in0_dtype, F32, reshape_flg );
hashkey = HASH_INSTANCENORM_KEY( in0_dtype, out_dtype, reshape_flg );
hashkey = HASH_INSTANCENORM_KEY( in0_dtype, in1_dtype, out_dtype, reshape_flg );
status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI );
if ( VSI_SUCCESS != status )
@ -888,22 +1041,54 @@ static vsi_nn_kernel_node_t _setup
if (reshape_flg)
{
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[0]->attr.size[0];
shape[1] = inputs[0]->attr.size[1] * inputs[0]->attr.size[2];
shape[2] = 1;
shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 );
shape[0] = outputs[0]->attr.size[0];
shape[1] = outputs[0]->attr.size[1] * outputs[0]->attr.size[2];
shape[2] = 1;
shape[3] = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1;
rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 );
}
else if (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] < GPU_TENSOR_MAX_WIDTH)
{
shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1];
shape[1] = 1;
shape[2] = inputs[0]->attr.size[2];
shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 );
rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 );
}
else if (inputs[0]->attr.size[0] < inputs[0]->attr.size[1])
{
shape[0] = inputs[0]->attr.size[1];
shape[1] = inputs[0]->attr.size[0];
shape[2] = inputs[0]->attr.size[2];
shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 );
rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 );
}
else
{
shape[0] = inputs[0]->attr.size[0];
}
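/* Reshape strategy: merge width*height into a single row when it still fits
 * the image width limit; otherwise, if width < height, swap the first two
 * dimensions so the longer extent sits on the x axis (per-channel mean and
 * variance are unaffected by this swap); as a last resort keep the original
 * width. shape[0] is then used below to size the mean_vari accumulator. */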
memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) );
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
attr.is_const = FALSE;
attr.vtl = TRUE;
attr.size[0] = ((shape[0] + 255) / 256) * 4;
if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16
|| inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16)
{
attr.size[0] = ((shape[0] + 127) / 128) * 4;
}
attr.size[1] = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1;
attr.size[2] = 1;
attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
attr.dim_num = 4;
tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr );
if (inputs[1]->attr.dim_num < 2)
{
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[1]->attr.size[0];
shape[1] = 1;
shape[2] = 1;
@ -912,7 +1097,6 @@ static vsi_nn_kernel_node_t _setup
}
if (inputs[2]->attr.dim_num < 2)
{
int32_t shape[VSI_NN_MAX_DIM_NUM] = {0};
shape[0] = inputs[2]->attr.size[0];
shape[1] = 1;
shape[2] = 1;
@ -925,7 +1109,7 @@ static vsi_nn_kernel_node_t _setup
if (tmp_node)
{
uint32_t index = 0;
if (reshape_flg)
if (rs_input)
{
mean_vari_node_params[index++] = rs_input;
vsi_nn_kernel_node_pack_io( &mean_vari_node_params[index],
@ -967,7 +1151,7 @@ static vsi_nn_kernel_node_t _setup
if (node)
{
uint32_t index = 0;
if (reshape_flg)
if (rs_input)
{
node_params[index++] = rs_input;
}
@ -992,7 +1176,7 @@ static vsi_nn_kernel_node_t _setup
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t;
}
node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t;
if (reshape_flg)
if (rs_output)
{
node_params[index++] = rs_output;
}
@ -1034,9 +1218,12 @@ final:
{
vsi_nn_kernel_tensor_release( &rs_gamma );
}
if (reshape_flg)
if (rs_input)
{
vsi_nn_kernel_tensor_release( &rs_input );
}
if (rs_output)
{
vsi_nn_kernel_tensor_release( &rs_output );
}
for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ )

View File

@ -60,6 +60,9 @@ __BEGIN_DECLS
#define KERNEL_SOURCE_5 "layer_normalization_wh_f16"
#define KERNEL_SOURCE_6 "layer_normalization_i16"
#define KERNEL_SOURCE_7 "layer_normalization_wh_i16"
#define KERNEL_SOURCE_8 "layer_normalization_scale_f32"
#define KERNEL_SOURCE_9 "layer_normalization_scale_f32_2d"
#define KERNEL_SOURCE_10 "layer_normalization_scale_f32_bf16"
#define HASH_LAYERNORM_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \
@ -68,20 +71,36 @@ __BEGIN_DECLS
#define HASH_LAYERNORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D")
#define HASH_LAYERNORM_SH_KERNEL_SCALE_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"F32to"#DST_TYPE)
#define HASH_LAYERNORM_SH_KERNEL_SCALE_2D_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"F32to"#DST_TYPE"_2D")
// normalization
#define HASH_LAYERNORM_KEY(_input0_type, _output_type, _reshape_flag) \
((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8))
#define HASH_LAYERNORM_KEY(_input0_type, _input2_type, _output_type, _reshape_flag) \
((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | _reshape_flag)
#define TENSOR_LAYERNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_KERNEL), \
{ HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_KERNEL), \
HASH_LAYERNORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
#define TENSOR_LAYERNORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_2D_KERNEL), \
{ HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_2D_KERNEL), \
HASH_LAYERNORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
#define TENSOR_LAYERNORM_SCALE_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_LAYERNORM_KEY(IN0_TYPE, F32, OUT_TYPE, LAYERNORM_KERNEL), \
HASH_LAYERNORM_SH_KERNEL_SCALE_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
#define TENSOR_LAYERNORM_SCALE_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_LAYERNORM_KEY(IN0_TYPE, F32, OUT_TYPE, LAYERNORM_2D_KERNEL), \
HASH_LAYERNORM_SH_KERNEL_SCALE_2D_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
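/* Key layout (from HASH_LAYERNORM_KEY): input dtype in bits 31..24, scale
 * (gamma) dtype in bits 23..16, output dtype in bits 15..8 and the kernel
 * variant flag in the low byte. An F32 gamma selects the new
 * layer_norm_*F32to* shaders, an F16 gamma keeps the original ones. */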
// greater than max size
#define HASH_SUMSQR_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.layernorm_wh_sumSqr_"#SRC0_TYPE"to"#DST_TYPE)
@ -96,22 +115,22 @@ __BEGIN_DECLS
CVIVANTE_NAMESPACE("evis.layernorm_wh_"#SRC0_TYPE"to"#DST_TYPE"_2D")
#define TENSOR_SUMSQR_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, SUMSQR_KERNEL), \
{ HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, SUMSQR_KERNEL), \
HASH_SUMSQR_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
#define TENSOR_SUMSQR_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, SUMSQR_2D_KERNEL), \
{ HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, SUMSQR_2D_KERNEL), \
HASH_SUMSQR_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
#define TENSOR_LAYERNORM_WH_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_WH_KERNEL), \
{ HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_WH_KERNEL), \
HASH_LAYERNORM_WH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
#define TENSOR_LAYERNORM_WH_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_WH_2D_KERNEL), \
{ HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_WH_2D_KERNEL), \
HASH_LAYERNORM_WH_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \
SOURCE },
@ -136,6 +155,17 @@ static const _kernel_map_type _layernorm_kernel_map[] =
TENSOR_LAYERNORM_KERNELS_2D( F16, U8, KERNEL_SOURCE_2 )
TENSOR_LAYERNORM_KERNELS( I16, I16, KERNEL_SOURCE_6 )
TENSOR_LAYERNORM_KERNELS_2D( I16, I16, KERNEL_SOURCE_6 )
TENSOR_LAYERNORM_SCALE_KERNELS( U8, U8, KERNEL_SOURCE_8 )
TENSOR_LAYERNORM_SCALE_KERNELS_2D( U8, U8, KERNEL_SOURCE_9 )
TENSOR_LAYERNORM_SCALE_KERNELS( I8, I8, KERNEL_SOURCE_8 )
TENSOR_LAYERNORM_SCALE_KERNELS_2D( I8, I8, KERNEL_SOURCE_9 )
TENSOR_LAYERNORM_SCALE_KERNELS( I16, I16, KERNEL_SOURCE_8 )
TENSOR_LAYERNORM_SCALE_KERNELS_2D( I16, I16, KERNEL_SOURCE_9 )
TENSOR_LAYERNORM_SCALE_KERNELS( F16, F16, KERNEL_SOURCE_8 )
TENSOR_LAYERNORM_SCALE_KERNELS_2D( F16, F16, KERNEL_SOURCE_9 )
TENSOR_LAYERNORM_SCALE_KERNELS( BF16, BF16, KERNEL_SOURCE_10 )
TENSOR_LAYERNORM_SCALE_KERNELS_2D( BF16, BF16, KERNEL_SOURCE_10 )
};
static const _kernel_map_type _sumsqr_kernel_map[] =
@ -295,8 +325,7 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer)
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = 1;
shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1)
/ shaderParam.global_scale[1], 4);
shaderParam.global_size[1] = height;
shaderParam.global_size[2] = chn;
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
@ -424,6 +453,37 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer)
0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
uint32_t pack_key = 0;
#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \
(IN0_TYPE | (IN1_TYPE << 16) | (OUT_TYPE << 8))
@ -432,9 +492,6 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer)
status = vsi_nn_kernel_gpu_add_param(node, "width", &width);
status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", &uniConvertSecFp16Fp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
switch( pack_key )
@ -453,6 +510,11 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer)
&uniConvert3rdUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4",
&uniConvert4thUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4",
&uniConvertSecFp16Fp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
&UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp);
@ -481,6 +543,11 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer)
&uniConvert3rdUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4",
&uniConvert4thUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4",
&uniConvertSecFp16Fp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
&UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp);
@ -501,7 +568,11 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer)
&uniConvert2ndUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8",
&uniConvertInt32toUint8_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4",
&uniConvertSecFp16Fp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
&UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
@ -510,6 +581,70 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U8, F32, U8 ):
case _PACK_SELECT_KEY( F16, F32, F16 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2",
&uniFp16SumSqr_dp8x2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_dp4x4",
&uniExtractHalf4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8",
&uniConvertInt32toUint8_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1);
status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4",
&uniConvert1stUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4",
&uniConvert2ndUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4",
&uniConvert3rdUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4",
&uniConvert4thUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
&UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp);
status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1);
status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp2", &tmpZp2);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( I16, F32, I16 ):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2",
&uniInt16SumSqr_dp8x2);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4",
&uniConvert1stUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4",
&uniConvert2ndUint8SubZpToFp32_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8",
&uniConvertInt32toUint8_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4",
&UniFP16toFP32Lo4_dp4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale);
status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn);
status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp);
status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut);
status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio_scale", &dimRatio_scale);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, F32, BF16 ):
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
default:
VSI_ASSERT( FALSE );
return VSI_FAILURE;
@ -949,6 +1084,7 @@ static vsi_status _query_kernel
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e input2_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
int i = 0;
@ -960,9 +1096,10 @@ static vsi_status _query_kernel
}
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_LAYERNORM_KEY( input0_dtype, output_dtype, kernel_type );
key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, output_dtype, kernel_type );
for( i = 0; i < _cnt_of_array(_layernorm_kernel_map); i ++ )
{
@ -1000,14 +1137,16 @@ static vsi_status _query_kernel_wh
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e input2_dtype = F16;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
int i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_LAYERNORM_KEY( input0_dtype, F32, is2D_sumsqr );
key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, F32, is2D_sumsqr );
for( i = 0; i < _cnt_of_array(_sumsqr_kernel_map); i ++ )
{
@ -1031,7 +1170,7 @@ static vsi_status _query_kernel_wh
}
key = HASH_LAYERNORM_KEY( input0_dtype, output_dtype, is2D_wh );
key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, output_dtype, is2D_wh );
for( i = 0; i < _cnt_of_array(_sumsqr_kernel_map); i ++ )
{
@ -1256,17 +1395,25 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_LAYERNORM_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL;
int32_t rs_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" );
int32_t wh_flg = vsi_nn_kernel_param_get_int32( params, "wh_flg" );
int32_t optFlg = rs_flg || (outputs[0]->attr.dim_num < 3);
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
uint32_t *input_size = inputs[0]->attr.size;
uint32_t dims_num = inputs[0]->attr.dim_num;
int32_t rs_flg = 0;
int32_t optFlg = 0;
if (wh_flg)
if (input_size[0] >= GPU_TENSOR_MAX_WIDTH)
{
node = _setup_wh(graph, inputs, input_num, outputs, output_num, params, kernel);
goto final;
}
if ((input_size[1] * input_size[2] < GPU_TENSOR_MAX_WIDTH)
&& dims_num > 2)
{
rs_flg = 1;
}
optFlg = rs_flg || (outputs[0]->attr.dim_num < 3);
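/* Path selection: if the innermost dimension alone exceeds
 * GPU_TENSOR_MAX_WIDTH, fall back to the two-pass sum/sum-of-squares variant
 * (_setup_wh); otherwise set rs_flg when height*channels still fits in one
 * image row, so the outer dimensions can be folded together and the 2D kernel
 * variant selected through optFlg. */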
status = _query_kernel( inputs, outputs, kernel, optFlg);
if (VSI_SUCCESS != status)
{

View File

@ -241,7 +241,7 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
pack_key = _PACK_SELECT_KEY( attr[0]->dtype,
attr[1]->dtype, attr[2]->dtype );
if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16)
if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == BF16)
|| ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) )
{
gpu_param.global_scale[0] = 8;

View File

@ -241,7 +241,7 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
pack_key = _PACK_SELECT_KEY( attr[0]->dtype,
attr[1]->dtype, attr[2]->dtype );
if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16)
if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == BF16)
|| ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) )
{
gpu_param.global_scale[0] = 8;

View File

@ -0,0 +1,460 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util.h"
__BEGIN_DECLS
#define _ONE_HOT_KERNEL_SOURCE "one_hot"
// Add kernel hashtable here
#define HASH_ONE_HOT_SH_KERNEL_NAME(SRC_TYPE, DST_TYPE) \
CVIVANTE_NAMESPACE("evis.one_hot_"#SRC_TYPE"to"#DST_TYPE)
#define ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE, IMG_2D ) \
(( IN_DTYPE << 9 ) | ( OUT_DTYPE << 1) | (IMG_2D))
#define PACK_ONE_HOT_KERNEL_3D( IN_DTYPE, OUT_DTYPE ) \
{ ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0 ), \
CVIVANTE_NAMESPACE("evis.one_hot_"#IN_DTYPE"to"#OUT_DTYPE), \
_ONE_HOT_KERNEL_SOURCE }
#define PACK_ONE_HOT_KERNEL_2D( IN_DTYPE, OUT_DTYPE ) \
{ ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1 ), \
CVIVANTE_NAMESPACE("evis.one_hot_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \
_ONE_HOT_KERNEL_SOURCE }
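/* The key packs the input dtype into bits 9 and above, the output dtype into
 * bits 8..1 and the 2D flag into bit 0, mirroring ONE_HOT_HASH_KEY above. */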
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _one_hot_kernel_map[] =
{
// Register kernel here
PACK_ONE_HOT_KERNEL_3D( U8, U8 ),
PACK_ONE_HOT_KERNEL_3D( U8, F16 ),
PACK_ONE_HOT_KERNEL_3D( I8, I8 ),
PACK_ONE_HOT_KERNEL_3D( I8, F16 ),
PACK_ONE_HOT_KERNEL_3D( I16, I16 ),
PACK_ONE_HOT_KERNEL_3D( I16, F16 ),
PACK_ONE_HOT_KERNEL_3D( F16, F16 ),
PACK_ONE_HOT_KERNEL_3D( F16, I16 ),
PACK_ONE_HOT_KERNEL_3D( F16, U8 ),
PACK_ONE_HOT_KERNEL_3D( F16, I8 ),
PACK_ONE_HOT_KERNEL_2D( U8, U8 ),
PACK_ONE_HOT_KERNEL_2D( U8, F16 ),
PACK_ONE_HOT_KERNEL_2D( I8, I8 ),
PACK_ONE_HOT_KERNEL_2D( I8, F16 ),
PACK_ONE_HOT_KERNEL_2D( I16, I16 ),
PACK_ONE_HOT_KERNEL_2D( I16, F16 ),
PACK_ONE_HOT_KERNEL_2D( F16, F16 ),
PACK_ONE_HOT_KERNEL_2D( F16, I16 ),
PACK_ONE_HOT_KERNEL_2D( F16, U8 ),
PACK_ONE_HOT_KERNEL_2D( F16, I8 ),
};
/*
* Kernel params
*/
static vx_param_description_t _one_hot_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define SCALAR_INPUT_SUFFIX_SIZE (2)
#define SCALAR_INPUT_ON_VALUE (3)
#define SCALAR_INPUT_OFF_VALUE (4)
#define _ONE_HOT_PARAM_NUM _cnt_of_array( _one_hot_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_one_hot_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
gpu_param_t gpu_param = {
2, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in thread
{0, 0, 0} // globalWorkSize: image size in thread
};
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL };
vsi_int_array_t * in_shape = NULL;
int32_t suffix_size = 0;
int32_t depth = 0;
int32_t input_zp = 0;
float scaleIn = 1.0f;
int32_t srcFixPointPos = 0;
vsi_nn_kernel_dtype_e input_dtype = F16;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_SUFFIX_SIZE], &(suffix_size));
in_shape = attr[0]->shape;
depth = attr[1]->shape->data[1];
input_dtype = attr[0]->dtype;
if (VSI_NN_KERNEL_QUANT_DFP == attr[0]->quant)
{
srcFixPointPos = attr[0]->dfp.fl;
}
else if (VSI_NN_KERNEL_QUANT_ASYMM == attr[0]->quant)
{
input_zp = attr[0]->asymm.zero_point;
scaleIn = attr[0]->asymm.scale;
}
if (suffix_size == 1)
{
gpu_param.global_scale[0] = 4;
gpu_param.global_scale[1] = 1;
depth = attr[1]->shape->data[0];
}
else
{
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
}
gpu_param.global_size[0] = gpu_align_p2(
(in_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = in_shape->data[1];
switch (input_dtype)
{
case I16:
case I8:
case F16:
{
gpu_dp_inst_t uniDataConvert_0_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniDataConvert_1_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractInteger_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_update_postshfit( &uniDataConvert_0_4x4, srcFixPointPos );
gpu_dp_inst_update_postshfit( &uniDataConvert_1_4x4, srcFixPointPos );
status = vsi_nn_kernel_gpu_add_param( node,
"uniDataConvert_0_4x4", &uniDataConvert_0_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniDataConvert_1_4x4", &uniDataConvert_1_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtract8Data_2x8", &uniExtractInteger_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"depth", &depth );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case U8:
{
gpu_dp_inst_t uniDataConvert_0_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00010000, 0x00030002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniDataConvert_1_4x4 = {{
0x01010101, // TCfg
0x00000000, // ASelt
0x00050004, 0x00070006, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniExtractInteger_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
float input_tail = 0 - (float)input_zp * scaleIn;
status = vsi_nn_kernel_gpu_add_param( node,
"uniDataConvert_0_4x4", &uniDataConvert_0_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniDataConvert_1_4x4", &uniDataConvert_1_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtract8Data_2x8", &uniExtractInteger_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"input_scale", &scaleIn );
status |= vsi_nn_kernel_gpu_add_param( node,
"input_tail", &input_tail );
status |= vsi_nn_kernel_gpu_add_param( node,
"depth", &depth );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
default:
break;
}
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(attr[0]);
SAFE_FREE_TENSOR_ATTR(attr[1]);
#undef SAFE_FREE_TENSOR_ATTR
return status;
} /* _one_hot_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool image_2d
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _one_hot_kernel_map;
size_t kernel_map_size = _cnt_of_array( _one_hot_kernel_map );
vx_param_description_t * param_def = _one_hot_kernel_param_def;
vx_kernel_initialize_f initializer = _one_hot_initializer;
uint32_t key;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = ONE_HOT_HASH_KEY( in_dtype, out_dtype, image_2d );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _one_hot_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_ONE_HOT_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_nn_tensor_t* rs_tensors[2] = { NULL };
int32_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }};
int32_t i = 0;
vsi_bool image_2d = FALSE;
int32_t num_elements = vsi_nn_vxGetTensorElementNum(&inputs[0]->attr);
int32_t prefix_dim_size = 1;
int32_t suffix_dim_size = 0;
int32_t depth = vsi_nn_kernel_param_get_int32( params, "depth" );
uint32_t data_u32[2] = {0};
float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" );
float off_value = vsi_nn_kernel_param_get_float32( params, "off_value" );
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
vsi_nn_Float32ToDtype(on_value, (uint8_t*)&data_u32[0], &outputs[0]->attr.dtype);
vsi_nn_Float32ToDtype(off_value, (uint8_t*)&data_u32[1], &outputs[0]->attr.dtype);
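/* Remap the caller's axis (-1 selects the default position) into this
   library's dimension order, then fold the input dims below the remapped
   axis into prefix_dim_size; the remaining elements form suffix_dim_size. */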
axis = axis == -1 ? (int32_t)inputs[0]->attr.dim_num : (int32_t)inputs[0]->attr.dim_num - axis;
for (i = 0; i < axis; i++)
{
prefix_dim_size *= inputs[0]->attr.size[i];
}
suffix_dim_size = num_elements / prefix_dim_size;
if (suffix_dim_size == 1)
{
shape[0][0] = prefix_dim_size;
shape[0][1] = 1;
shape[1][0] = depth;
shape[1][1] = prefix_dim_size;
shape[1][2] = 1;
}
else
{
shape[0][0] = suffix_dim_size;
shape[0][1] = prefix_dim_size;
shape[1][0] = suffix_dim_size;
shape[1][1] = depth;
shape[1][2] = prefix_dim_size;
}
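/* View the input as 2-D and the output as 3-D so one kernel layout covers
   every original rank. */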
rs_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], (uint32_t*)shape[0], 2 );
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], (uint32_t*)shape[1], 3 );
if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[1]->attr.size,
rs_tensors[1]->attr.dim_num ) )
{
goto final; /* release the reshaped tensors before returning NULL */
}
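/* suffix_dim_size == 1 means all input dims folded into the prefix, so the
   simpler 2-D kernel variant can be used. */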
image_2d = suffix_dim_size == 1;
status = _query_kernel( kernel, inputs, outputs, image_2d );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _ONE_HOT_PARAM_NUM,
&rs_tensors[0], input_num, &rs_tensors[1], output_num );
node_params[SCALAR_INPUT_SUFFIX_SIZE] = vsi_nn_kernel_scalar_create(
graph, I32, &suffix_dim_size );
node_params[SCALAR_INPUT_ON_VALUE] = vsi_nn_kernel_scalar_create(
graph, U32, &data_u32[0] );
node_params[SCALAR_INPUT_OFF_VALUE] = vsi_nn_kernel_scalar_create(
graph, U32, &data_u32[1] );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _ONE_HOT_PARAM_NUM );
CHECK_STATUS_FAIL_GOTO( status, final );
}
}
final:
if (rs_tensors[0])
{
vsi_nn_ReleaseTensor( &rs_tensors[0] );
}
if (rs_tensors[1])
{
vsi_nn_ReleaseTensor( &rs_tensors[1] );
}
for (i = SCALAR_INPUT_SUFFIX_SIZE; i < _ONE_HOT_PARAM_NUM; i++)
{
if (node_params[i])
{
vsi_nn_kernel_scalar_release( &node_params[i] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( one_hot, _setup )

View File

@ -202,8 +202,28 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractRtoF32_part1_4x4 = {{
0x01010101, // TCfg
0x01010100, // ASelt
0x0000000c, 0x00060003, // ABin
0x01010000, // ASelt
0x000f000c, 0x00050002, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractRtoF32_part2_4x4 = {{
0x01010101, // TCfg
0x01000000, // ASelt
0x000b0008, 0x0001000e, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractRtoF32_part3_4x4 = {{
0x01010101, // TCfg
0x01010101, // ASelt
0x00070004, 0x000d000a, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
@ -223,7 +243,27 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
gpu_dp_inst_t uniExtractGtoF32_part1_4x4 = {{
0x01010101, // TCfg
0x01010100, // ASelt
0x0001000d, 0x00070004, // ABin
0x0000000d, 0x00060003, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractGtoF32_part2_4x4 = {{
0x01010101, // TCfg
0x01000000, // ASelt
0x000c0009, 0x0002000f, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractGtoF32_part3_4x4 = {{
0x01010101, // TCfg
0x01010101, // ASelt
0x00080005, 0x000e000b, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
@ -243,7 +283,27 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
gpu_dp_inst_t uniExtractBtoF32_part1_4x4 = {{
0x01010101, // TCfg
0x01010100, // ASelt
0x0002000e, 0x00080005, // ABin
0x0001000e, 0x00070004, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractBtoF32_part2_4x4 = {{
0x01010101, // TCfg
0x01010000, // ASelt
0x000d000a, 0x00030000, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000000, 0x00000001, 0x00000000,
0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractBtoF32_part3_4x4 = {{
0x01010101, // TCfg
0x01010101, // ASelt
0x00090006, 0x000f000c, // ABin
0x02020202, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
@ -358,7 +418,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
case _PACK_SELECT_KEY( 1, 0, 0): // copy
case _PACK_SELECT_KEY( 1, 2, 0): // copy reorder
{
shaderParam.global_scale[0] = 8;
if (attr[0]->dtype == I8 || attr[0]->dtype == U8)
{
shaderParam.global_scale[0] = 16;
}
else
{
shaderParam.global_scale[0] = 8;
}
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
@ -366,7 +433,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
shaderParam.global_size[1] = height;
shaderParam.global_size[2] = 1;
if(attr[0]->dtype == F16)
if (attr[0]->dtype == F16)
{
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8);
}
@ -376,10 +443,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer)
}
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractRtoF32_part0_4x4", &uniExtractRtoF32_part0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractRtoF32_part1_4x4", &uniExtractRtoF32_part1_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractRtoF32_part2_4x4", &uniExtractRtoF32_part2_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractRtoF32_part3_4x4", &uniExtractRtoF32_part3_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGtoF32_part0_4x4", &uniExtractGtoF32_part0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGtoF32_part1_4x4", &uniExtractGtoF32_part1_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGtoF32_part2_4x4", &uniExtractGtoF32_part2_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGtoF32_part3_4x4", &uniExtractGtoF32_part3_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part0_4x4", &uniExtractBtoF32_part0_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part1_4x4", &uniExtractBtoF32_part1_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part2_4x4", &uniExtractBtoF32_part2_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part3_4x4", &uniExtractBtoF32_part3_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "r_order", &reorder);
status |= vsi_nn_kernel_gpu_add_param(node, "b_order", &order1);
CHECK_STATUS_FAIL_GOTO(status, OnError);

View File

@ -43,6 +43,7 @@ __BEGIN_DECLS
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toI8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toF16")
#define KERNEL_SOURCE_1 "pre_process_yuv420_scale_u8",
#define KERNEL_SOURCE_2 "pre_process_yuv420_copy_u8",
@ -77,6 +78,7 @@ static const struct {
TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_5)
TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2)
TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, F16, COPY, KERNEL_SOURCE_2)
};
static vx_param_description_t vxPreProcessYuv420Kernel_param_def[] =
@ -155,10 +157,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
}
shaderParam.global_scale[0] = 16;
if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
{
shaderParam.global_scale[0] = 8;
}
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
@ -418,6 +416,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer)
switch( attr[0]->dtype )
{
case U8:
case F16:
{
// R
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpR1st_4x4", &uniCalculateTmpR1st_4x4);
@ -866,7 +865,7 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (enable_copy && output_dtype == U8)
if (enable_copy && (output_dtype == U8 || output_dtype == F16))
{
convert_type = COPY;
}
@ -890,7 +889,7 @@ static vsi_status _query_kernel
kernel->info.parameters = vxPreProcessYuv420Kernel_param_def;
kernel->info.numParams = _cnt_of_array( vxPreProcessYuv420Kernel_param_def );
if (enable_copy && output_dtype == U8)
if (enable_copy && (output_dtype == U8 || output_dtype == F16))
{
kernel->info.initialize = _pre_process_yuv420_copy_initializer;
}

View File

@ -43,6 +43,7 @@ __BEGIN_DECLS
#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toI8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_copy_U8toU8")
#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_copy_U8toF16")
#define KERNEL_SOURCE_1 "pre_process_yuv444_scale",
#define KERNEL_SOURCE_3 "pre_process_yuv444_scale_fp16",
@ -75,6 +76,7 @@ static const struct {
TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1)
TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, COPY, KERNEL_SOURCE_4)
TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, F16, COPY, KERNEL_SOURCE_4)
};
static vx_param_description_t vxPreProcessYuv444Kernel_param_def[] =
@ -145,10 +147,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
}
shaderParam.global_scale[0] = 16;
if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
{
shaderParam.global_scale[0] = 8;
}
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
@ -400,6 +398,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer)
switch( attr[0]->dtype )
{
case U8:
case F16:
{
// R
status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpR1st_4x4", &uniCalculateTmpR1st_4x4);
@ -841,7 +840,7 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (enable_copy && output_dtype == U8)
if (enable_copy && (output_dtype == U8 || output_dtype == F16))
{
convert_type = COPY;
}
@ -865,7 +864,7 @@ static vsi_status _query_kernel
kernel->info.parameters = vxPreProcessYuv444Kernel_param_def;
kernel->info.numParams = _cnt_of_array( vxPreProcessYuv444Kernel_param_def );
if (enable_copy && output_dtype == U8)
if (enable_copy && (output_dtype == U8 || output_dtype == F16))
{
kernel->info.initialize = _pre_process_yuv444_copy_initializer;
}

View File

@ -0,0 +1,609 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define KERNEL_SOURCE_1 "repeat"
#define KERNEL_SOURCE_2 "repeat_axis1"
#define HASH_PREPROCESS_STARTID_SH_KERNEL_NAME \
CVIVANTE_NAMESPACE("evis.preprocess_start_idx")
#define HASH_REPEAT_SH_KERNEL_1D_NAME(SRC0_TYPE) \
CVIVANTE_NAMESPACE("evis.repeat_"#SRC0_TYPE"_1D")
#define HASH_REPEAT_SH_KERNEL_NAME(SRC0_TYPE, AXIS) \
CVIVANTE_NAMESPACE("evis.repeat_"#SRC0_TYPE"_axis"#AXIS)
// Add kernel hashtable here
#define HASH_PREPROCESS_KEY(_input0_type, _output_type) \
((_input0_type << 24) | (_output_type << 16))
#define HASH_REPEAT_KEY(_input0_type, _output_type, _is1d, _axis) \
((_input0_type << 24) | (_output_type << 16) | (_is1d << 8) | _axis)
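// Key layout: input dtype [31:24] | output dtype [23:16] | 1-D flag [15:8] | axis [7:0]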
#define TENSOR_PREPROCESS_STARTID_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_PREPROCESS_KEY(IN0_TYPE, OUT_TYPE), \
HASH_PREPROCESS_STARTID_SH_KERNEL_NAME, \
SOURCE },
#define TENSOR_REPEAT_KERNELS(IN0_TYPE, OUT_TYPE, AXIS, SOURCE) \
{ HASH_REPEAT_KEY(IN0_TYPE, OUT_TYPE, 0, AXIS), \
HASH_REPEAT_SH_KERNEL_NAME(IN0_TYPE, AXIS), \
SOURCE },
#define TENSOR_REPEAT_1D_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_REPEAT_KEY(IN0_TYPE, OUT_TYPE, 1, 0), \
HASH_REPEAT_SH_KERNEL_1D_NAME(IN0_TYPE), \
SOURCE },
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _preprocess_kernel_map[] =
{
// Register kernel here
TENSOR_PREPROCESS_STARTID_KERNELS( I32, I32, KERNEL_SOURCE_1 )
};
static const _kernel_map_type _repeat_kernel_map[] =
{
// Register kernel here
TENSOR_REPEAT_KERNELS( U8, U8, 0, KERNEL_SOURCE_1 )
TENSOR_REPEAT_KERNELS( U8, U8, 1, KERNEL_SOURCE_2 )
TENSOR_REPEAT_KERNELS( U8, U8, 2, KERNEL_SOURCE_1 )
TENSOR_REPEAT_KERNELS( I16, I16, 0, KERNEL_SOURCE_1 )
TENSOR_REPEAT_KERNELS( I16, I16, 1, KERNEL_SOURCE_2 )
TENSOR_REPEAT_KERNELS( I16, I16, 2, KERNEL_SOURCE_1 )
TENSOR_REPEAT_1D_KERNELS( U8, U8, KERNEL_SOURCE_1 )
TENSOR_REPEAT_1D_KERNELS( I16, I16, KERNEL_SOURCE_1 )
};
/*
* Kernel params
*/
static vx_param_description_t _preprocess_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _REPEAT_PREPROCESS_PARAM_NUM _cnt_of_array( _preprocess_kernel_param_def )
static vx_param_description_t _repeat_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _REPEAT_PARAM_NUM _cnt_of_array( _repeat_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_preprocess_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in thread
{0, 0, 0}}; // globalWorkSize: image size in thread
vsi_nn_kernel_tensor_attr_t* attr[1] = {NULL};
int32_t width = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
width = attr[0]->shape->data[0];
shaderParam.global_scale[0] = 16;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
shaderParam.local_size[0] = 32;
shaderParam.local_size[1] = 1;
shaderParam.local_size[2] = 1;
shaderParam.global_size[0] = 32;
shaderParam.global_size[1] = 1;
shaderParam.global_size[2] = 1;
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
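/* This pass accumulates the per-element repeat counts into running start
   indices (a horizontal prefix sum) that the repeat kernel later uses as
   write offsets into the output. */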
{
gpu_dp_inst_t uniIntegralHorAcc_4x4 = {{
0xff3f0f03, // TCfg
0x00000000, // ASelt
0x00100000, 0x32100210, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param(node, "uniIntegralHorAcc_4x4", &uniIntegralHorAcc_4x4);
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
}
DEF_KERNEL_INITIALIZER(_repeat_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t shaderParam = {
3, // workdim
{0, 0, 0}, // globalWorkOffset: controls the start location to be processed in the image
{0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread
{0, 0, 0}, // localWorkSize: local group size in thread
{0, 0, 0}}; // globalWorkSize: image size in thread
vsi_nn_kernel_tensor_attr_t* attr[1] = {NULL};
vsi_int_array_t * input_shape = NULL;
int32_t height = 0, width = 0, chn = 0;
int32_t is1d = 0;
int32_t axis = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis);
CHECK_STATUS_FAIL_GOTO(status, OnError );
input_shape = attr[0]->shape;
width = input_shape->data[0];
height = input_shape->data[1];
if (height == 1 && input_shape->size == 2)
{
is1d = 1;
}
chn = input_shape->size > 2 ? input_shape->data[2] : 1;
if ((axis == 0 && is1d == 0) || axis == 2)
{
shaderParam.global_scale[0] = 16;
if (attr[0]->dtype == I16 || attr[0]->dtype == F16)
{
shaderParam.global_scale[0] = 8;
}
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
}
else if (is1d)
{
shaderParam.global_scale[0] = 1;
shaderParam.global_scale[1] = 1;
shaderParam.global_scale[2] = 1;
}
else if (axis == 1)
{
shaderParam.global_scale[0] = 1;
shaderParam.global_scale[1] = 8;
shaderParam.global_scale[2] = 1;
}
shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = (height + shaderParam.global_scale[1] - 1)
/ shaderParam.global_scale[1];
shaderParam.global_size[2] = chn;
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
{
gpu_dp_inst_t uniExtract1to8Short_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x00000000, 0x00000000, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract1to8Short_2x8", &uniExtract1to8Short_2x8);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
OnError:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
}
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_tensor_t* const* const inputs,
vsi_nn_tensor_t* const* const outputs,
vsi_nn_kernel_t* kernel_preprocess,
vsi_nn_kernel_t* kernel,
int32_t axis
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e input1_dtype = I32;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
int32_t is1d = inputs[0]->attr.dim_num == 1 ? 1 : 0;
int i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (input0_dtype == F16)
{
input0_dtype = I16;
}
if (output_dtype == F16)
{
output_dtype = I16;
}
if (input0_dtype == I8)
{
input0_dtype = U8;
}
if (output_dtype == I8)
{
output_dtype = U8;
}
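/* F16 reuses the I16 kernels and I8 the U8 kernels: repeat only copies
   elements, so types of the same bit width can share one kernel body. */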
key = HASH_PREPROCESS_KEY( input1_dtype, I32 );
for( i = 0; i < _cnt_of_array(_preprocess_kernel_map); i ++ )
{
if ( _preprocess_kernel_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(_preprocess_kernel_map) )
{
snprintf( kernel_preprocess->info.name, VX_MAX_KERNEL_NAME, "%s", _preprocess_kernel_map[i].function_name );
kernel_preprocess->info.parameters = _preprocess_kernel_param_def;
kernel_preprocess->info.numParams = _REPEAT_PREPROCESS_PARAM_NUM;
kernel_preprocess->info.initialize = _preprocess_initializer;
vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
_preprocess_kernel_map[i].source_name );
vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
_preprocess_kernel_map[i].source_name );
}
key = HASH_REPEAT_KEY( input0_dtype, output_dtype, is1d, axis );
for( i = 0; i < _cnt_of_array(_repeat_kernel_map); i ++ )
{
if ( _repeat_kernel_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(_repeat_kernel_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _repeat_kernel_map[i].function_name );
kernel->info.parameters = _repeat_kernel_param_def;
kernel->info.numParams = _REPEAT_PARAM_NUM;
kernel->info.initialize = _repeat_initializer;
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
_repeat_kernel_map[i].source_name );
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
_repeat_kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static int32_t _optimize_repeat_shape
(
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
int32_t* axis,
int32_t* opt_shape_in,
int32_t* opt_shape_out,
int32_t* new_rank
)
{
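/* Rewrite shapes so the EVIS kernels only ever see 2-D or 3-D tensors:
   a 1-D input becomes [N, 1], and a repeat along axis 3 is folded into an
   equivalent axis-0 or axis-2 repeat on a lower-rank view. */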
vsi_status status = VSI_SUCCESS;
if (inputs[0]->attr.dim_num == 1)
{
opt_shape_in[0] = inputs[0]->attr.size[0];
opt_shape_in[1] = 1;
opt_shape_out[0] = outputs[0]->attr.size[0];
opt_shape_out[1] = 1;
new_rank[0] = 2;
new_rank[1] = 2;
}
else if (axis[0] == 3)
{
vsi_nn_kernel_optimize_element_shape( (int32_t*)inputs[0]->attr.size, 3, opt_shape_in, new_rank );
if (opt_shape_in[1] == 1)
{
opt_shape_in[1] = inputs[0]->attr.size[3];
opt_shape_out[0] = opt_shape_in[0];
opt_shape_out[1] = outputs[0]->attr.size[3];
axis[0] = 0;
new_rank[0] = 2;
new_rank[1] = 2;
}
else if (new_rank[0] == 2)
{
opt_shape_in[2] = inputs[0]->attr.size[3];
opt_shape_out[0] = opt_shape_in[0];
opt_shape_out[1] = opt_shape_in[1];
opt_shape_out[2] = outputs[0]->attr.size[3];
axis[0] = 2;
new_rank[0] = 3;
new_rank[1] = 3;
}
else
{
status = VSI_FAILURE;
}
}
return status;
}
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t preprocess_node_params[_REPEAT_PREPROCESS_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_param_t node_params[_REPEAT_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t tmp_node = NULL;
vsi_nn_kernel_node_t node = NULL;
vsi_nn_tensor_attr_t attr;
vsi_nn_kernel_t * kernel_preprocess = NULL;
vsi_nn_tensor_t * tensor_preprocess = NULL;
vsi_nn_kernel_tensor_t rs_input = NULL, rs_input1 = NULL, rs_output = NULL;
int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }};
int32_t new_rank[2] = {0, 0};
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
// Check if gpu can support the size
if ( !vsi_nn_kernel_gpu_check_shape(
(int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) )
{
return NULL;
}
if (axis > 2 || outputs[0]->attr.dim_num == 1)
{
status = _optimize_repeat_shape(inputs, outputs, &axis, new_shape[0], new_shape[1], new_rank);
if ( VSI_SUCCESS != status )
{
goto final;
}
rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], new_rank[0]);
rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], new_rank[1]);
}
if (inputs[1]->attr.dim_num == 1)
{
new_shape[0][0] = inputs[1]->attr.size[0];
new_shape[0][1] = 1;
rs_input1 = vsi_nn_kernel_tensor_reshape(inputs[1]->t, new_shape[0], 2);
}
kernel_preprocess = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS );
// Assign unique_id
kernel_preprocess->unique_id = kernel->unique_id;
status = _query_kernel( inputs, outputs, kernel_preprocess, kernel, axis );
if ( VSI_SUCCESS != status )
{
goto final;
}
memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) );
attr.dtype.vx_type = VSI_NN_TYPE_INT32;
attr.is_const = FALSE;
attr.vtl = TRUE;
attr.size[0] = inputs[1]->attr.size[0];
attr.size[1] = 1;
attr.dim_num = 2;
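/* Intermediate I32 tensor that holds the start indices produced by the
   preprocess node and consumed by the repeat node. */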
tensor_preprocess = vsi_nn_CreateTensor( graph, &attr );
// preprocess
tmp_node = vsi_nn_kernel_create_node( graph, kernel_preprocess );
if (tmp_node)
{
uint32_t index = 0;
if (rs_input1)
{
preprocess_node_params[index++] = rs_input1;
}
else
{
preprocess_node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t;
}
preprocess_node_params[index++] = (vsi_nn_kernel_node_param_t)tensor_preprocess->t;
status = vsi_nn_kernel_node_pass_param( tmp_node, preprocess_node_params,
_REPEAT_PREPROCESS_PARAM_NUM );
CHECK_STATUS(status);
{
// Set default border mode.
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
border.constant_value.U8 = 0;
border.constant_value.U16 = 0;
border.constant_value.S32 = 0;
if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8)
{
border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point;
}
status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) );
CHECK_STATUS(status);
}
}
// repeat
node = vsi_nn_kernel_create_node( graph, kernel );
if (node)
{
uint32_t index = 0;
if (rs_input)
{
node_params[index++] = rs_input;
}
else
{
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t;
}
if (rs_input1)
{
node_params[index++] = rs_input1;
}
else
{
node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t;
}
node_params[index++] = (vsi_nn_kernel_node_param_t)tensor_preprocess->t;
if (rs_output)
{
node_params[index++] = rs_output;
}
else
{
node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t;
}
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis );
status = vsi_nn_kernel_node_pass_param( node, node_params,
_REPEAT_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_scalar_release( &node_params[4] );
{
// Set default border mode.
vx_border_t border;
border.mode = VX_BORDER_REPLICATE;
status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
CHECK_STATUS(status);
}
}
/* Pass parameters to node. */
final:
if (rs_input)
{
vsi_nn_kernel_tensor_release( &rs_input );
}
if (rs_input1)
{
vsi_nn_kernel_tensor_release( &rs_input1 );
}
if (rs_output)
{
vsi_nn_kernel_tensor_release( &rs_output );
}
if ( kernel_preprocess )
{
vsi_nn_kernel_release( &kernel_preprocess );
}
if ( tensor_preprocess )
{
vsi_nn_ReleaseTensor( &tensor_preprocess );
}
if (tmp_node)
{
vsi_nn_kernel_node_release( &tmp_node );
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( repeat, _setup )

View File

@ -49,11 +49,13 @@ typedef enum
UP,
UP_OPT,
UP_2X_HALF,
UP_3X_HALF,
UP_4X_HALF,
} _internal_scale_e;
#define _RESIZE_BILINEAR_KERNEL_SOURCE(_input_type) "resize_bilinear_"#_input_type
#define _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(_input_type) "resize_bilinear_"#_input_type"_opt"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_2X(_input_type) "resize_bilinear_"#_input_type"_UP_2X"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers"
#define STR(a) #a
// Add kernel hashtable here
@ -77,8 +79,21 @@ typedef enum
#define PACK_KERNEL_MAP_UP_2X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP_2X_half"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_2X(IN_DTYPE) }
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_2x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_4x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_3x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) }
typedef struct
{
@ -103,6 +118,8 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] =
PACK_KERNEL_MAP_UP(BF16, BF16),
PACK_KERNEL_MAP_UP_OPT(U8, U8),
PACK_KERNEL_MAP_UP_2X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_3X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_4X_HALF(U8, U8),
};
@ -203,8 +220,10 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
uint32_t out_height;
float half_pixel_value = 0.0f;
vsi_bool is_use_scale_kernel = (vsi_bool)(_RESIZE_BILINEAR_PARAM_NUM == param_size);
vsi_bool is_use_2x_up_half_kernel = FALSE;
vsi_bool is_half_pixel_centers = FALSE;
vsi_bool is_2x_up_kernel = FALSE;
vsi_bool is_3x_up_kernel = FALSE;
vsi_bool is_4x_up_kernel = FALSE;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
@ -254,11 +273,13 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
half_pixel_value = 0.0f;
}
if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr)))
is_half_pixel_centers = (!align_corners) && (half_pixel_centers);
if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr)) && is_half_pixel_centers)
{
is_use_2x_up_half_kernel = (!align_corners) && (half_pixel_centers);
is_use_2x_up_half_kernel = is_use_2x_up_half_kernel && \
(2 * in_width == out_width) && (2 * in_height == out_height);
is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height);
is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height);
is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height);
}
if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
@ -309,11 +330,17 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
outputZP = 0;
}
if (is_use_2x_up_half_kernel)
if (is_2x_up_kernel || is_4x_up_kernel)
{
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[0] = 16;
gpu_param.global_scale[1] = 1;
}
else if (is_3x_up_kernel)
{
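/* The 3x kernel writes a 15x6 output tile per thread (a 5x2 input patch
   upsampled by 3 in both directions). */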
gpu_param.global_scale[0] = 15;
gpu_param.global_scale[1] = 6;
gpu_param.global_scale[2] = 1;
}
else
{
gpu_param.global_scale[0] = 4;
@ -321,28 +348,134 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
gpu_param.global_scale[2] = 1;
}
if (is_use_2x_up_half_kernel)
if (is_2x_up_kernel)
{
gpu_dp_inst_t uniResize2xUp_4x8 = {{
gpu_dp_inst_t uniResize2xUp_0_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect
0x00000400, // AccumType, ConstantType, and PostShift
0x00000704, // AccumType, ConstantType, and PostShift
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize2xUpRound_2x8 = {{
0x55555555, // TCfg
0x44444444, // ASelt
0x03020100, 0x07060504, // ABin
0xaaaaaaaa, // BSelt
0x00000000, 0x00000000, // BBin
gpu_dp_inst_t uniResize2xUp_1_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect
0x00000704, // AccumType, ConstantType, and PostShift
0x00010001, 0x00010001, 0x00010001, 0x00010001,
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
0x09030301, 0x03090103, 0x09030301, 0x03090103,
0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_4x8", &uniResize2xUp_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUpRound_2x8", &uniResize2xUpRound_2x8);
status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_3x_up_kernel)
{
gpu_dp_inst_t uniResize3xUp_l00_2x8 = {{
0x15515515, // TCfg
0x00000000, // ASelt
0x21210110, 0x03323202, // ABin
0x2aa2aa2a, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555,
0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l01_2x8 = {{
0x05155155, // TCfg
0x00000000, // ASelt
0x54044343, 0x00650554, // ABin
0x0a2aa2aa, // BSelt
0x00000000, 0x00000000, // BBin
0x00000610, // AccumType, ConstantType, and PostShift
0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa,
0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l10_4x4 = {{
0x55551155, // TCfg
0x50501050, // ASelt
0x01011010, 0x21212121, // ABin
0xaaaa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l11_4x4 = {{
0x11555511, // TCfg
0x10505010, // ASelt
0x32320202, 0x03033232, // ABin
0x22aaaa22, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72,
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l12_4x4 = {{
0x55115555, // TCfg
0x50105050, // ASelt
0x43434343, 0x54540404, // ABin
0xaa22aaaa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39,
0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize3xUp_l13_4x4 = {{
0x00551155, // TCfg
0x00501050, // ASelt
0x05055454, 0x00006565, // ABin
0x00aa22aa, // BSelt
0x00000000, 0x00000000, // BBin
0x0000060f, // AccumType, ConstantType, and PostShift
0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab,
0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_4x_up_kernel)
{
gpu_dp_inst_t uniResize4xUp_l00_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l01_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f,
0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l10_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize4xUp_l11_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect
0x00000406, // AccumType, ConstantType, and PostShift
0x23150503, 0x31070701, 0x07310107, 0x15230305,
0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
@ -832,13 +965,13 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
goto final;
}
if (!is_use_2x_up_half_kernel)
if (!is_2x_up_kernel && !is_3x_up_kernel && !is_4x_up_kernel)
{
status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value);
CHECK_STATUS_FAIL_GOTO(status, final );
}
if (is_use_2x_up_half_kernel)
if (is_2x_up_kernel || is_4x_up_kernel)
{
gpu_param.global_size[0] = gpu_align_p2((out_width + \
gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
@ -860,8 +993,6 @@ final:
return status;
} /* _resize_bilinear_initializer() */
/*
* Query kernel
*/
@ -872,7 +1003,8 @@ static vsi_status _query_kernel
vsi_nn_tensor_t * const * const outputs,
vsi_bool is_same_type,
vsi_bool is_evis2,
vsi_bool is_2x_up_half,
int32_t align_corners,
int32_t half_pixel_centers,
vsi_bool *is_run_opt_kernel
)
{
@ -886,17 +1018,35 @@ static vsi_status _query_kernel
vx_kernel_initialize_f initializer = _resize_bilinear_initializer;
uint32_t key;
uint32_t i;
vsi_bool is_2x_upsample =(2 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
&& (2 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
vsi_bool is_3x_upsample =(3 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
&& (3 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
vsi_bool is_4x_upsample =(4 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
&& (4 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
_internal_scale_e scale_flag = UP;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
is_2x_upsample &= (in_dtype == U8);
is_3x_upsample &= (in_dtype == U8);
is_4x_upsample &= (in_dtype == U8);
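/* The dedicated half-pixel-center 2x/3x/4x upsample kernels are only
   registered for U8 input and output. */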
if (outputs[0]->attr.size[0] > inputs[0]->attr.size[0])
{
if (is_2x_up_half)
if (is_same_type && (!align_corners) && (half_pixel_centers) && is_2x_upsample)
{
scale_flag = UP_2X_HALF;
}
else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_3x_upsample)
{
scale_flag = UP_3X_HALF;
}
else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_4x_upsample)
{
scale_flag = UP_4X_HALF;
}
else if (is_same_type && is_evis2)
{
scale_flag = UP_OPT;
@ -920,19 +1070,6 @@ static vsi_status _query_kernel
}
}
if ((UP_2X_HALF == scale_flag) && (i >= kernel_map_size) && is_same_type && is_evis2)
{
scale_flag = UP_OPT;
key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag );
for( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if( kernel_map[i].key == key )
{
break;
}
}
}
if ((UP_OPT == scale_flag) && (i >= kernel_map_size))
{
scale_flag = UP;
@ -1109,9 +1246,6 @@ OnError:
return scale;
}
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
@ -1131,14 +1265,10 @@ static vsi_nn_kernel_node_t _setup
vsi_bool is_same_type = vsi_nn_is_same_type(inputs[0], outputs[0]);
vsi_bool is_evis2 = (vsi_bool)(graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_2);
vsi_bool is_run_opt_kernel = FALSE;
vsi_bool is_2x_up_half = FALSE;
vsi_nn_tensor_t* scale = NULL;
is_2x_up_half = is_same_type && (!align_corners) && (half_pixel_centers);
is_2x_up_half = is_2x_up_half && (2 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
&& (2 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
status = _query_kernel( kernel, inputs, outputs, is_same_type, is_evis2,
is_2x_up_half, &is_run_opt_kernel);
align_corners, half_pixel_centers, &is_run_opt_kernel);
if( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );

Some files were not shown because too many files have changed in this diff.