Pre-release for 22Q1 (#302)

update internal to commit-id: d45da6fa

Co-authored-by: zhouheng.zheng <zhouheng.zheng@ouotlook.com>
Zhouheng Zheng 2022-03-01 17:56:03 +08:00 committed by GitHub
parent e63059857b
commit 161bb8a7c4
149 changed files with 12641 additions and 970 deletions

View File

@@ -2,3 +2,6 @@
custom op data struct def
*/
DEF_NODE_TYPE(custom_softmax)
DEF_NODE_TYPE(custom_ainr_denoise_postprocess)
DEF_NODE_TYPE(custom_warp_affine)
DEF_NODE_TYPE(custom_warp_perspective)

View File

@@ -2,3 +2,6 @@
Add custom ops to the end.
*/
DEF_OP(CUSTOM_SOFTMAX)
DEF_OP(CUSTOM_AINR_DENOISE_POSTPROCESS)
DEF_OP(CUSTOM_WARP_AFFINE)
DEF_OP(CUSTOM_WARP_PERSPECTIVE)
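Usage note (illustrative sketch, not part of the diff): each DEF_OP / DEF_NODE_TYPE entry is consumed through the X-macro pattern that appears later in this diff, so the new custom ops become ordinary op ids and node-parameter members. The enum wrapper and include path below are illustrative.
/* Sketch of the X-macro expansion. */
#define DEF_OP( NAME, ... ) VSI_NN_OP_##NAME,
enum
{
#include "custom/custom_ops.def"   /* path illustrative; yields VSI_NN_OP_CUSTOM_SOFTMAX, VSI_NN_OP_CUSTOM_WARP_AFFINE, ... */
};
#undef DEF_OP
#define DEF_NODE_TYPE( NAME ) vsi_nn_##NAME##_param NAME;
DEF_NODE_TYPE(custom_warp_affine)  /* expands to: vsi_nn_custom_warp_affine_param custom_warp_affine; */
#undef DEF_NODE_TYPE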

View File

@@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CUSTOM_AINR_DENOISE_POSTPROCESS_H
#define _VSI_NN_OP_CUSTOM_AINR_DENOISE_POSTPROCESS_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_custom_ainr_denoise_postprocess_param
{
struct _ainr_denoise_postprocess_local_data_t* local;
// Add parameters here
} vsi_nn_custom_ainr_denoise_postprocess_param;
_compiler_assert(offsetof(vsi_nn_custom_ainr_denoise_postprocess_param, local) == 0, \
vsi_nn_custom_ainr_denoise_postprocess_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,49 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CUSTOM_WARP_AFFINE_H
#define _VSI_NN_OP_CUSTOM_WARP_AFFINE_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_custom_warp_affine_param
{
struct _custom_warp_affine_local_data_t* local;
// Add parameters here
const float *matrix;
vsi_enum type;
int32_t size[2];
} vsi_nn_custom_warp_affine_param;
_compiler_assert(offsetof(vsi_nn_custom_warp_affine_param, local) == 0, \
vsi_nn_custom_warp_affine_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,50 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_CUSTOM_WARP_PERSPECTIVE_H
#define _VSI_NN_OP_CUSTOM_WARP_PERSPECTIVE_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_custom_warp_perspective_param
{
struct _custom_warp_perspective_local_data_t* local;
// Add parameters here
const float *matrix;
vsi_enum type;
int32_t size[2];
} vsi_nn_custom_warp_perspective_param;
_compiler_assert(offsetof(vsi_nn_custom_warp_perspective_param, local) == 0, \
vsi_nn_custom_warp_perspective_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -27,5 +27,8 @@
custom op head files
*/
#include "custom/ops/vsi_nn_op_custom_softmax.h"
#include "custom/ops/vsi_nn_op_custom_ainr_denoise_postprocess.h"
#include "custom/ops/vsi_nn_op_custom_warp_affine.h"
#include "custom/ops/vsi_nn_op_custom_warp_perspective.h"
#endif

View File

@@ -165,3 +165,6 @@ DEF_OP(GRUCELL)
DEF_OP(GRUCELL_ACTIVATION)
DEF_OP(RESHAPE2)
DEF_OP(CONV3D)
DEF_OP(DECONV3D)
DEF_OP(PAD2)
DEF_OP(COS)

View File

@@ -19,3 +19,4 @@ DEF_OP(RESIZE_1D_NEAREST_INTERNAL)
DEF_OP(SPACE2DEPTH_INTERNAL)
DEF_OP(GRUCELL_H_TIMES_ACTIVATION_R)
DEF_OP(GRUCELL_ACTIVATION_Z_H)
DEF_OP(REDUCE_MEAN_INTERNAL)

View File

@@ -640,6 +640,13 @@ vsi_nn_kernel_node_t vsi_nn_kernel_create_node
vsi_nn_kernel_t * kernel
);
vsi_nn_kernel_node_t vsi_nn_kernel_create_node_ext
(
vsi_nn_graph_t * graph,
vsi_nn_kernel_t * kernel,
const char** resources
);
vsi_status vsi_nn_kernel_node_set_border
(vsi_nn_kernel_node_t node,
vx_border_t* border);
@@ -720,6 +727,13 @@ vsi_status vsi_nn_kernel_register
vsi_nn_kernel_t * kernel
);
vsi_status vsi_nn_kernel_register_ext
(
vsi_nn_graph_t * graph,
vsi_nn_kernel_t * kernel,
const char** resources
);
vsi_bool vsi_nn_kernel_gpu_check_shape
( const vsi_size_t * shape, vsi_size_t rank );
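Usage sketch (illustrative, not from the diff) for the new *_ext entry points. It assumes that resources is a NULL-terminated list of kernel source names, mirroring the names passed to vsi_nn_kernel_add_source elsewhere in this diff; treat that convention and the names below as placeholders.
/* Hypothetical usage, given an initialized graph and kernel. */
const char* resources[] = { "my_custom_kernel_source", NULL };  /* placeholder names, assumed NULL-terminated */
if ( VSI_SUCCESS == vsi_nn_kernel_register_ext( graph, kernel, resources ) )
{
    vsi_nn_kernel_node_t node = vsi_nn_kernel_create_node_ext( graph, kernel, resources );
    (void)node;
}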

View File

@@ -79,4 +79,10 @@ vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape
vsi_size_t* out_shape, uint32_t* out_rank
);
vsi_bool vsi_nn_kernel_optimize_group_norm_shape
(
const vsi_size_t* shape, const uint32_t rank, int32_t groups,
int32_t is_sp_kernel, vsi_size_t* out_shape
);
#endif

View File

@@ -0,0 +1,54 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_DECONV3D_H
#define _VSI_NN_OP_DECONV3D_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_deconv3d_param
{
struct _deconv3d_local_data_t* local;
// Add parameters here
uint32_t ksize[3];
uint32_t stride[3];
/* Pad left, right, top, bottom, front, rear */
uint32_t pad[6];
uint32_t weights;
uint32_t group;
uint32_t output_padding[3];
} vsi_nn_deconv3d_param;
_compiler_assert(offsetof(vsi_nn_deconv3d_param, local) == 0, \
vsi_nn_deconv3d_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -41,6 +41,7 @@ typedef struct _vsi_nn_gather_param
{
vsi_nn_gather_lcl_data local;
int32_t axis;
int32_t batch_dims;
} vsi_nn_gather_param;
#ifdef __cplusplus

View File

@@ -0,0 +1,50 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_PAD2_H
#define _VSI_NN_OP_PAD2_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_pad2_param
{
struct _pad2_local_data_t* local;
const uint32_t * front_size;
const uint32_t * back_size;
uint8_t dim_num;
float const_val;
vsi_nn_pad_mode_e mode;
} vsi_nn_pad2_param;
_compiler_assert(offsetof(vsi_nn_pad2_param, local) == 0, \
vsi_nn_pad2_h );
#ifdef __cplusplus
}
#endif
#endif
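Usage sketch (illustrative, not from the diff): filling in the new pad2 parameters on a node. The node-creation call, input/output counts and the pad-mode value are assumptions; the field names come from vsi_nn_pad2_param above.
/* Hypothetical usage: pad width/height by one element on each side with constant 0. */
vsi_nn_node_t* node = vsi_nn_AddNode( graph, VSI_NN_OP_PAD2, 1, 1, NULL );
static const uint32_t front[4] = { 1, 1, 0, 0 };
static const uint32_t back[4]  = { 1, 1, 0, 0 };
node->nn_param.pad2.front_size = front;
node->nn_param.pad2.back_size  = back;
node->nn_param.pad2.dim_num    = 4;
node->nn_param.pad2.const_val  = 0.0f;
node->nn_param.pad2.mode       = VSI_NN_PAD_MODE_CONSTANT;  /* assumed constant-pad mode value */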

View File

@@ -51,7 +51,7 @@ typedef struct _vsi_nn_reduce_param
{
/* local data must be the first. */
vsi_nn_reduce_lcl_data_t local;
vx_enum type;
vsi_enum type;
const int32_t *axis;
vx_uint32 axis_num;
vx_bool keep_dim;

View File

@@ -0,0 +1,49 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#ifndef _VSI_NN_OP_REDUCE_MEAN_INTERNAL_H
#define _VSI_NN_OP_REDUCE_MEAN_INTERNAL_H
#include "vsi_nn_types.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _vsi_nn_reduce_mean_internal_param
{
struct _reduce_mean_internal_local_data_t* local;
// Add parameters here
vx_int32 *axis;
vx_uint32 axis_num;
float scale;
} vsi_nn_reduce_mean_internal_param;
_compiler_assert(offsetof(vsi_nn_reduce_mean_internal_param, local) == 0, \
vsi_nn_reduce_mean_internal_h );
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -28,6 +28,7 @@
/*-------------------------------------------
Includes
-------------------------------------------*/
#include <stdio.h>
#include "vsi_nn_platform.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_types.h"
@@ -398,6 +399,31 @@ void vsi_nn_get_tensor_clamp_min_max
float *clampMax
);
char* vsi_nn_strncpy
(
char* dest,
const char* source,
size_t count
);
char* vsi_nn_strncat
(
char* dest,
const char* source,
size_t count
);
char* vsi_nn_getenv
(
const char * var_name
);
FILE* vsi_nn_fopen
(
const char * file_name,
const char * mode
);
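Usage sketch (illustrative, not from the diff) for the new portability wrappers; they are assumed to follow the semantics of the standard strncpy/strncat/getenv/fopen, and the environment-variable and file names below are placeholders.
/* Hypothetical usage. */
char path[256] = { 0 };
char* dir = vsi_nn_getenv( "MY_KERNEL_DIR" );        /* placeholder variable name */
if ( dir )
{
    vsi_nn_strncpy( path, dir, sizeof(path) - 1 );
    vsi_nn_strncat( path, "/kernel.cfg", sizeof(path) - strlen(path) - 1 );
    FILE* fp = vsi_nn_fopen( path, "rb" );
    if ( fp )
    {
        fclose( fp );
    }
}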
#ifdef __cplusplus
}
#endif

View File

@@ -71,6 +71,17 @@ OVXLIB_API void vsi_nn_OpRemoveClient
vsi_nn_op_t op
);
vsi_bool vsi_nn_OpAddClientName
(
vsi_nn_op_t op,
const char* kernel_name
);
const char* vsi_nn_OpGetClientName
(
vsi_nn_op_t op
);
#if defined(__cplusplus)
}
#endif

View File

@@ -73,6 +73,7 @@ typedef struct _vsi_nn_runtime_option_t
int32_t enable_shader;
int32_t enable_opcheck;
int32_t enable_concat_optimize;
int32_t enable_asymi8_to_u8;
} vsi_nn_runtime_option_t;
/**

View File

@@ -1,26 +1,3 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
/*****Auto generated header file, Please DO NOT modify manually!*****/
#ifndef _VSI_NN_FEATURE_CONFIG_H
#define _VSI_NN_FEATURE_CONFIG_H

View File

@@ -456,6 +456,29 @@ OVXLIB_API vsi_nn_node_t * vsi_nn_AddNode
vsi_nn_node_id_t * node_id
);
/**
 * Add external node
 * Create a new external node and attach it to the graph.
 *
 * @param[in] graph Graph handle.
 * @param[in] op Node operation.
 * @param[in] proc Operation proc for this node.
 * @param[out] node_id A handle to get the id of the new node,
 * pass NULL to get nothing.
 * @param[in] kernel_name Kernel name registered for this node.
*
* @return The node handle on success, or NULL otherwise.
*/
OVXLIB_API vsi_nn_node_t * vsi_nn_AddExternalNode
(
vsi_nn_graph_t * graph,
vsi_nn_op_t op,
const void * proc,
vsi_nn_node_id_t * node_id,
const char *kernel_name
);
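Usage sketch (illustrative, not from the diff) tying together the external-node APIs added in this release (vsi_nn_OpAddClientName, vsi_nn_OpRegisterExternalOvxInit, vsi_nn_AddExternalNode); the op id, proc table and kernel name are placeholders.
/* Hypothetical usage, given an initialized graph and an op proc table. */
extern vsi_nn_op_proc_t my_external_proc;                 /* placeholder proc table */
vsi_nn_op_t op = VSI_NN_OP_CUSTOM_SOFTMAX;                /* placeholder op id */
vsi_nn_OpAddClientName( op, "my_external_kernel" );
vsi_nn_OpRegisterExternalOvxInit( op, "my_external_kernel", &my_external_proc );
vsi_nn_node_t* node = vsi_nn_AddExternalNode( graph, op, &my_external_proc, NULL, "my_external_kernel" );
(void)node;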
/**
* @deprecated
* @see vsi_nn_AddNode

View File

@@ -24,14 +24,18 @@
#ifndef _VSI_NN_LOG_H
#define _VSI_NN_LOG_H
#include <stdio.h>
#include "utils/vsi_nn_util.h"
#if defined(__cplusplus)
extern "C"{
#endif
#ifdef _MSC_VER
#define snprintf _snprintf
#define snprintf(buffer, count, format, ...) \
_snprintf_s(buffer, count, _TRUNCATE, format, ##__VA_ARGS__)
#define vsnprintf(buffer, count, format, args) \
_vsnprintf_s(buffer, count, _TRUNCATE, format, args)
#endif
typedef enum _vsi_nn_log_level_e
@@ -68,4 +72,3 @@ OVXLIB_API void vsi_nn_LogMsg
#endif
#endif

View File

@@ -182,6 +182,9 @@
#include "ops/vsi_nn_op_conv3d.h"
#include "ops/vsi_nn_op_grucell_h_times_activation_r.h"
#include "ops/vsi_nn_op_grucell_activation_z_h.h"
#include "ops/vsi_nn_op_deconv3d.h"
#include "ops/vsi_nn_op_reduce_mean_internal.h"
#include "ops/vsi_nn_op_pad2.h"
/* custom node header define */
#include "custom/vsi_nn_custom_node_type.h"
@@ -350,7 +353,10 @@ typedef union _vsi_nn_nn_param
vsi_nn_conv3d_param conv3d;
vsi_nn_grucell_h_times_activation_r_param grucell_h_times_activation_r;
vsi_nn_grucell_activation_z_h_param grucell_activation_z_h;
uint8_t client_param[128];
vsi_nn_deconv3d_param deconv3d;
vsi_nn_reduce_mean_internal_param reduce_mean_internal;
vsi_nn_pad2_param pad2;
void* client_param;
/* custom node data struct define */
#define DEF_NODE_TYPE( NAME ) vsi_nn_##NAME##_param NAME;

View File

@@ -48,7 +48,7 @@ extern "C"{
* @see include/custom/custom_ops.def
* @see include/internal/internal_ops.def
*/
typedef uint32_t vsi_nn_op_t; enum
typedef int32_t vsi_nn_op_t; enum
{
#define DEF_OP( NAME, ... ) VSI_NN_OP_##NAME,
#include "interface/ops.def"
@@ -317,6 +317,13 @@ vsi_bool vsi_nn_OpRegisterOvxInit
vsi_nn_op_compute_t compute
);
vsi_bool vsi_nn_OpRegisterExternalOvxInit
(
vsi_nn_op_t op,
const char* kernel_name,
vsi_nn_op_proc_t* proc
);
/**
* Get operation name
* Get operation name string by operation id.

View File

@@ -33,7 +33,7 @@ extern "C"{
#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 1
#define VSI_NN_VERSION_PATCH 37
#define VSI_NN_VERSION_PATCH 39
#define VSI_NN_VERSION \
(VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
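/* For reference: with these values VSI_NN_VERSION evaluates to 1 * 10000 + 1 * 100 + 39 = 10139. */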

View File

@@ -77,7 +77,7 @@ DEF_KERNEL_EXECUTOR(_softmax_exec)
/* alloc the float32 data buffer */
buffer[1] = (float *)malloc(out_elements * sizeof(float));
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final );
CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final );
memset(buffer[1], 0, out_elements * sizeof(float));
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );

View File

@@ -0,0 +1,296 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.custom_warp_affine")
/*
* Kernel params
*/
static vx_param_description_t _custom_warp_affine_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _CUSTOM_WARP_AFFINE_PARAM_NUM _cnt_of_array( _custom_warp_affine_kernel_param_def )
#define SCALAR_INPUT_TYPE (2)
#define SCALAR_MATRIX_OFFSET (3)
static void _transform_affine
(
vsi_size_t dst_x,
vsi_size_t dst_y,
const float m[],
float *src_x,
float *src_y
)
{
*src_x = dst_x * m[0] + dst_y * m[2] + m[4];
*src_y = dst_x * m[1] + dst_y * m[3] + m[5];
}
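/* Note: the six matrix scalars are laid out column-major, i.e.
 *   src_x = m[0]*dst_x + m[2]*dst_y + m[4]
 *   src_y = m[1]*dst_x + m[3]*dst_y + m[5]   (see _transform_affine above). */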
static vsi_bool _read_pixel
(
float *base,
vsi_nn_kernel_tensor_attr_t *attr,
float x,
float y,
float *pixel
)
{
vsi_size_t width = attr->shape->data[0];
vsi_size_t height = attr->shape->data[1];
vsi_bool out_of_bounds = (x < 0 || y < 0 || x >= width || y >= height);
vsi_size_t bx = 0, by = 0;
if (out_of_bounds)
{
*pixel = 205.0f;
return TRUE;
}
// bounded x/y
bx = x < 0 ? 0 : x >= width ? width - 1 : (vsi_size_t)x;
by = y < 0 ? 0 : y >= height ? height - 1 : (vsi_size_t)y;
*pixel = base[by * width + bx];
return TRUE;
}
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
float* buffer[_CPU_IO_NUM] = { NULL };
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL };
int32_t type = 0;
float matrix[6] = {0};
vsi_size_t i = 0;
vsi_size_t b = 0;
vsi_size_t x = 0;
vsi_size_t y = 0;
vsi_size_t out_elements = 0;
vsi_size_t width = 0;
vsi_size_t height = 0;
vsi_size_t outer_size = 1;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
/* alloc the float32 data buffer */
buffer[1] = (float *)malloc(out_elements * sizeof(float));
CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final );
memset(buffer[1], 0, out_elements * sizeof(float));
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_TYPE],
&type);
CHECK_STATUS_FAIL_GOTO(status, final );
for (i = 0; i < 6; i++)
{
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i],
&matrix[i]);
CHECK_STATUS_FAIL_GOTO(status, final );
}
width = attr[1]->shape->data[0];
height = attr[1]->shape->data[1];
for(i = 2; i < (vsi_size_t)attr[1]->shape->size; ++i)
{
outer_size *= attr[1]->shape->data[i];
}
// Do something
for (b = 0; b < outer_size; b++)
{
float *src_base = buffer[0] + b * attr[0]->shape->data[0] * attr[0]->shape->data[1];
float *dst_base = buffer[1] + b * width * height;
for (y = 0; y < height; y++)
{
for (x = 0; x < width; x++)
{
float xf = 0;
float yf = 0;
float dst = 0;
_transform_affine(x, y, matrix, &xf, &yf);
if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR)
{
_read_pixel(src_base, attr[0], xf, yf, &dst);
dst_base[y * width + x] = dst;
}
else
{
float tl = 0, tr = 0, bl = 0, br = 0;
float ar = xf - floorf(xf);
float ab = yf - floorf(yf);
float al = 1.0f - ar;
float at = 1.0f - ab;
_read_pixel(src_base, attr[0], floorf(xf), floorf(yf), &tl);
_read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf), &tr);
_read_pixel(src_base, attr[0], floorf(xf), floorf(yf) + 1, &bl);
_read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf) + 1, &br);
dst_base[y * width + x] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab;
}
}
}
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
buffer[1], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
if( buffer[i] )
{
free( buffer[i] );
}
vsi_nn_kernel_tensor_attr_release( &attr[i] );
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _custom_warp_affine_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _custom_warp_affine_kernel_param_def );
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CUSTOM_WARP_AFFINE_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
size_t i = 0;
size_t buffer_size = 0;
int32_t type = vsi_nn_kernel_param_get_int32( params, "type");
float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size );
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_INPUT_TYPE] = vsi_nn_kernel_scalar_create(
graph, I32, &type );
for (i = 0; i < buffer_size; i++)
{
node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create(
graph, F32, &buffer[i] );
}
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TYPE] );
for (i = 0; i < buffer_size; i++)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] );
}
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( custom_warp_affine, _setup )
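Usage sketch (illustrative, not from the diff): driving the new custom op through the node API. The node-creation call and the identity matrix are placeholders; the matrix/type fields come from vsi_nn_custom_warp_affine_param and op_compute later in this diff.
/* Hypothetical usage. */
static const float identity[6] = { 1.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f };
vsi_nn_node_t* node = vsi_nn_AddNode( graph, VSI_NN_OP_CUSTOM_WARP_AFFINE, 1, 1, NULL );
node->nn_param.custom_warp_affine.matrix = identity;
node->nn_param.custom_warp_affine.type   = VSI_NN_INTERPOLATION_BILINEAR;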

View File

@@ -0,0 +1,300 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
#define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM)
#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.custom_warp_perspective")
/*
* Kernel params
*/
static vx_param_description_t _custom_warp_perspective_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM _cnt_of_array( _custom_warp_perspective_kernel_param_def )
#define SCALAR_INPUT_TYPE (2)
#define SCALAR_MATRIX_OFFSET (3)
static void _transform_perspective
(
vsi_size_t dst_x,
vsi_size_t dst_y,
const float m[],
float *src_x,
float *src_y
)
{
float z = dst_x * m[2] + dst_y * m[5] + m[8];
*src_x = (dst_x * m[0] + dst_y * m[3] + m[6]) / z;
*src_y = (dst_x * m[1] + dst_y * m[4] + m[7]) / z;
}
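/* Note: the nine matrix scalars are laid out column-major, i.e.
 *   z     =  m[2]*dst_x + m[5]*dst_y + m[8]
 *   src_x = (m[0]*dst_x + m[3]*dst_y + m[6]) / z
 *   src_y = (m[1]*dst_x + m[4]*dst_y + m[7]) / z   (see _transform_perspective above). */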
static vsi_bool _read_pixel
(
float *base,
vsi_nn_kernel_tensor_attr_t *attr,
float x,
float y,
float *pixel
)
{
vsi_size_t width = attr->shape->data[0];
vsi_size_t height = attr->shape->data[1];
vsi_bool out_of_bounds = (x < 0 || y < 0 || x >= width || y >= height);
vsi_size_t bx = 0, by = 0;
if (out_of_bounds)
{
*pixel = 205.0f;
return TRUE;
}
// bounded x/y
bx = x < 0 ? 0 : x >= width ? width - 1 : (vsi_size_t)x;
by = y < 0 ? 0 : y >= height ? height - 1 : (vsi_size_t)y;
*pixel = base[by * width + bx];
return TRUE;
}
/*
* Kernel function
*/
DEF_KERNEL_EXECUTOR(_compute)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
float* buffer[_CPU_IO_NUM] = { NULL };
vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL };
int32_t type = 0;
float matrix[9] = {0};
vsi_size_t i = 0;
vsi_size_t b = 0;
vsi_size_t x = 0;
vsi_size_t y = 0;
vsi_size_t out_elements = 0;
vsi_size_t width = 0;
vsi_size_t height = 0;
vsi_size_t outer_size = 1;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] );
/* alloc the float32 data buffer */
buffer[1] = (float *)malloc(out_elements * sizeof(float));
CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final );
memset(buffer[1], 0, out_elements * sizeof(float));
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_TYPE],
&type);
CHECK_STATUS_FAIL_GOTO(status, final );
for (i = 0; i < 9; i++)
{
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i],
&matrix[i]);
CHECK_STATUS_FAIL_GOTO(status, final );
}
width = attr[1]->shape->data[0];
height = attr[1]->shape->data[1];
for(i = 2; i < (vsi_size_t)attr[1]->shape->size; ++i)
{
outer_size *= attr[1]->shape->data[i];
}
// Do something
for (b = 0; b < outer_size; b++)
{
float *src_base = buffer[0] + b * attr[0]->shape->data[0] * attr[0]->shape->data[1];
float *dst_base = buffer[1] + b * width * height;
for (y = 0; y < height; y++)
{
for (x = 0; x < width; x++)
{
float xf = 0;
float yf = 0;
float dst = 0;
_transform_perspective(x, y, matrix, &xf, &yf);
if (type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR)
{
_read_pixel(src_base, attr[0], xf, yf, &dst);
dst_base[y * width + x] = dst;
}
else
{
float tl = 0, tr = 0, bl = 0, br = 0;
float ar = xf - floorf(xf);
float ab = yf - floorf(yf);
float al = 1.0f - ar;
float at = 1.0f - ab;
_read_pixel(src_base, attr[0], floorf(xf), floorf(yf), &tl);
_read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf), &tr);
_read_pixel(src_base, attr[0], floorf(xf), floorf(yf) + 1, &bl);
_read_pixel(src_base, attr[0], floorf(xf) + 1, floorf(yf) + 1, &br);
dst_base[y * width + x] = tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab;
}
}
}
}
status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1],
buffer[1], out_elements );
CHECK_STATUS_FAIL_GOTO( status, final );
final:
for( i = 0; i < _CPU_IO_NUM; i ++ )
{
if( buffer[i] )
{
free( buffer[i] );
}
vsi_nn_kernel_tensor_attr_release( &attr[i] );
}
return status;
} /* _compute() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
/* Add extra params */
)
{
vsi_status status = VSI_FAILURE;
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME );
kernel->info.function = _compute;
kernel->info.parameters = _custom_warp_perspective_kernel_param_def;
kernel->info.numParams = _cnt_of_array( _custom_warp_perspective_kernel_param_def );
status = VSI_SUCCESS;
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CUSTOM_WARP_PERSPECTIVE_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
size_t i = 0;
size_t buffer_size = 0;
int32_t type = vsi_nn_kernel_param_get_int32( params, "type");
float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size );
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM,
inputs, input_num, outputs, output_num );
node_params[SCALAR_INPUT_TYPE] = vsi_nn_kernel_scalar_create(
graph, I32, &type );
for (i = 0; i < buffer_size; i++)
{
node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create(
graph, F32, &buffer[i] );
}
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TYPE] );
for (i = 0; i < buffer_size; i++)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] );
}
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CPU( custom_warp_perspective, _setup )

View File

@@ -0,0 +1,295 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_dtype_util_prv.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum _custom_warp_affine_type_e
{
nearest_neighbor = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR,
bilinear = VSI_NN_INTERPOLATION_BILINEAR,
}custom_warp_affine_type_e;
#define _CUSTOM_WARP_AFFINE_KERNEL_SOURCE "custom_warp_affine"
// Add kernel hashtable here
#define CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D ) \
(( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20))
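// Note: the key packs the input dtype in bits 0..7, the output dtype in bits 8..15,
// the interpolation type in bits 16..19 and the 2D-image flag in bit 20.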
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
{ CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0 ), \
CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE), \
_CUSTOM_WARP_AFFINE_KERNEL_SOURCE }
#define PACK_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
{ CUSTOM_WARP_AFFINE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1 ), \
CVIVANTE_NAMESPACE("evis.custom_warp_affine_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_2D"), \
_CUSTOM_WARP_AFFINE_KERNEL_SOURCE }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _custom_warp_affine_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( U8, U8, nearest_neighbor ),
PACK_KERNEL_MAP( U8, U8, bilinear ),
PACK_2D_KERNEL_MAP( U8, U8, nearest_neighbor ),
PACK_2D_KERNEL_MAP( U8, U8, bilinear ),
};
/*
* Kernel params
*/
static vx_param_description_t _custom_warp_affine_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _CUSTOM_WARP_AFFINE_PARAM_NUM _cnt_of_array( _custom_warp_affine_kernel_param_def )
#define SCALAR_MATRIX_OFFSET (2)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_custom_warp_affine_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
vsi_size_array_t * out_shape = NULL;
float m[6] = {0};
float matrix0[4] = {0};
float matrix1[4] = {0};
float matrix4[4] = {0};
int32_t i = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
for (i = 0; i < 6; i++)
{
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i],
&m[i]);
CHECK_STATUS_FAIL_GOTO(status, final );
}
matrix0[0] = m[0]; matrix0[1] = m[1]; matrix0[2] = m[2]; matrix0[3] = m[3];
matrix1[0] = m[4]; matrix1[1] = m[5];
matrix4[0] = m[0]; matrix4[1] = m[1]; matrix4[2] = m[0] * 2; matrix4[3] = m[1] * 2;
out_shape = attr[1]->shape;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
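/* Note: each work-item covers 8 output pixels along x (global_scale[0]); e.g. a 640x480
 * plane gives global_size[0] = gpu_align_p2((640 + 7) / 8, 4) = 80 and global_size[1] = 480. */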
status = vsi_nn_kernel_gpu_add_param( node,
"matrix0", &matrix0 );
status |= vsi_nn_kernel_gpu_add_param( node,
"matrix1", &matrix1 );
status |= vsi_nn_kernel_gpu_add_param( node,
"matrix4", &matrix4 );
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _custom_warp_affine_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t type
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _custom_warp_affine_kernel_map;
size_t kernel_map_size = _cnt_of_array( _custom_warp_affine_kernel_map );
vx_param_description_t * param_def = _custom_warp_affine_kernel_param_def;
vx_kernel_initialize_f initializer = _custom_warp_affine_initializer;
int32_t is_2d_img = inputs[0]->attr.dim_num < 3 || inputs[0]->attr.size[2] == 1;
uint32_t key = 0;
uint32_t i = 0;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = CUSTOM_WARP_AFFINE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _custom_warp_affine_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CUSTOM_WARP_AFFINE_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
size_t i = 0;
size_t buffer_size = 0;
int32_t type = vsi_nn_kernel_param_get_int32( params, "type");
float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size );
if (vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs, type );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM,
inputs, input_num, outputs, output_num );
for (i = 0; i < buffer_size; i++)
{
node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create(
graph, F32, &buffer[i] );
}
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_AFFINE_PARAM_NUM );
for (i = 0; i < buffer_size; i++)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] );
}
// Set default border mode.
border.constant_value.U32 = 0xcdcdcdcd;
status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
CHECK_STATUS(status);
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( custom_warp_affine, _setup )

View File

@@ -0,0 +1,300 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_dtype_util_prv.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
typedef enum _custom_warp_perspective_type_e
{
nearest_neighbor = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR,
bilinear = VSI_NN_INTERPOLATION_BILINEAR,
}custom_warp_perspective_type_e;
#define _CUSTOM_WARP_PERSPECTIVE_KERNEL_SOURCE "custom_warp_perspective"
// Add kernel hashtable here
#define CUSTOM_WARP_PERSPECTIVE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, IMG_2D ) \
(( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (TYPE << 16) | (IMG_2D << 20))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
{ CUSTOM_WARP_PERSPECTIVE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 0 ), \
CVIVANTE_NAMESPACE("evis.custom_warp_perspective_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE), \
_CUSTOM_WARP_PERSPECTIVE_KERNEL_SOURCE }
#define PACK_2D_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, TYPE ) \
{ CUSTOM_WARP_PERSPECTIVE_HASH_KEY( IN_DTYPE, OUT_DTYPE, TYPE, 1 ), \
CVIVANTE_NAMESPACE("evis.custom_warp_perspective_"#TYPE"_"#IN_DTYPE"to"#IN_DTYPE"_2D"), \
_CUSTOM_WARP_PERSPECTIVE_KERNEL_SOURCE }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _custom_warp_perspective_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( U8, U8, nearest_neighbor ),
PACK_KERNEL_MAP( U8, U8, bilinear ),
PACK_2D_KERNEL_MAP( U8, U8, nearest_neighbor ),
PACK_2D_KERNEL_MAP( U8, U8, bilinear ),
};
/*
* Kernel params
*/
static vx_param_description_t _custom_warp_perspective_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
};
#define _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM _cnt_of_array( _custom_warp_perspective_kernel_param_def )
#define SCALAR_MATRIX_OFFSET (2)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_custom_warp_perspective_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL};
vsi_size_array_t * out_shape = NULL;
float m[9] = {0};
float matrix0[4] = {0};
float matrix1[4] = {0};
float matrix2[4] = {0};
float matrix4[4] = {0};
int32_t i = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
for (i = 0; i < 9; i++)
{
status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_MATRIX_OFFSET + i],
&m[i]);
CHECK_STATUS_FAIL_GOTO(status, final );
}
matrix0[0] = m[0]; matrix0[1] = m[1]; matrix0[2] = m[3]; matrix0[3] = m[4];
matrix1[0] = m[6]; matrix1[1] = m[7]; matrix1[2] = m[2]; matrix1[3] = m[5];
matrix2[0] = m[8];
matrix4[0] = m[0]; matrix4[1] = m[1]; matrix4[2] = m[0] * 2; matrix4[3] = m[1] * 2;
out_shape = attr[1]->shape;
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = gpu_align_p2(
(out_shape->data[0] + gpu_param.global_scale[0] - 1)
/ gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (
(out_shape->data[1] + gpu_param.global_scale[1] - 1)
/ gpu_param.global_scale[1]);
gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1;
status = vsi_nn_kernel_gpu_add_param( node,
"matrix0", &matrix0 );
status |= vsi_nn_kernel_gpu_add_param( node,
"matrix1", &matrix1 );
status |= vsi_nn_kernel_gpu_add_param( node,
"matrix2", &matrix2 );
status |= vsi_nn_kernel_gpu_add_param( node,
"matrix4", &matrix4 );
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
if (attr[1])
{
vsi_nn_kernel_tensor_attr_release( &attr[1] );
attr[1] = NULL;
}
return status;
} /* _custom_warp_perspective_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t type
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _custom_warp_perspective_kernel_map;
size_t kernel_map_size = _cnt_of_array( _custom_warp_perspective_kernel_map );
vx_param_description_t * param_def = _custom_warp_perspective_kernel_param_def;
vx_kernel_initialize_f initializer = _custom_warp_perspective_initializer;
int32_t is_2d_img = inputs[0]->attr.dim_num < 3 || inputs[0]->attr.size[2] == 1;
uint32_t key = 0;
uint32_t i = 0;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = CUSTOM_WARP_PERSPECTIVE_HASH_KEY( in_dtype, out_dtype, type, is_2d_img );
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _custom_warp_perspective_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"vsi_nn_kernel_header",
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_CUSTOM_WARP_PERSPECTIVE_PARAM_NUM];
vsi_nn_kernel_node_t node = NULL;
size_t i = 0;
size_t buffer_size = 0;
int32_t type = vsi_nn_kernel_param_get_int32( params, "type");
float * buffer = (float*)vsi_nn_kernel_param_get_const_buffer( params, "matrix", &buffer_size );
if (vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs, type );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
vx_border_t border;
border.mode = VX_BORDER_CONSTANT;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM,
inputs, input_num, outputs, output_num );
for (i = 0; i < buffer_size; i++)
{
node_params[SCALAR_MATRIX_OFFSET + i] = vsi_nn_kernel_scalar_create(
graph, F32, &buffer[i] );
}
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _CUSTOM_WARP_PERSPECTIVE_PARAM_NUM );
for (i = 0; i < buffer_size; i++)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_MATRIX_OFFSET + i] );
}
// Set default border mode.
border.constant_value.U32 = 0xcdcdcdcd;
status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) );
CHECK_STATUS(status);
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_EVIS( custom_warp_perspective, _setup )

View File

@@ -0,0 +1,136 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
typedef struct _ainr_denoise_postprocess_local_data_t {
int32_t placeholder;
} ainr_denoise_postprocess_local_data_t;
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_SUCCESS;
#if defined(VX_DENOISE_POSTPROCESS_SUPPORT) && VX_DENOISE_POSTPROCESS_SUPPORT
self->n = vxDenoisePostProcesslayer(
self->graph->g,
REQUIRED_IO(inputs[0]), // currInput
REQUIRED_IO(inputs[1]), // nnOutput
REQUIRED_IO(inputs[2]), // preOutImg
REQUIRED_IO(inputs[3]), // S0
REQUIRED_IO(inputs[4]), // C0
REQUIRED_IO(inputs[5]), // C1
REQUIRED_IO(inputs[6]), // C2
REQUIRED_IO(inputs[7]), // C3
REQUIRED_IO(inputs[8]), // clampMin
REQUIRED_IO(inputs[9]), // clampMax
REQUIRED_IO(outputs[0]) // output
);
#else
self->n = NULL;
#endif
if(NULL == self->n)
{
VSILOGE( "Create vxDenoisePostProcesslayer fail." );
return VSI_FAILURE;
}
return status;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
return TRUE;
} /* op_setup() */
static vsi_status op_init
(
vsi_nn_node_t* self
)
{
return VSI_SUCCESS;
} /* op_init() */
static vsi_status op_deinit
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
status = vsi_nn_op_common_deinit(self);
return status;
} /* op_deinit() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ CUSTOM_AINR_DENOISE_POSTPROCESS,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ 10,
/* output_num */ 1
);
__END_DECLS
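/* Note: the compute path above is compiled in only when the driver exposes
 * vxDenoisePostProcesslayer. A hedged sketch of opting in -- the macro name comes from the
 * guard in op_compute, the build mechanism is an assumption (any way of defining it works):
 *   CFLAGS += -DVX_DENOISE_POSTPROCESS_SUPPORT=1
 * Without the macro, op_compute leaves self->n == NULL and graph setup fails with
 * "Create vxDenoisePostProcesslayer fail.". */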

View File

@ -0,0 +1,136 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
typedef struct _custom_warp_affine_local_data_t {
int32_t placeholder;
} custom_warp_affine_local_data_t;
/*
Declare the number of inputs and outputs.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_custom_warp_affine_param * p;
p = &(self->nn_param.custom_warp_affine);
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_const_buffer( param, "matrix", p->matrix, 6 );
vsi_nn_kernel_param_add_int32( param, "type", p->type);
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
"custom_warp_affine",
inputs, 1,
outputs, 1, param );
vsi_nn_kernel_param_release( &param );
return VSI_SUCCESS;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
uint32_t i = 0;
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
outputs[0]->attr.size[0] = self->nn_param.custom_warp_affine.size[0];
outputs[0]->attr.size[1] = self->nn_param.custom_warp_affine.size[1];
for (i = 2; i < outputs[0]->attr.dim_num; i++)
{
outputs[0]->attr.size[i] = inputs[0]->attr.size[i];
}
}
return TRUE;
} /* op_setup() */
static vsi_status op_deinit
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
status = vsi_nn_op_common_deinit(self);
return status;
} /* op_deinit() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ CUSTOM_WARP_AFFINE,
/* init */ NULL,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS
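/* Illustration only: a minimal graph-level usage sketch, assuming the generated enum name
 * VSI_NN_OP_CUSTOM_WARP_AFFINE and the standard ovxlib vsi_nn_AddNode() helper (tensor wiring
 * omitted). The matrix/type/size fields are the ones read by op_compute/op_setup above. */
static void example_add_custom_warp_affine( vsi_nn_graph_t * graph )
{
    vsi_nn_node_t * node = vsi_nn_AddNode( graph, VSI_NN_OP_CUSTOM_WARP_AFFINE, 1, 1, NULL );
    if ( node )
    {
        float m[6] = { 1.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f }; /* identity 2x3 affine matrix */
        int32_t i = 0;
        for ( i = 0; i < 6; i++ )
        {
            node->nn_param.custom_warp_affine.matrix[i] = m[i];
        }
        node->nn_param.custom_warp_affine.type    = 0;   /* interpolation type consumed by the kernel */
        node->nn_param.custom_warp_affine.size[0] = 224; /* output width  */
        node->nn_param.custom_warp_affine.size[1] = 224; /* output height */
    }
}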

View File

@ -0,0 +1,136 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <string.h>
#include <stdlib.h>
#include "vsi_nn_types.h"
#include "vsi_nn_log.h"
#include "vsi_nn_node.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
typedef struct _custom_warp_perspective_local_data_t {
int32_t placeholder;
} custom_warp_perspective_local_data_t;
/*
Declare number of input and output.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)
static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_nn_kernel_param_t * param = NULL;
vsi_nn_custom_warp_perspective_param * p;
p = &(self->nn_param.custom_warp_perspective);
param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_const_buffer( param, "matrix", p->matrix, 9 );
vsi_nn_kernel_param_add_int32( param, "type", p->type);
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
"custom_warp_perspective",
inputs, 1,
outputs, 1, param );
vsi_nn_kernel_param_release( &param );
return VSI_SUCCESS;
} /* op_compute() */
static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
/*TODO: Check tensor shapes. */
return TRUE;
} /* op_check() */
static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
{
uint32_t i = 0;
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
outputs[0]->attr.size[0] = self->nn_param.custom_warp_perspective.size[0];
outputs[0]->attr.size[1] = self->nn_param.custom_warp_perspective.size[1];
for (i = 2; i < outputs[0]->attr.dim_num; i++)
{
outputs[0]->attr.size[i] = inputs[0]->attr.size[i];
}
}
return TRUE;
} /* op_setup() */
static vsi_status op_deinit
(
vsi_nn_node_t* self
)
{
vsi_status status = VSI_SUCCESS;
status = vsi_nn_op_common_deinit(self);
return status;
} /* op_deinit() */
__BEGIN_DECLS
/* Registrar */
DEF_OP_REG
(
/* op_name */ CUSTOM_WARP_PERSPECTIVE,
/* init */ NULL,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);
__END_DECLS

View File

@ -64,14 +64,16 @@ typedef struct
static const _kernel_map_type _clip_kernel_map[] =
{
PACK_KERNEL_MAP(F32, F32),
PACK_KERNEL_MAP(F32, U8),
PACK_KERNEL_MAP(U8, U8),
PACK_KERNEL_MAP(U8, F32),
PACK_KERNEL_MAP_2D(F32, F32),
PACK_KERNEL_MAP_2D(F32, U8),
PACK_KERNEL_MAP_2D(U8, U8),
PACK_KERNEL_MAP_2D(U8, F32),
PACK_KERNEL_MAP(F32, F32),
PACK_KERNEL_MAP(F32, U8),
PACK_KERNEL_MAP(U8, U8),
PACK_KERNEL_MAP(U8, F32),
PACK_KERNEL_MAP(BF16, BF16),
PACK_KERNEL_MAP_2D(F32, F32),
PACK_KERNEL_MAP_2D(F32, U8),
PACK_KERNEL_MAP_2D(U8, U8),
PACK_KERNEL_MAP_2D(U8, F32),
PACK_KERNEL_MAP_2D(BF16, BF16),
};

View File

@ -0,0 +1,226 @@
/****************************************************************************
*
* Copyright (c) 2019 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_error.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_eltwise.h"
__BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _DEPTH2SPACE_CRD_KERNEL_SOURCE "depth2space_crd"
// Add kernel hashtable here
#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F32TOF32 CVIVANTE_NAMESPACE("cl.depth2space_crd_F32toF32")
// Add kernel hashtable here
#define HASH_DEPTH2SPACE_CRD_KEY(_input0_type, _output_type, _blk_size) \
((_input0_type << 24) | (_output_type << 16) | (_blk_size << 8))
#define TENSOR_DEPTH2SPACE_CRD_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \
{ HASH_DEPTH2SPACE_CRD_KEY(IN0_TYPE, OUT_TYPE, 0), \
VX_KERNEL_NAME_DEPTH2SPACE_CRD_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
const char* source_name;
} depth2space_crd_map[] =
{
TENSOR_DEPTH2SPACE_CRD_KERNELS(F32, F32, _DEPTH2SPACE_CRD_KERNEL_SOURCE)
};
/*
* Kernel params
*/
static vx_param_description_t _depth2space_crd_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _DEPTH2SPACE_CRD_PARAM_NUM _cnt_of_array( _depth2space_crd_kernel_param_def )
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
3,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL };
int32_t output_dims = 0;
int32_t output_width = 0;
int32_t output_height = 0;
int32_t output_chn = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
output_dims = (int32_t)attr[0]->shape->size;
output_width = (int32_t)(attr[0]->shape->data[0]);
output_height = (int32_t)(attr[0]->shape->data[1]);
output_chn = (int32_t)(output_dims > 2 ? attr[0]->shape->data[2] : 1);
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.global_scale[2] = 1;
gpu_param.global_size[0] = output_width;
gpu_param.global_size[1] = output_height;
gpu_param.global_size[2] = output_chn;
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final);
final:
if (attr[0])
{
vsi_nn_kernel_tensor_attr_release( &attr[0] );
attr[0] = NULL;
}
return status;
} /* _depth2space_crd_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e input0_dtype = U8;
vsi_nn_kernel_dtype_e output_dtype = U8;
uint32_t key = 0;
int i = 0;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_DEPTH2SPACE_CRD_KEY( input0_dtype, output_dtype, 0 );
for ( i = 0; i < _cnt_of_array(depth2space_crd_map); i ++ )
{
if ( depth2space_crd_map[i].key == key )
{
break;
}
}
if ( i < _cnt_of_array(depth2space_crd_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", depth2space_crd_map[i].function_name );
kernel->info.parameters = _depth2space_crd_kernel_param_def;
kernel->info.numParams = _DEPTH2SPACE_CRD_PARAM_NUM;
kernel->info.initialize = _depth2space_crd_initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2,
"eltwise_ops_helper",
depth2space_crd_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
depth2space_crd_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_DEPTH2SPACE_CRD_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Pass parameters to node. */
vsi_nn_kernel_node_pack_io( node_params, _DEPTH2SPACE_CRD_PARAM_NUM,
inputs, 1, outputs, 1 );
node_params[2] = vsi_nn_kernel_scalar_create( graph, I32, &block_size );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _DEPTH2SPACE_CRD_PARAM_NUM );
vsi_nn_kernel_scalar_release( &node_params[2] );
}
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( depth2space_internal, _setup )
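/* Illustration only: a hedged scalar reference of the CRD ("column-row-depth") depth-to-space
 * mapping this kernel targets. The CL shader itself is not shown here, so the index math below
 * follows the standard CRD definition rather than the shipped source. For a W x H x C input and
 * block size b, the output is (W*b) x (H*b) x (C/(b*b)). */
static void depth2space_crd_ref( const float * in, float * out, int W, int H, int C, int b )
{
    int ox, oy, oc;
    int OW = W * b, OH = H * b, OC = C / (b * b);
    for ( oc = 0; oc < OC; oc++ )
    for ( oy = 0; oy < OH; oy++ )
    for ( ox = 0; ox < OW; ox++ )
    {
        int ix = ox / b, iy = oy / b;
        int ic = oc * b * b + (oy % b) * b + (ox % b); /* CRD: the reduced channel varies slowest */
        out[(oc * OH + oy) * OW + ox] = in[(ic * H + iy) * W + ix];
    }
}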

View File

@ -42,6 +42,7 @@ __BEGIN_DECLS
typedef enum
{
UNARY_SIN,
UNARY_COS,
UNARY_EXP,
UNARY_LOG,
UNARY_ELU,
@ -89,6 +90,7 @@ typedef enum
VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() },
#define SIN_OPERATION sin
#define COS_OPERATION cos
#define EXP_OPERATION exp
#define LOG_OPERATION log
#define ELU_OPERATION elu
@ -107,6 +109,8 @@ static const struct {
{
TENSOR_UNARY_KERNELS_FLOAT(SIN_OPERATION, UNARY_SIN, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(SIN_OPERATION, UNARY_SIN, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(COS_OPERATION, UNARY_COS, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(COS_OPERATION, UNARY_COS, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(EXP_OPERATION, UNARY_EXP, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT(EXP_OPERATION, UNARY_EXP, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT(LOG_OPERATION, UNARY_LOG, F32, F32)
@ -128,6 +132,8 @@ static const struct {
TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(COS_OPERATION, UNARY_COS, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(COS_OPERATION, UNARY_COS, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(EXP_OPERATION, UNARY_EXP, F32, F32)
TENSOR_UNARY_KERNELS_FLOAT_2D(EXP_OPERATION, UNARY_EXP, F16, F16)
TENSOR_UNARY_KERNELS_FLOAT_2D(LOG_OPERATION, UNARY_LOG, F32, F32)
@ -148,6 +154,7 @@ static const struct {
TENSOR_UNARY_KERNELS_FLOAT_2D(HGELU_OPERATION, UNARY_HGELU, F16, F16)
TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, U8, U8)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, U8, U8)
TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, U8, U8)
TENSOR_UNARY_KERNELS(LOG_OPERATION, UNARY_LOG, U8, U8)
TENSOR_UNARY_KERNELS(ELU_OPERATION, UNARY_ELU, U8, U8)
@ -159,6 +166,7 @@ static const struct {
TENSOR_UNARY_KERNELS(HGELU_OPERATION, UNARY_HGELU, U8, U8)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8)
TENSOR_UNARY_KERNELS_2D(LOG_OPERATION, UNARY_LOG, U8, U8)
TENSOR_UNARY_KERNELS_2D(ELU_OPERATION, UNARY_ELU, U8, U8)
@ -175,6 +183,7 @@ static const struct {
};
#undef SIN_OPERATION
#undef COS_OPERATION
#undef EXP_OPERATION
#undef LOG_OPERATION
#undef ELU_OPERATION
@ -438,6 +447,7 @@ OnError:
REGISTER_ELTWISE_UNARY_BACKEND_CL( sin, UNARY_SIN )
REGISTER_ELTWISE_UNARY_BACKEND_CL( cos, UNARY_COS )
REGISTER_ELTWISE_UNARY_BACKEND_CL( exp, UNARY_EXP )
REGISTER_ELTWISE_UNARY_BACKEND_CL( log, UNARY_LOG )
REGISTER_ELTWISE_UNARY_BACKEND_CL( elu, UNARY_ELU )

View File

@ -103,7 +103,6 @@ static vx_param_description_t _floordiv_kernel_param_def[] =
#define SCALAR_OUTPUT_SCALE (7)
#define SCALAR_OUTPUT_TAIL (8)
#define FLOORDIV_PARAM_NUM 3
#define FLOORDIV_QUANT_PARAM_NUM _cnt_of_array( _floordiv_kernel_param_def )
/*
@ -154,8 +153,6 @@ final:
return status;
} /* _floordiv_initializer() */
/*
* Query kernel
*/
@ -164,8 +161,7 @@ static vsi_status _query_kernel
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
vsi_bool image_2d,
vsi_bool *is_use_u8_kernel
vsi_bool image_2d
)
{
vsi_status status = VSI_FAILURE;
@ -189,7 +185,7 @@ static vsi_status _query_kernel
{
in0_dtype = F32;
}
else if (I16 == in0_dtype)
else if (I16 == in0_dtype || I8 == in0_dtype)
{
in0_dtype = I32;
}
@ -198,7 +194,7 @@ static vsi_status _query_kernel
{
in1_dtype = F32;
}
else if (I16 == in1_dtype)
else if (I16 == in1_dtype || I8 == in1_dtype)
{
in1_dtype = I32;
}
@ -207,16 +203,9 @@ static vsi_status _query_kernel
{
out_dtype = F32;
}
if ((U8 == in0_dtype) || (U8 == in1_dtype) || (U8 == out_dtype))
else if (I16 == out_dtype || I8 == out_dtype)
{
param_def_size = FLOORDIV_QUANT_PARAM_NUM;
*is_use_u8_kernel = TRUE;
}
else
{
param_def_size = FLOORDIV_PARAM_NUM;
*is_use_u8_kernel = FALSE;
out_dtype = I32;
}
key = FLOORDIV_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d);
@ -228,7 +217,7 @@ static vsi_status _query_kernel
break;
}
}
if( i < kernel_map_size )
if ( i < kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
@ -262,19 +251,18 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_FLOORDIV_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_bool image_2d = FALSE;
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float input0Scale = vsi_nn_get_tensor_scale(inputs[0]);
float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
vsi_bool is_use_u8_kernel = FALSE;
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]);
float input0Scale = vsi_nn_get_tensor_scale(inputs[0]);
float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
float input1Scale = vsi_nn_get_tensor_scale(inputs[1]);
float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]);
outputScale = 1.0f / outputScale;
input0Tail = -(input0Tail * input0Scale);
input1Tail = -(input1Tail * input1Scale);
if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
@ -282,40 +270,35 @@ static vsi_nn_kernel_node_t _setup
image_2d = (outputs[0]->attr.dim_num == 2);
status = _query_kernel( kernel, inputs, outputs, image_2d, &is_use_u8_kernel);
if( VSI_SUCCESS == status)
status = _query_kernel( kernel, inputs, outputs, image_2d);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
size_t node_params_num = FLOORDIV_PARAM_NUM;
size_t node_params_num = FLOORDIV_QUANT_PARAM_NUM;
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _FLOORDIV_PARAM_NUM,
inputs, input_num, outputs, output_num );
if (is_use_u8_kernel)
{
node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale );
node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input0Tail );
node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale );
node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input1Tail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail );
node_params_num = FLOORDIV_QUANT_PARAM_NUM;
}
node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale );
node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input0Tail );
node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale );
node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input1Tail );
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale );
node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &outputTail );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
VSI_ASSERT( status == VSI_SUCCESS );
if (is_use_u8_kernel)
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] );
}
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT0_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT1_TAIL] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] );
vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] );
}
}
return node;
} /* _setup() */
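/* Illustration only: a hedged sketch of the per-element math the scalars above encode
 * (what the CL kernel is expected to evaluate; reference, not the shipped shader).
 * input0Tail = -(zp0 * scale0), outputScale = 1/scale_out, outputTail = zp_out -- see _setup. */
static float floordiv_ref( float q0, float q1,
        float in0_scale, float in0_tail,
        float in1_scale, float in1_tail,
        float out_scale, float out_tail )
{
    float x0 = q0 * in0_scale + in0_tail; /* dequantize input 0 */
    float x1 = q1 * in1_scale + in1_tail; /* dequantize input 1 */
    return floorf( x0 / x1 ) * out_scale + out_tail; /* floor-divide, then requantize (needs <math.h>) */
}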

View File

@ -47,7 +47,8 @@ typedef enum
INTERNAL_KERNEL_GATHER,
} _internal_kernel_e;
#define _GATHER_KERNEL_SOURCE "gather"
#define _GATHER_KERNEL_SOURCE "gather"
#define _GATHER_BATCH_KERNEL_SOURCE "gather_batch"
// Add kernel hashtable here
#define VX_KERNEL_NAME_GATHER_U8TOU8 CVIVANTE_NAMESPACE("cl.gather_U8toU8")
@ -55,25 +56,39 @@ typedef enum
#define VX_KERNEL_NAME_GATHER_I32TOI32 CVIVANTE_NAMESPACE("cl.gather_I32toI32")
#define VX_KERNEL_NAME_GATHER_F32TOF32 CVIVANTE_NAMESPACE("cl.gather_F32toF32")
#define VX_KERNEL_NAME_GATHER_BATCH_U8TOU8 CVIVANTE_NAMESPACE("cl.gather_batch_U8toU8")
#define VX_KERNEL_NAME_GATHER_BATCH_F16TOF16 CVIVANTE_NAMESPACE("cl.gather_batch_F16toF16")
#define VX_KERNEL_NAME_GATHER_BATCH_I32TOI32 CVIVANTE_NAMESPACE("cl.gather_batch_I32toI32")
#define VX_KERNEL_NAME_GATHER_BATCH_F32TOF32 CVIVANTE_NAMESPACE("cl.gather_batch_F32toF32")
// Add kernel hashtable here
#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _image_2d) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d))
#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _image_2d, _batch) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_image_2d << 4) | (_batch))
#define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0), \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0), \
VX_KERNEL_NAME_GATHER_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_GATHER_BATCH_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 1), \
VX_KERNEL_NAME_GATHER_BATCH_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
const char* source_name;
} gather_map[] =
{
TENSOR_GATHER_KERNELS(U8, I32, U8, _GATHER_KERNEL_SOURCE)
TENSOR_GATHER_KERNELS(U8, I32, U8, _GATHER_KERNEL_SOURCE)
TENSOR_GATHER_KERNELS(F16, I32, F16, _GATHER_KERNEL_SOURCE)
TENSOR_GATHER_KERNELS(I32, I32, I32, _GATHER_KERNEL_SOURCE)
TENSOR_GATHER_KERNELS(F32, I32, F32, _GATHER_KERNEL_SOURCE)
TENSOR_GATHER_BATCH_KERNELS(U8, I32, U8, _GATHER_BATCH_KERNEL_SOURCE)
TENSOR_GATHER_BATCH_KERNELS(F16, I32, F16, _GATHER_BATCH_KERNEL_SOURCE)
TENSOR_GATHER_BATCH_KERNELS(I32, I32, I32, _GATHER_BATCH_KERNEL_SOURCE)
TENSOR_GATHER_BATCH_KERNELS(F32, I32, F32, _GATHER_BATCH_KERNEL_SOURCE)
};
/*
@ -88,6 +103,7 @@ static vx_param_description_t _gather_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _GATHER_PARAM_NUM _cnt_of_array( _gather_kernel_param_def )
@ -97,6 +113,7 @@ static vsi_status cal_gather_tensor_reshape_size
vsi_nn_tensor_t ** inputs,
vsi_size_t sizes[VSI_NN_MAX_DIM_NUM],
uint32_t block_size,
vsi_size_t batch_dims,
uint32_t idxFlg
)
{
@ -105,30 +122,37 @@ static vsi_status cal_gather_tensor_reshape_size
vsi_size_t *input_size = inputs[0]->attr.size;
uint32_t i = 0;
vsi_size_t elementCnt = 1;
vsi_size_t outerCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
for(i = 0; i < dims_num; ++i)
for (i = 0; i < dims_num - batch_dims; ++i)
{
elementCnt *= input_size[i];
}
for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
for (; i < dims_num; ++i)
{
outerCnt *= input_size[i];
}
for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
{
sizes[i] = 1;
}
if(idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH)
if (idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH)
{
sizes[0] = elementCnt;
sizes[1] = 1;
sizes[1] = outerCnt;
status = VSI_SUCCESS;
}
else
{
if((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH)
{
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
sizes[2] = outerCnt;
status = VSI_SUCCESS;
}
}
@ -160,9 +184,9 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
vsi_size_array_t * input1_shape = NULL;
int32_t block_size = 0;
int32_t block_num = 0;
vsi_ssize_t indices_num = 1;
size_t input_dims1 = 0;
size_t i = 0;
vsi_ssize_t indices_num = 1;
size_t input_dims1 = 0;
size_t i = 0;
attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
@ -176,7 +200,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
input1_shape = attr[1]->shape;
input_dims1 = input1_shape->size;
for (i = 0; i < input_dims1; i++)
for (i = 0; i < input_dims1 - 1; i++)
{
indices_num *= input1_shape->data[i];
}
@ -214,7 +238,8 @@ static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs
vsi_nn_tensor_t * const * const outputs,
int32_t is_batch
/* Add extra params */
)
{
@ -227,17 +252,17 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, 0 );
key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, 0, is_batch );
for( i = 0; i < _cnt_of_array(gather_map); i ++ )
for ( i = 0; i < _cnt_of_array(gather_map); i ++ )
{
if( gather_map[i].key == key )
if ( gather_map[i].key == key )
{
break;
}
}
if( i < _cnt_of_array(gather_map) )
if ( i < _cnt_of_array(gather_map) )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", gather_map[i].function_name );
kernel->info.parameters = _gather_kernel_param_def;
@ -271,54 +296,69 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t node_params[_GATHER_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" );
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" );
int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" );
int32_t indices_num = vsi_nn_kernel_param_get_int32( params, "indices_num" );
int32_t is_batch = batch_dims > 0 ? 1 : 0;
vsi_size_t rs_dim = batch_dims == 0 ? 2 : 3;
int32_t i = 0;
status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0);
status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1);
status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0);
if(status != VSI_SUCCESS)
status = cal_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, batch_dims, 0);
status |= cal_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1);
status |= cal_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, batch_dims, 0);
if (status != VSI_SUCCESS)
{
return NULL;
}
if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], rs_dim );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
inputs[1], shapes[1], 2 );
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[2], rs_dim );
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( kernel, inputs, outputs /* Add extra params */ );
if( VSI_SUCCESS == status)
status = _query_kernel( kernel, inputs, outputs, is_batch );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 0;
#define RESHAPE_DIM 2
uint32_t index = 3;
int32_t batch = (int32_t)shapes[1][1];
/* Pass parameters to node. */
node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], RESHAPE_DIM );
node_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[1], RESHAPE_DIM );
node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], RESHAPE_DIM );
#undef RESHAPE_DIM
vsi_nn_kernel_node_pack_io( node_params, _GATHER_PARAM_NUM,
reshape_tensors, 2, &reshape_tensors[2], 1 );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &indices_num );
node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &batch );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, node_params, _GATHER_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &node_params[0] );
vsi_nn_kernel_tensor_release( &node_params[1] );
vsi_nn_kernel_tensor_release( &node_params[2] );
vsi_nn_kernel_scalar_release( &node_params[3] );
vsi_nn_kernel_scalar_release( &node_params[4] );
vsi_nn_kernel_scalar_release( &node_params[5] );
vsi_nn_kernel_scalar_release( &node_params[6] );
vsi_nn_kernel_scalar_release( &node_params[7] );
}
}
for (i = 0; i < 3; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
} /* _setup() */
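/* A hedged worked example of the batch_dims handling above, in whcn ordering:
 * params [6, 4, 3, 2], indices [5, 2], batch_dims = 1, block_size = 6, axis_num = 4, block_num = 3.
 *   params : elementCnt = 6*4*3 = 72, outerCnt = 2  -> reshaped to {6, 12, 2}
 *   indices: elementCnt = 5,          outerCnt = 2  -> reshaped to {5, 2}
 *   output : [6, 5, 3, 2]                           -> reshaped to {6, 15, 2}
 * The initializer then counts indices_num = 5 (all but the last reshaped indices dim) and the
 * extra "batch" scalar passed to the kernel is shapes[1][1] = 2. */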

View File

@ -22,7 +22,6 @@
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -445,45 +444,6 @@ static vsi_status _query_kernel
return status;
} /* _query_kernel() */
static int32_t _optimize_gn_shape_cl
(
vsi_nn_tensor_t ** inputs,
vsi_size_t group_size,
int32_t group_num,
vsi_size_t* opt_shape,
int32_t* is2D_flg
)
{
vsi_status status = VSI_SUCCESS;
vsi_size_t group_shape[VSI_NN_MAX_DIM_NUM] = {0};
vsi_size_t new_rank = 0;
group_shape[0] = inputs[0]->attr.size[0];
group_shape[1] = inputs[0]->attr.size[1];
group_shape[2] = group_size;
vsi_nn_kernel_optimize_element_shape(group_shape, 3, opt_shape, &new_rank );
if (opt_shape[1] == 1)
{
opt_shape[1] = group_num;
opt_shape[2] = 1;
opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
is2D_flg[0] = 1;
}
else if (new_rank == 2)
{
opt_shape[2] = group_num;
opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
}
else
{
status = VSI_FAILURE;
}
return status;
}
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
@ -535,11 +495,13 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
status = _optimize_gn_shape_cl(inputs, group_size, group_num, new_shape, &is2D_flg);
status = vsi_nn_kernel_optimize_group_norm_shape( (const vsi_size_t*)inputs[0]->attr.size,
inputs[0]->attr.dim_num, group_num, 0, new_shape);
if ( VSI_SUCCESS != status )
{
goto final;
}
is2D_flg = (new_shape[2] == 1) && ((int32_t)new_shape[1] == group_num);
rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4);
rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4);

View File

@ -406,12 +406,12 @@ static vsi_nn_kernel_node_t _setup
uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 };
uint32_t hashkey = 0;
int32_t i = 0;
uint32_t rank = outputs[0]->attr.dim_num;
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
int32_t reshape_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" );
size_t width = inputs[0]->attr.size[0];
size_t height = inputs[0]->attr.size[1];
int32_t reshape_flg = outputs[0]->attr.size[1] * outputs[0]->attr.size[2] < GPU_TENSOR_MAX_WIDTH
&& rank > 2;
int32_t group_num = (int32_t)(width + 15) / 16;
int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]);
float input_scale = vsi_nn_get_tensor_scale(inputs[0]);

View File

@ -101,18 +101,23 @@ static const _kernel_map_type moments_map[] =
TENSOR_MOMENTS_KERNELS(U8, F32, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS(F32, F32, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS(I32, F32, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS(BF16,F32, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS(U8, F32, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS(F32, F32, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS(I32, F32, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS(BF16,F32, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS(U8, F32, 2, KERNEL_SOURCE_3)
TENSOR_MOMENTS_KERNELS(F32, F32, 2, KERNEL_SOURCE_3)
TENSOR_MOMENTS_KERNELS(I32, F32, 2, KERNEL_SOURCE_3)
TENSOR_MOMENTS_KERNELS(BF16,F32, 2, KERNEL_SOURCE_3)
TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, F32, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS(F32, F32, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS(I32, F32, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS(BF16,F32, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, F32, 0, 1, 2, KERNEL_SOURCE_5)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(F32, F32, 0, 1, 2, KERNEL_SOURCE_5)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(I32, F32, 0, 1, 2, KERNEL_SOURCE_5)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(BF16,F32, 0, 1, 2, KERNEL_SOURCE_5)
};
/*

View File

@ -0,0 +1,301 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_log.h"
#include "vsi_nn_error.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"
__BEGIN_DECLS
#define _TOPK_KERNEL_SOURCE "topk"
#define STR(a) #a
// Add kernel hashtable here
#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ) \
( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) )
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, STAGES ) \
{ TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ), \
CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
_TOPK_KERNEL_SOURCE }
typedef struct
{
uint32_t key;
char * function_name;
const char * source_name;
} _kernel_map_type;
static const _kernel_map_type _topk_kernel_map[] =
{
// Register kernel here
PACK_KERNEL_MAP( F32, F32, 0 ),
PACK_KERNEL_MAP( F32, F32, 1 ),
PACK_KERNEL_MAP( F32, F32, 2 ),
PACK_KERNEL_MAP( F32, F32, 3 ),
PACK_KERNEL_MAP( F32, F32, 4 ),
PACK_KERNEL_MAP( F32, F32, 5 ),
PACK_KERNEL_MAP( F32, F32, 6 ),
PACK_KERNEL_MAP( U32, U32, 0 ),
PACK_KERNEL_MAP( U32, U32, 1 ),
PACK_KERNEL_MAP( U32, U32, 2 ),
PACK_KERNEL_MAP( U32, U32, 3 ),
PACK_KERNEL_MAP( U32, U32, 4 ),
PACK_KERNEL_MAP( U32, U32, 5 ),
PACK_KERNEL_MAP( U32, U32, 6 ),
PACK_KERNEL_MAP( I32, I32, 0 ),
PACK_KERNEL_MAP( I32, I32, 1 ),
PACK_KERNEL_MAP( I32, I32, 2 ),
PACK_KERNEL_MAP( I32, I32, 3 ),
PACK_KERNEL_MAP( I32, I32, 4 ),
PACK_KERNEL_MAP( I32, I32, 5 ),
PACK_KERNEL_MAP( I32, I32, 6 ),
};
/*
* Kernel params
*/
static vx_param_description_t _topk_kernel_param_def[] =
{
{VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _TOPK_PARAM_NUM _cnt_of_array( _topk_kernel_param_def )
#define SCALAR_INPUT_NUM_STAGES (3)
#define SCALAR_INPUT_WIDTH (4)
/*
* Kernel initializer
*/
DEF_KERNEL_INITIALIZER(_topk_initializer)
(
vsi_nn_kernel_node_t node,
const vsi_nn_kernel_node_param_t * param,
size_t param_size
)
{
vsi_status status = VSI_FAILURE;
gpu_param_t gpu_param = {
2,
{0, 0, 0},
{0, 0, 0},
{0, 0, 0},
{0, 0, 0}
};
vsi_nn_kernel_tensor_attr_t * input_attr = NULL;
vsi_size_array_t * in_shape = NULL;
int32_t num_stages = 0;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_NUM_STAGES], &num_stages);
CHECK_STATUS_FAIL_GOTO(status, final );
in_shape = input_attr->shape;
gpu_param.global_scale[0] = 1;
gpu_param.global_scale[1] = 1;
gpu_param.local_size[0] = (size_t)(1 << num_stages);
gpu_param.local_size[1] = 1;
gpu_param.global_size[0] = (size_t)(1 << num_stages);
gpu_param.global_size[1] = in_shape->data[1];
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
SAFE_FREE_TENSOR_ATTR(input_attr);
return status;
} /* _topk_initializer() */
/*
* Query kernel
*/
static vsi_status _query_kernel
(
vsi_nn_kernel_t * kernel,
vsi_nn_tensor_t * const * const inputs,
vsi_nn_tensor_t * const * const outputs,
int32_t num_stages
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_dtype_e in_dtype;
vsi_nn_kernel_dtype_e out_dtype;
const _kernel_map_type * kernel_map = _topk_kernel_map;
size_t kernel_map_size = _cnt_of_array( _topk_kernel_map );
vx_param_description_t * param_def = _topk_kernel_param_def;
vx_kernel_initialize_f initializer = _topk_initializer;
#define _PACK_SELECT_KEY( in_type, out_type ) \
( (in_type) | (out_type << 8) )
uint32_t key = 0;
uint32_t i;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
{
case _PACK_SELECT_KEY(F32, F32):
case _PACK_SELECT_KEY(F16, F16):
key = TOPK_HASH_KEY( F32, F32, num_stages );
break;
case _PACK_SELECT_KEY(U32, U32):
case _PACK_SELECT_KEY(U16, U16):
case _PACK_SELECT_KEY(U8, U8):
key = TOPK_HASH_KEY( U32, U32, num_stages );
break;
case _PACK_SELECT_KEY(I32, I32):
case _PACK_SELECT_KEY(I16, I16):
case _PACK_SELECT_KEY(I8, I8):
key = TOPK_HASH_KEY( I32, I32, num_stages );
break;
default:
break;
}
#undef _PACK_SELECT_KEY
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
{
if ( kernel_map[i].key == key )
{
break;
}
}
if ( i < (uint32_t)kernel_map_size )
{
snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name );
kernel->info.parameters = param_def;
kernel->info.numParams = _cnt_of_array( _topk_kernel_param_def );
kernel->info.initialize = initializer;
// Register code source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1,
kernel_map[i].source_name );
// Register binary source
vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1,
kernel_map[i].source_name );
status = VSI_SUCCESS;
}
return status;
} /* _query_kernel() */
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_node_param_t node_params[_TOPK_PARAM_NUM] = {NULL};
vsi_nn_kernel_node_t node = NULL;
vsi_size_t block_size = inputs[0]->attr.size[0];
vsi_size_t block_num = 1;
uint32_t i = 0;
vsi_nn_tensor_t* rs_tensors[3] = { NULL };
vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }};
int32_t width = (int32_t)block_size;
int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k");
int32_t num_stages = (int32_t)ceil(log10(block_size / 2.0f) / log10(2.0f));
for (i = 1; i < inputs[0]->attr.dim_num; i ++)
{
block_num = block_num * inputs[0]->attr.size[i];
}
if( vsi_nn_is_same_type(inputs[0], outputs[0]) == FALSE ||
outputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_INT32 )
{
return NULL;
}
shape[0][0] = block_size;
shape[0][1] = block_num;
shape[1][0] = top_k;
shape[1][1] = block_num;
rs_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shape[0], 2 );
rs_tensors[1] = vsi_nn_reshape_tensor( graph,
outputs[0], shape[1], 2 );
rs_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[1], shape[1], 2 );
status = _query_kernel( kernel, inputs, outputs, num_stages );
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( node_params, _TOPK_PARAM_NUM,
rs_tensors, input_num, &rs_tensors[1], output_num );
/* Pass parameters to node. */
node_params[SCALAR_INPUT_NUM_STAGES] = vsi_nn_kernel_scalar_create(
graph, I32, &num_stages );
node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create(
graph, I32, &width );
status = vsi_nn_kernel_node_pass_param( node, node_params, _TOPK_PARAM_NUM );
CHECK_STATUS_FAIL_GOTO( status, final );
}
}
final:
vsi_safe_release_tensor(rs_tensors[0]);
vsi_safe_release_tensor(rs_tensors[1]);
vsi_safe_release_tensor(rs_tensors[2]);
if (node_params[SCALAR_INPUT_NUM_STAGES])
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_NUM_STAGES] );
}
if (node_params[SCALAR_INPUT_WIDTH])
{
vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_WIDTH] );
}
return node;
} /* _setup() */
__END_DECLS
REGISTER_BACKEND_CL( topk, _setup )
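/* A hedged worked example of the stage selection in _setup above:
 *   block_size = 100 elements along the sorted axis
 *   num_stages = ceil(log2(100 / 2.0)) = ceil(5.64) = 6
 *   local_size[0] = global_size[0] = 1 << 6 = 64 work-items cooperate on each row
 *   hash key resolves to "cl.topk_stage6_F32toF32_I32" for F32/F16 data (name built by PACK_KERNEL_MAP)
 * Only stages 0..6 are registered, so rows longer than 128 elements would not match a kernel
 * here and _query_kernel would fail. */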

View File

@ -40,6 +40,7 @@ __BEGIN_DECLS
typedef enum
{
UNARY_SIN,
UNARY_COS,
UNARY_EXP,
UNARY_LOG,
UNARY_ELU,
@ -69,6 +70,11 @@ static float sin_eval(float data)
return sinf(data);
}
static float cos_eval(float data)
{
return cosf(data);
}
static float log_eval(float data)
{
return logf(data);
@ -212,6 +218,9 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec)
case UNARY_SIN:
data = sin_eval(data);
break;
case UNARY_COS:
data = cos_eval(data);
break;
case UNARY_EXP:
data = exp_eval(data);
break;
@ -372,6 +381,7 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_ELTWISE_UNARY_BACKEND_CPU( sin, UNARY_SIN )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( cos, UNARY_COS )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( exp, UNARY_EXP )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( log, UNARY_LOG )
REGISTER_ELTWISE_UNARY_BACKEND_CPU( elu, UNARY_ELU )

View File

@ -42,7 +42,7 @@ __BEGIN_DECLS
/*
* Define kernel meta.
*/
#define _CPU_ARG_NUM (3)
#define _CPU_ARG_NUM (4)
#define _CPU_INPUT_NUM (2)
#define _CPU_OUTPUT_NUM (1)
#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM)
@ -62,9 +62,9 @@ DEF_KERNEL_EXECUTOR(_gather_exec)
uint32_t* buffer_idx = NULL;
size_t in_elements = 0, out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
vsi_size_t i = 0, j = 0;
int32_t block_size = 1, block_num = 1, axis_num = 0;
vsi_size_t indices_num = 1;
vsi_size_t i = 0, j = 0, b = 0;
int32_t block_size = 1, block_num = 1, axis_num = 0, batch_dims = 0;
vsi_size_t indices_num = 1, batch = 1, in_stride = 1, out_stride = 1;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
@ -86,6 +86,8 @@ DEF_KERNEL_EXECUTOR(_gather_exec)
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis_num);
CHECK_STATUS_FAIL_GOTO(status, final );
status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &batch_dims);
CHECK_STATUS_FAIL_GOTO(status, final );
buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
@ -98,26 +100,44 @@ DEF_KERNEL_EXECUTOR(_gather_exec)
memset( buffer[1], 0, out_elements * sizeof(float) );
{
for(i = 0; i < attr[1]->shape->size; ++i)
for (i = 0; i < attr[1]->shape->size - (vsi_size_t)batch_dims; i++)
{
indices_num *= attr[1]->shape->data[i];
}
for(i = 0; i < (vsi_size_t)block_num; i++)
for (; i < attr[1]->shape->size; i++)
{
for(j = 0; j < indices_num; j++)
batch *= attr[1]->shape->data[i];
}
for (i = 0; i < attr[0]->shape->size - (vsi_size_t)batch_dims; i++)
{
in_stride *= attr[0]->shape->data[i];
}
for (i = 0; i < attr[2]->shape->size - (vsi_size_t)batch_dims; i++)
{
out_stride *= attr[2]->shape->data[i];
}
for (b = 0; b < batch; b++)
{
for (i = 0; i < (vsi_size_t)block_num; i++)
{
uint32_t indice = buffer_idx[j];
vsi_size_t in_index = (i * axis_num + indice) * block_size;
if(in_index < in_elements)
for (j = 0; j < indices_num; j++)
{
vsi_size_t out_index = (i * indices_num + j) * block_size;
memcpy(&(buffer[1][out_index]), &(buffer[0][in_index]), block_size * sizeof(float));
}
else
{
status = VX_FAILURE;
CHECK_STATUS_FAIL_GOTO( status, final );
uint32_t indice = buffer_idx[j + indices_num * b];
vsi_size_t in_index = (i * axis_num + indice) * block_size + b * in_stride;
if (in_index < in_elements)
{
vsi_size_t out_index = (i * indices_num + j) * block_size + b * out_stride;
memcpy(&(buffer[1][out_index]), &(buffer[0][in_index]), block_size * sizeof(float));
}
else
{
status = VX_FAILURE;
CHECK_STATUS_FAIL_GOTO( status, final );
}
}
}
}
@ -128,20 +148,20 @@ DEF_KERNEL_EXECUTOR(_gather_exec)
CHECK_STATUS_FAIL_GOTO( status, final );
final:
if( buffer_idx )
if ( buffer_idx )
{
free( buffer_idx );
}
for( i = 0; i < 2; i ++ )
for ( i = 0; i < 2; i ++ )
{
if( buffer[i] )
if ( buffer[i] )
{
free( buffer[i] );
}
}
for( i = 0; i < _CPU_IO_NUM; i ++ )
for ( i = 0; i < _CPU_IO_NUM; i ++ )
{
if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); }
}
return status;
} /* _gather_exec() */
@ -156,6 +176,7 @@ static vx_param_description_t _gather_kernel_param_def[] =
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
// Add kernel parameters here
};
#define _GATHER_PARAM_NUM _cnt_of_array( _gather_kernel_param_def )
@ -201,15 +222,16 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_t node = NULL;
status = _query_kernel( inputs, outputs, kernel );
if( VSI_SUCCESS == status)
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if( node )
if ( node )
{
uint32_t index = 3;
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" );
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" );
int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" );
int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" );
/* Set inputs and outputs */
vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM,
@ -218,12 +240,14 @@ static vsi_nn_kernel_node_t _setup
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size );
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num );
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num );
backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &batch_dims );
/* Pass parameters to node. */
status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM );
CHECK_STATUS( status );
vsi_nn_kernel_scalar_release( &backend_params[3] );
vsi_nn_kernel_scalar_release( &backend_params[4] );
vsi_nn_kernel_scalar_release( &backend_params[5] );
vsi_nn_kernel_scalar_release( &backend_params[6] );
}
else
{

View File

@ -103,9 +103,10 @@ DEF_KERNEL_EXECUTOR(_gather_nd_exec)
if(coord_stride <= 4) // reshape 3D
{
vsi_ssize_t stride[4] = {block_size, 0, 0, 0};
int32_t start_dim = (int32_t)attr[0]->shape->size - coord_stride;
for(i = 1; i < coord_stride; ++i)
{
stride[i] = stride[i - 1] * attr[0]->shape->data[i];
stride[i] = stride[i - 1] * attr[0]->shape->data[start_dim + i - 1];
}
for(i = 0; i < indices_num; i++)
@ -118,8 +119,8 @@ DEF_KERNEL_EXECUTOR(_gather_nd_exec)
for(j = 0; j < coord_stride; j++)
{
coord[j] = buffer_idx[i * coord_stride + j];
in_index += coord[j] * stride[j];
}
in_index = coord[3] * stride[3] + coord[2] * stride[2] + coord[1] * stride[1] + coord[0] * stride[0];
memcpy(&(buffer[1][out_index]), &(buffer[0][in_index]), block_size * sizeof(float));
}
}

View File

@ -61,7 +61,13 @@ DEF_KERNEL_EXECUTOR(_instance_norm_exec)
float * buffer[_CPU_IO_NUM] = { NULL };
size_t out_elements = 0;
vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
uint32_t i = 0;
vsi_size_t batch = 1;
vsi_size_t depth = 1;
vsi_size_t norm_size = 1;
vsi_size_t b = 0;
vsi_size_t c = 0;
vsi_size_t i = 0;
size_t rank = 1;
float eps = .0f;
tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
@ -96,62 +102,55 @@ DEF_KERNEL_EXECUTOR(_instance_norm_exec)
CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final );
memset( buffer[3], 0, out_elements * sizeof(float) );
rank = attr[0]->shape->size;
batch = attr[0]->shape->data[rank - 1];
depth = attr[0]->shape->data[rank - 2];
for ( i = 0; i < (vsi_size_t)rank - 2; i++)
{
vsi_size_t b = 0, c = 0, h = 0, w = 0;
vsi_size_t height = attr[0]->shape->data[1];
vsi_size_t width = attr[0]->shape->data[0];
vsi_size_t ch = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1;
vsi_size_t bh = attr[0]->shape->size > 3 ? attr[0]->shape->data[3] : 1;
norm_size *= attr[0]->shape->data[i];
}
for (b = 0; b < bh; b++)
for (b = 0; b < batch; b++)
{
for (c = 0; c < depth; c++)
{
for (c = 0; c < ch; c++)
vsi_size_t page = c * norm_size + b * norm_size * depth;
float sum = .0f;
float sumsq = .0f;
float mean = .0f;
float vari = .0f;
float data = 0;
float scaleVal = buffer[2][c];
float biasVal = buffer[1][c];
for (i = 0; i < norm_size; i++)
{
vsi_size_t page = c * (height * width) + b * (height * width * ch);
float sum = .0f;
float sumsq = .0f;
float mean = .0f;
float vari = .0f;
float data = 0;
float scaleVal = buffer[2][c];
float biasVal = buffer[1][c];
vsi_size_t index = page + i;
sum += buffer[0][index];
}
for (h = 0; h < height; h++)
{
vsi_size_t len = page + h * width;
mean = sum / (float)norm_size;
for (w = 0; w < width; w++)
{
vsi_size_t index = len + w;
sum += buffer[0][index];
}
}
mean = sum / (width * height);
for (h = 0; h < height; h++)
{
vsi_size_t len = page + h * width;
for (w = 0; w < width; w++)
{
vsi_size_t index = len + w;
data = buffer[0][index] - mean;
sumsq += data * data;
}
}
vari = sumsq / (width * height);
vari = (float)(1.0 / sqrtf(vari + eps));
for (h = 0; h < height; h++)
{
vsi_size_t len = page + h * width;
for (w = 0; w < width; w++)
{
float normVal = 0;
vsi_size_t index = len + w;
data = buffer[0][index] - mean;
for (i = 0; i < norm_size; i++)
{
vsi_size_t index = page + i;
data = buffer[0][index] - mean;
sumsq += data * data;
}
normVal = data * vari * scaleVal + biasVal;
buffer[3][index] = normVal;
}
}
vari = sumsq / (float)norm_size;
vari = (float)(1.0 / sqrtf(vari + eps));
for (i = 0; i < norm_size; i++)
{
float normVal = 0;
vsi_size_t index = page + i;
data = buffer[0][index] - mean;
normVal = data * vari * scaleVal + biasVal;
buffer[3][index] = normVal;
}
}
}
@ -256,4 +255,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( instance_norm, _setup )
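/* For clarity, a hedged trace of the rewritten reference loop above on a whcn tensor of shape [8, 8, 3, 2]:
 *   rank = 4, batch = 2 (last dim), depth = 3 (second-to-last dim), norm_size = 8*8 = 64
 *   each (b, c) pair normalizes the 64 values starting at page = c*64 + b*64*3 with the
 *   mean/variance of that slice and per-channel scale buffer[2][c] / bias buffer[1][c].
 * Unlike the removed version, the loop no longer hard-codes a 4-D W/H/C/N layout; it only
 * assumes the last two dimensions are channel and batch. */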

View File

@ -104,7 +104,6 @@ DEF_KERNEL_EXECUTOR(_compute)
in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] );
f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE );
CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final );
}
for(i = 0; i < _OUTPUT_NUM; i ++)
{
@ -311,4 +310,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_CPU( resize_bilinear, _setup )

View File

@ -63,6 +63,11 @@ __BEGIN_DECLS
CVIVANTE_NAMESPACE("evis.argmax_axis"#AXIS"_F16to"#OUT_DTYPE"_2D"), \
HASH_ARGMAX_KERNEL_SOURCE_NAME(AXIS) },
#define HASH_ARGMAX_KERNELS_MIX_OPT( AXIS, IN_DTYPE, OUT_DTYPE) \
{ HASH_ARGMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 2), \
CVIVANTE_NAMESPACE("evis.argmax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_opt"), \
HASH_ARGMAX_KERNEL_SOURCE_NAME(AXIS) },
static const struct {
uint32_t key;
char* function_name;
@ -132,6 +137,8 @@ static const struct {
HASH_ARGMAX_KERNELS_2D(2, U8, I16)
HASH_ARGMAX_KERNELS_2D(2, I16, U8)
HASH_ARGMAX_KERNELS_2D(2, I16, I16)
HASH_ARGMAX_KERNELS_MIX_OPT(2, U8, I16)
HASH_ARGMAX_KERNELS_MIX_OPT(2, I8, I16)
};
static vx_param_description_t kernel_param_def[] =
@ -228,7 +235,18 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer)
if (attr[0]->dtype == I8 ||
attr[0]->dtype == U8)
{
if ( attr[1]->dtype == I8 ||
if (axis == 2 &&
input_shape->data[2] > 1 &&
((attr[1]->dtype == I8 || attr[1]->dtype == U8)
|| (attr[1]->dtype == I16 && input_shape->data[2] < 256)))
{
uint32_t pack = ((argLenSub1 & 0xFF) << 24) | ((argLenSub1 & 0xFF) << 16)
| ((argLenSub1 & 0xFF) << 8) | (argLenSub1 & 0xFF);
packedArgIdx[0] = packedArgIdx[1] = pack;
packedArgIdx[2] = packedArgIdx[3] = pack;
gpu_param.global_scale[0] = 16;
}
else if ( attr[1]->dtype == I8 ||
attr[1]->dtype == U8)
{
uint32_t pack = ((argLenSub1 & 0xFF) << 24) | ((argLenSub1 & 0xFF) << 16)
@ -302,7 +320,6 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer)
}
break;
case 1:
case 2:
{
gpu_dp_inst_t uniExtractData_2x8 = {{
0x11111111, // TCfg
@ -324,6 +341,52 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer)
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case 2:
{
gpu_dp_inst_t uniExtractData_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtract1stU8toI16_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtract2ndU8toI16_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x0b0a0908, 0x0f0e0d0c, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param( node,
"uniExtractData_2x8", &uniExtractData_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtract1stU8toI16_2x8", &uniExtract1stU8toI16_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtract2ndU8toI16_2x8", &uniExtract2ndU8toI16_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"argLenSub1", &argLenSub1 );
status |= vsi_nn_kernel_gpu_add_param( node,
"packedArgIdx", packedArgIdx );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
default:
break;
}
@ -354,6 +417,16 @@ static vsi_status _query_kernel
input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if ((input_dtype == I8 || input_dtype == U8)
&& output_dtype == I16
&& axis == 2
&& inputs[0]->attr.size[2] < 256
&& image_2d == 0)
{
image_2d = 2;
}
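/* image_2d == 2 is reused here as an extra hash-key state: small-depth U8/I8 -> I16
 * argmax along axis 2 is routed to the "_opt" kernels registered through
 * HASH_ARGMAX_KERNELS_MIX_OPT above, instead of introducing a separate flag.
 */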
key = HASH_ARGMAX_HASH_KEY( axis, input_dtype, output_dtype, image_2d );
for( i = 0; i < _cnt_of_array(_argmax_evis_kernel_map); i ++ )

View File

@ -85,12 +85,12 @@ typedef enum
#define COMPARISONS_KERNELS_HALF(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \
{ HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 0), \
HASH_COMPARISONS_SH_KERNEL_NAME(FUNC_NAME, F16, F16), \
HASH_COMPARISONS_SH_KERNEL_NAME(FUNC_NAME, BF16, BF16), \
SOURCE },
#define COMPARISONS_KERNELS_HALF_2D(FUNC_NAME, TYPE, SRC0_TYPE, SRC1_TYPE, SOURCE) \
{ HASH_COMPARISONS_KEY(TYPE, SRC0_TYPE, SRC1_TYPE, BOOL8, 1), \
HASH_COMPARISONS_SH_KERNEL_2D_NAME(FUNC_NAME, F16, F16), \
HASH_COMPARISONS_SH_KERNEL_2D_NAME(FUNC_NAME, BF16, BF16), \
SOURCE },
#define LESS_OP less
@ -396,6 +396,26 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer)
0x00003c00, 0x00000000, 0x00003c00, 0x00000000,
0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
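/* bfloat16 is the high half of an IEEE-754 binary32, so the two shuffles above
 * appear to widen eight BF16 lanes to float32 (four per instruction) by placing
 * each 16-bit value into the upper 16 bits of a 32-bit lane. A scalar equivalent,
 * for reference only:
 *   static float bf16_to_f32(uint16_t v)
 *   {
 *       uint32_t bits = ((uint32_t)v) << 16;
 *       float f;
 *       memcpy(&f, &bits, sizeof(f));
 *       return f;
 *   }
 */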
status = vsi_nn_kernel_gpu_add_param( node,
"uniExtract8Data_2x8", &uniExtractInteger_2x8 );
@ -403,6 +423,10 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer)
"uniDatatoFp32Part0_4x4", &uniDatatoFp32Part0_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniDatatoFp32Part1_4x4", &uniDatatoFp32Part1_4x4 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"input0Scale", &input0Scale );
status |= vsi_nn_kernel_gpu_add_param( node,
@ -453,7 +477,7 @@ static vsi_status _query_kernel
int i;
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
output_dtype = output_dtype == I8 ? BOOL8 : output_dtype;
key = HASH_COMPARISONS_KEY( operation, input0_dtype, input1_dtype, output_dtype, image_2d );

View File

@ -301,6 +301,7 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer)
case _PACK_SELECT_KEY( I8, I8):
case _PACK_SELECT_KEY( I16, I16):
case _PACK_SELECT_KEY( F16, F16):
case _PACK_SELECT_KEY( BF16, BF16):
{
gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift);
multAndoutZP0[0] = (uint32_t)(M0);
@ -367,6 +368,16 @@ static vsi_status _query_kernel
input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
if (input0_dtype == BF16)
{
input0_dtype = F16;
}
if (output_dtype == BF16)
{
output_dtype = F16;
}
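/* BF16 is remapped to F16 only for kernel selection: both are 16-bit element
 * types, so the existing F16 depth2space_crd shaders can presumably move the data
 * unchanged and no dedicated BF16 kernel source is required.
 */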
key = HASH_DEPTH2SPACE_CRD_KEY( input0_dtype, output_dtype, blk_flg );
for( i = 0; i < _cnt_of_array(depth2space_crd_map); i ++ )

View File

@ -42,6 +42,7 @@ __BEGIN_DECLS
typedef enum
{
UNARY_SIN,
UNARY_COS,
UNARY_EXP,
UNARY_LOG,
UNARY_ELU,
@ -79,6 +80,7 @@ typedef enum
SOURCE },
#define SIN_OPERATION sin
#define COS_OPERATION cos
#define EXP_OPERATION exp
#define LOG_OPERATION log
#define ELU_OPERATION elu
@ -106,6 +108,17 @@ static const struct {
TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, I8, I8 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, I8, F16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, BF16, BF16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, F16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, I16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, U8 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, F16, I8 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I16, I16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I16, F16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, U8, U8 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, U8, F16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I8, I8 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, I8, F16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(COS_OPERATION, UNARY_COS, BF16, BF16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, F16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, I16 , KERNEL_SOURCE_3D)
TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, F16, U8 , KERNEL_SOURCE_3D)
@ -162,6 +175,17 @@ static const struct {
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, I8 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, I8, F16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, BF16, BF16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, F16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, U8 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, F16, I8 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, I16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I16, F16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, U8 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, U8, F16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, I8 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, I8, F16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(COS_OPERATION, UNARY_COS, BF16, BF16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, F16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, I16 , KERNEL_SOURCE_2D)
TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, F16, U8 , KERNEL_SOURCE_2D)
@ -317,6 +341,7 @@ static const struct {
};
#undef SIN_OPERATION
#undef COS_OPERATION
#undef EXP_OPERATION
#undef LOG_OPERATION
#undef ELU_OPERATION
@ -443,6 +468,7 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer)
switch( pack_key )
{
case _PACK_SELECT_KEY( UNARY_SIN, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_COS, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_EXP, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_LOG, BF16, BF16 ):
case _PACK_SELECT_KEY( UNARY_ELU, BF16, BF16 ):
@ -736,6 +762,7 @@ OnError:
REGISTER_BACKEND_EVIS( KERNEL_NAME, _##KERNEL_NAME##_setup )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( sin, UNARY_SIN )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( cos, UNARY_COS )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( exp, UNARY_EXP )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( log, UNARY_LOG )
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( elu, UNARY_ELU )

View File

@ -64,6 +64,28 @@ __BEGIN_DECLS
#define VX_KERNEL_NAME_GATHER_AXIS0_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_U8toF16_axis0")
#define VX_KERNEL_NAME_GATHER_AXIS0_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_F16toU8_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_batch_U8toU8")
#define VX_KERNEL_NAME_GATHER_BATCH_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_batch_I8toI8")
#define VX_KERNEL_NAME_GATHER_BATCH_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_batch_I16toI16")
#define VX_KERNEL_NAME_GATHER_BATCH_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_F16toF16")
#define VX_KERNEL_NAME_GATHER_BATCH_I8TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_I8toF16")
#define VX_KERNEL_NAME_GATHER_BATCH_I16TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_I16toF16")
#define VX_KERNEL_NAME_GATHER_BATCH_F16TOI8 CVIVANTE_NAMESPACE("evis.gather_batch_F16toI8")
#define VX_KERNEL_NAME_GATHER_BATCH_F16TOI16 CVIVANTE_NAMESPACE("evis.gather_batch_F16toI16")
#define VX_KERNEL_NAME_GATHER_BATCH_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_U8toF16")
#define VX_KERNEL_NAME_GATHER_BATCH_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_batch_F16toU8")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_batch_U8toU8_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_batch_I8toI8_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_batch_I16toI16_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_F16toF16_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_I8TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_I8toF16_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_I16TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_I16toF16_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_F16TOI8 CVIVANTE_NAMESPACE("evis.gather_batch_F16toI8_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_F16TOI16 CVIVANTE_NAMESPACE("evis.gather_batch_F16toI16_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_batch_U8toF16_axis0")
#define VX_KERNEL_NAME_GATHER_BATCH_AXIS0_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_batch_F16toU8_axis0")
#define VX_KERNEL_NAME_GATHER_ARRAY_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_U8toU8_array")
#define VX_KERNEL_NAME_GATHER_ARRAY_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_I8toI8_array")
#define VX_KERNEL_NAME_GATHER_ARRAY_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_I16toI16_array")
@ -77,31 +99,43 @@ __BEGIN_DECLS
#define KERNEL_SOURCE_1 "gather"
#define KERNEL_SOURCE_2 "gather_mix"
#define KERNEL_SOURCE_3 "gather_array"
#define KERNEL_SOURCE_4 "gather_batch"
#define KERNEL_SOURCE_5 "gather_mix_batch"
// Add kernel hashtable here
#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _is_axis0, _is_max) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_is_axis0 << 4) | (_is_max))
#define HASH_GATHER_KEY(_in0_type, _in1_type, _out_type, _axis0, _max, _batch) \
((_in0_type << 24) | (_in1_type << 16) | (_out_type << 8) | (_axis0 << 6) | (_max << 4) | (_batch))
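/* Key layout after this change (illustration only):
 *   bits 31..24 in0 dtype | 23..16 in1 dtype | 15..8 out dtype |
 *   bit 6 axis0 flag | bit 4 array (_max) flag | bit 0 batch flag
 * e.g. HASH_GATHER_KEY(U8, I32, U8, 0, 0, 1) selects the gather_batch U8->U8 kernel.
 */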
#define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0), \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0, 0), \
VX_KERNEL_NAME_GATHER_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_GATHER_AXIS0_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 0), \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 0, 0), \
VX_KERNEL_NAME_GATHER_AXIS0_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_GATHER_ARRAY_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 1), \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 1, 0), \
VX_KERNEL_NAME_GATHER_ARRAY_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_GATHER_AXIS0_ARRAY_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 1), \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 1, 0), \
VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_GATHER_BATCH_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0, 1), \
VX_KERNEL_NAME_GATHER_BATCH_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
#define TENSOR_GATHER_BATCH_AXIS0_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \
{ HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 0, 1), \
VX_KERNEL_NAME_GATHER_BATCH_AXIS0_##IN0_TYPE##TO##OUT_TYPE, \
SOURCE },
static const struct {
uint32_t key;
char* function_name;
@ -136,6 +170,26 @@ static const struct {
TENSOR_GATHER_AXIS0_ARRAY_KERNELS(I8, I32, I8, KERNEL_SOURCE_3)
TENSOR_GATHER_AXIS0_ARRAY_KERNELS(I16, I32, I16, KERNEL_SOURCE_3)
TENSOR_GATHER_AXIS0_ARRAY_KERNELS(F16, I32, F16, KERNEL_SOURCE_3)
TENSOR_GATHER_BATCH_KERNELS(U8, I32, U8, KERNEL_SOURCE_4)
TENSOR_GATHER_BATCH_KERNELS(I8, I32, I8, KERNEL_SOURCE_4)
TENSOR_GATHER_BATCH_KERNELS(I16, I32, I16, KERNEL_SOURCE_4)
TENSOR_GATHER_BATCH_KERNELS(F16, I32, F16, KERNEL_SOURCE_4)
TENSOR_GATHER_BATCH_KERNELS(I8, I32, F16, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_KERNELS(I16, I32, F16, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_KERNELS(F16, I32, I8, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_KERNELS(F16, I32, I16, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_KERNELS(U8, I32, F16, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_KERNELS(F16, I32, U8, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(U8, I32, U8, KERNEL_SOURCE_4)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(I8, I32, I8, KERNEL_SOURCE_4)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(I16, I32, I16, KERNEL_SOURCE_4)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(F16, I32, F16, KERNEL_SOURCE_4)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(I8, I32, F16, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(I16, I32, F16, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(F16, I32, I8, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(F16, I32, I16, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(U8, I32, F16, KERNEL_SOURCE_5)
TENSOR_GATHER_BATCH_AXIS0_KERNELS(F16, I32, U8, KERNEL_SOURCE_5)
};
/*
@ -158,6 +212,7 @@ static vsi_status get_gather_tensor_reshape_size
vsi_nn_tensor_t ** inputs,
vsi_size_t sizes[VSI_NN_MAX_DIM_NUM],
vsi_size_t block_size,
vsi_size_t batch_dims,
uint32_t idxFlg,
int32_t* arrayFlg
)
@ -167,13 +222,19 @@ static vsi_status get_gather_tensor_reshape_size
vsi_size_t *input_size = inputs[0]->attr.size;
uint32_t i = 0;
vsi_size_t elementCnt = 1;
vsi_size_t outerCnt = 1;
#define VSI_NN_MAX_IMAGE_WIDTH (65536)
for(i = 0; i < dims_num; ++i)
for(i = 0; i < dims_num - batch_dims; ++i)
{
elementCnt *= input_size[i];
}
for(; i < dims_num; ++i)
{
outerCnt *= input_size[i];
}
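/* With batch_dims > 0 the first (dims_num - batch_dims) dimensions are folded into
 * elementCnt and the trailing batch dimensions into outerCnt; e.g. an input of
 * shape {8, 100, 4} with batch_dims == 1 gives elementCnt = 800 and outerCnt = 4.
 * outerCnt then feeds sizes[1] (index path) or sizes[2] (data path) of the
 * reshaped view below.
 */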
for(i = 0; i < VSI_NN_MAX_DIM_NUM; ++i)
{
sizes[i] = 1;
@ -182,13 +243,14 @@ static vsi_status get_gather_tensor_reshape_size
if (idxFlg && elementCnt < VSI_NN_MAX_IMAGE_WIDTH)
{
sizes[0] = elementCnt;
sizes[1] = 1;
sizes[1] = outerCnt;
status = VSI_SUCCESS;
}
else
{
sizes[0] = block_size;
sizes[1] = elementCnt / block_size;
sizes[2] = outerCnt;
if ((elementCnt / block_size) > VSI_NN_MAX_IMAGE_WIDTH)
{
arrayFlg[0] = 1;
@ -222,6 +284,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
int32_t block_num = 0;
int32_t indices_num = 1;
uint32_t input_dims1 = 0;
int32_t batch = 1;
vx_uint32 i = 0;
vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL, NULL};
vsi_size_array_t * input1_shape = NULL;
@ -283,7 +346,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
input1_shape = attr[1]->shape;
input_dims1 = (uint32_t)input1_shape->size;
for (i = 0; i < input_dims1; i++)
for (i = 0; i < input_dims1 - 1; i++)
{
indices_num *= (int32_t)(input1_shape->data[i]);
}
@ -376,6 +439,11 @@ DEF_KERNEL_INITIALIZER(_gather_initializer)
#undef _PACK_SELECT_KEY
status = vsi_nn_kernel_gpu_add_param(node, "indices_num", &indices_num);
if (attr[2]->shape->size > 2)
{
batch = (int32_t)attr[2]->shape->data[2];
status = vsi_nn_kernel_gpu_add_param(node, "batch", &batch);
}
CHECK_STATUS_FAIL_GOTO(status, OnError );
OnError:
@ -415,6 +483,7 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
int32_t block_num = 0;
int32_t indices_num = 1;
int32_t batch = 1;
uint32_t input_dims1 = 0;
vx_uint32 i = 0;
vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL, NULL};
@ -475,10 +544,11 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
input1_shape = attr[1]->shape;
input_dims1 = (uint32_t)input1_shape->size;
for (i = 0; i < input_dims1; i++)
for (i = 0; i < input_dims1 - 1; i++)
{
indices_num *= (int32_t)(input1_shape->data[i]);
}
batch = (int32_t)(input1_shape->data[input_dims1 - 1]);
shaderParam.global_scale[0] = 4;
shaderParam.global_scale[1] = 1;
@ -486,7 +556,7 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
shaderParam.global_size[0] = gpu_align_p2((indices_num + shaderParam.global_scale[0] - 1)
/ shaderParam.global_scale[0], 4);
shaderParam.global_size[1] = block_num;
shaderParam.global_size[2] = 1;
shaderParam.global_size[2] = batch;
status = vsi_nn_kernel_gpu_config( node, &shaderParam );
CHECK_STATUS_FAIL_GOTO(status, OnError);
@ -585,6 +655,10 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer)
#undef _PACK_SELECT_KEY
status = vsi_nn_kernel_gpu_add_param(node, "indices_num", &indices_num);
if (attr[2]->shape->size > 2)
{
status |= vsi_nn_kernel_gpu_add_param(node, "batch", &batch);
}
CHECK_STATUS_FAIL_GOTO(status, OnError );
OnError:
@ -617,7 +691,8 @@ static vsi_status _query_kernel
vsi_nn_kernel_t* kernel,
const vsi_nn_kernel_param_t * params,
int32_t axis,
int32_t is_array
int32_t is_array,
int32_t is_batch
)
{
vsi_status status = VSI_FAILURE;
@ -638,7 +713,7 @@ static vsi_status _query_kernel
output_dtype = F16;
}
key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis, is_array);
key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis, is_array, is_batch);
for( i = 0; i < _cnt_of_array(gather_map); i ++ )
{
@ -688,25 +763,30 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_param_t tmp_params[_GATHER_PARAM_NUM] = { NULL };
vsi_nn_kernel_node_t node = NULL;
vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}};
vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" );
int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" );
int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" );
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" );
int32_t axis0_flg = 0;
int32_t is_array = block_size > VSI_NN_MAX_BLOCK_SIZE ? 1 : 0;
int32_t is_batch = batch_dims > 0 ? 1 : 0;
vsi_size_t rs_dim = batch_dims == 0 ? 2 : 3;
int32_t i = 0;
if (axis == 0)
{
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, 0, &is_array);
status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1, &is_array);
status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], 0, &is_array);
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, batch_dims, 0, &is_array);
status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1, &is_array);
status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], batch_dims, 0, &is_array);
axis0_flg = 1;
}
else
{
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0, &is_array);
status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1, &is_array);
status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &is_array);
status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, batch_dims, 0, &is_array);
status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, batch_dims, 1, &is_array);
status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, batch_dims, 0, &is_array);
axis0_flg = 0;
}
#undef VSI_NN_MAX_BLOCK_SIZE
@ -715,38 +795,45 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
reshape_tensors[0] = vsi_nn_reshape_tensor( graph,
inputs[0], shapes[0], rs_dim );
reshape_tensors[1] = vsi_nn_reshape_tensor( graph,
inputs[1], shapes[1], 2 );
reshape_tensors[2] = vsi_nn_reshape_tensor( graph,
outputs[0], shapes[2], rs_dim );
if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
outputs[0]->attr.dim_num ) )
{
return NULL;
}
status = _query_kernel( inputs, outputs, kernel, params, axis0_flg, is_array);
status = _query_kernel( inputs, outputs, kernel, params, axis0_flg, is_array, is_batch);
if ( VSI_SUCCESS == status)
{
node = vsi_nn_kernel_create_node( graph, kernel );
if ( node )
{
uint32_t index = 0;
#define RESHAPE_DIM 2
uint32_t index = 3;
/* Pass parameters to node. */
tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], RESHAPE_DIM );
tmp_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[1], RESHAPE_DIM );
tmp_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], RESHAPE_DIM );
#undef RESHAPE_DIM
vsi_nn_kernel_node_pack_io( tmp_params, _GATHER_PARAM_NUM,
reshape_tensors, 2, &reshape_tensors[2], 1 );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num );
tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _GATHER_PARAM_NUM );
CHECK_STATUS(status);
vsi_nn_kernel_tensor_release( &tmp_params[0] );
vsi_nn_kernel_tensor_release( &tmp_params[1] );
vsi_nn_kernel_tensor_release( &tmp_params[2] );
vsi_nn_kernel_scalar_release( &tmp_params[3] );
vsi_nn_kernel_scalar_release( &tmp_params[4] );
vsi_nn_kernel_scalar_release( &tmp_params[5] );
}
}
for (i = 0; i < 3; i++)
{
vsi_safe_release_tensor(reshape_tensors[i]);
}
return node;
} /* _setup() */

View File

@ -22,7 +22,6 @@
*
*****************************************************************************/
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -994,44 +993,6 @@ static vsi_status _query_kernel
return status;
} /* _query_kernel() */
static int32_t _optimize_gn_shape
(
vsi_nn_tensor_t ** inputs,
vsi_size_t group_size,
int32_t group_num,
vsi_size_t* opt_shape,
int32_t* is2D_flg
)
{
vsi_status status = VSI_SUCCESS;
vsi_size_t group_shape[VSI_NN_MAX_DIM_NUM] = {0};
vsi_size_t new_rank = 0;
group_shape[0] = inputs[0]->attr.size[0];
group_shape[1] = inputs[0]->attr.size[1];
group_shape[2] = group_size;
vsi_nn_kernel_optimize_element_shape( group_shape, 3, opt_shape, &new_rank );
if (opt_shape[1] == 1)
{
opt_shape[1] = group_num;
opt_shape[2] = 1;
opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
is2D_flg[0] = 1;
}
else if (new_rank == 2)
{
opt_shape[2] = group_num;
opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1;
}
else
{
status = VSI_FAILURE;
}
return status;
}
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
@ -1077,11 +1038,13 @@ static vsi_nn_kernel_node_t _setup
return NULL;
}
status = _optimize_gn_shape(inputs, group_size, group_num, new_shape, &is2D_flg);
status = vsi_nn_kernel_optimize_group_norm_shape( (const vsi_size_t*)inputs[0]->attr.size,
inputs[0]->attr.dim_num, group_num, 0, new_shape);
if ( VSI_SUCCESS != status )
{
goto final;
}
is2D_flg = (new_shape[2] == 1) && ((int32_t)new_shape[1] == group_num);
rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4);
rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4);

View File

@ -1004,12 +1004,15 @@ static vsi_nn_kernel_node_t _setup
uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 };
uint32_t hashkey = 0;
int32_t i = 0;
uint32_t rank = outputs[0]->attr.dim_num;
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
int32_t reshape_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" );
int32_t reshape_flg = outputs[0]->attr.size[1] * outputs[0]->attr.size[2] < GPU_TENSOR_MAX_WIDTH
&& rank > 2;
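/* reshape_flg is now derived from the output shape itself: when size[1] * size[2]
 * still fits within GPU_TENSOR_MAX_WIDTH and the tensor has more than two
 * dimensions, the setup can presumably fold those axes into one; tensors with
 * rank > 4 are rejected just below since this path only handles up to 4D.
 */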
// Check if gpu can support the size
if ( !vsi_nn_kernel_gpu_check_shape(
outputs[0]->attr.size, outputs[0]->attr.dim_num ) )
outputs[0]->attr.size, outputs[0]->attr.dim_num ) ||
rank > 4 )
{
return NULL;
}

View File

@ -76,9 +76,15 @@ static const _kernel_map_type _logical_ops_kernel_map[] =
PACK_KERNEL_MAP(VSI_NN_LOGICAL_OR, I8, I8, "or"),
PACK_KERNEL_MAP(VSI_NN_LOGICAL_AND, I8, I8, "and"),
PACK_KERNEL_MAP(VSI_NN_LOGICAL_XOR, I8, I8, "xor"),
PACK_KERNEL_MAP(VSI_NN_LOGICAL_OR, BF16, I8, "or"),
PACK_KERNEL_MAP(VSI_NN_LOGICAL_AND, BF16, I8, "and"),
PACK_KERNEL_MAP(VSI_NN_LOGICAL_XOR, BF16, I8, "xor"),
PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_OR, I8, I8, "or"),
PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_AND, I8, I8, "and"),
PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_XOR, I8, I8, "xor"),
PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_OR, BF16, I8, "or"),
PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_AND, BF16, I8, "and"),
PACK_KERNEL_MAP_2D(VSI_NN_LOGICAL_XOR, BF16, I8, "xor"),
};
@ -159,6 +165,22 @@ DEF_KERNEL_INITIALIZER(_logical_ops_initializer)
status = vsi_nn_kernel_gpu_add_param( node, "uniMulShortMinus1toFp16_2x8", &uniMulShortMinus1toFp16_2x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (BF16 == input_dtype)
{
gpu_dp_inst_t uniConvertInt16toInt8_2x8 = {{
0x11111111, // TCfg
0x00000000, // ASelt
0x03020100, 0x07060504, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000700, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniConvertInt16toInt8_2x8", &uniConvertInt16toInt8_2x8);
CHECK_STATUS_FAIL_GOTO(status, final );
}
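/* Assumption, not verified against the shader source: for BF16 inputs the logical
 * result is already produced in 16-bit lanes, and this shuffle only narrows the
 * eight 16-bit values to the int8 (bool8) output layout, so no float conversion
 * uniforms are needed in this branch.
 */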
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
CHECK_STATUS_FAIL_GOTO(status, final );
@ -209,9 +231,13 @@ static vsi_status _query_kernel
return VSI_FAILURE;
}
if (BOOL8 == in_dtype && BOOL8 == out_dtype)
if (BOOL8 == in_dtype)
{
in_dtype = I8;
}
if (BOOL8 == out_dtype)
{
out_dtype = I8;
}

View File

@ -56,6 +56,7 @@ __BEGIN_DECLS
#define KERNEL_SOURCE_12 "matrixmul_u8u8_f16"
#define KERNEL_SOURCE_13 "matrixmul_i16"
#define KERNEL_SOURCE_14 "matrixmul_f16i16_i16"
#define KERNEL_SOURCE_15 "matrixmul_bf16"
#define HASH_MATRIX_MUL_KEY(_input0_type, _input1_type, _output_type, _trans_a, _trans_b) \
((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_trans_a << 4) | (_trans_b))
@ -110,6 +111,7 @@ static const struct {
TENSOR_MATRIX_MUL_KERNELS(I8, F16, F16, KERNEL_SOURCE_8)
TENSOR_MATRIX_MUL_KERNELS(I16, F16, F16, KERNEL_SOURCE_8)
TENSOR_MATRIX_MUL_KERNELS(F16, F16, F16, KERNEL_SOURCE_2)
TENSOR_MATRIX_MUL_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15)
TENSOR_MATRIX_MUL_KERNELS(F16, F16, U8, KERNEL_SOURCE_11)
TENSOR_MATRIX_MUL_KERNELS(F16, F16, I8, KERNEL_SOURCE_11)
TENSOR_MATRIX_MUL_KERNELS(F16, F16, I16, KERNEL_SOURCE_11)
@ -119,6 +121,7 @@ static const struct {
TENSOR_MATRIX_MUL_TRANSB_KERNELS(F16, U8, U8, KERNEL_SOURCE_4)
TENSOR_MATRIX_MUL_TRANSB_KERNELS(U8, U8, F16, KERNEL_SOURCE_5)
TENSOR_MATRIX_MUL_TRANSB_KERNELS(U8, U8, U8, KERNEL_SOURCE_5)
TENSOR_MATRIX_MUL_TRANSB_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15)
TENSOR_MATRIX_MUL_TRANSA_KERNELS(U8, U8, U8, KERNEL_SOURCE_7)
TENSOR_MATRIX_MUL_TRANSA_KERNELS(I8, I8, I8, KERNEL_SOURCE_7)
TENSOR_MATRIX_MUL_TRANSA_KERNELS(I16, I16, I16, KERNEL_SOURCE_7)
@ -126,6 +129,7 @@ static const struct {
TENSOR_MATRIX_MUL_TRANSA_KERNELS(I8, F16, I8, KERNEL_SOURCE_7)
TENSOR_MATRIX_MUL_TRANSA_KERNELS(I16, F16, I16, KERNEL_SOURCE_7)
TENSOR_MATRIX_MUL_TRANSA_KERNELS(F16, F16, F16, KERNEL_SOURCE_7)
TENSOR_MATRIX_MUL_TRANSA_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15)
};
/*
@ -587,6 +591,36 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
0x00000600, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
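/* For the BF16 cases handled below, inputs are widened to float32 with the two
 * Part0/Part1 shuffles, the accumulation runs in float, and uniExtractOddData_2x8
 * appears to take the high 16 bits of each 32-bit result to repack it as bfloat16,
 * i.e. a truncating float32 -> bf16 conversion without rounding.
 */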
float scaleIn0divOut = src0Scale / dstScale;
float scaleIn1divOut = src1Scale / dstScale;
@ -936,6 +970,22 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, BF16, 0, 0, 0 ):
case _PACK_SELECT_KEY( BF16, BF16, BF16, 0, 0, 1 ):
case _PACK_SELECT_KEY( BF16, BF16, BF16, 0, 1, 0 ):
case _PACK_SELECT_KEY( BF16, BF16, BF16, 0, 1, 1 ):
case _PACK_SELECT_KEY( BF16, BF16, BF16, 1, 0, 0 ):
case _PACK_SELECT_KEY( BF16, BF16, BF16, 1, 0, 1 ):
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( F16, F16, F16, 0, 0, 1 ):
{
status = vsi_nn_kernel_gpu_add_param( node,

View File

@ -64,6 +64,10 @@ __BEGIN_DECLS
#define KERNEL_NAME_MAXIMUM_F16F16TOI8_2D CVIVANTE_NAMESPACE("evis.maximum_F16F16toI8_2D")
#define KERNEL_NAME_MAXIMUM_F16F16TOI16 CVIVANTE_NAMESPACE("evis.maximum_F16F16toI16")
#define KERNEL_NAME_MAXIMUM_F16F16TOI16_2D CVIVANTE_NAMESPACE("evis.maximum_F16F16toI16_2D")
#define KERNEL_NAME_MAXIMUM_I16I16TOU8 CVIVANTE_NAMESPACE("evis.maximum_I16I16toU8")
#define KERNEL_NAME_MAXIMUM_I16I16TOU8_2D CVIVANTE_NAMESPACE("evis.maximum_I16I16toU8_2D")
#define KERNEL_NAME_MAXIMUM_U8U8TOI16 CVIVANTE_NAMESPACE("evis.maximum_U8U8toI16")
#define KERNEL_NAME_MAXIMUM_U8U8TOI16_2D CVIVANTE_NAMESPACE("evis.maximum_U8U8toI16_2D")
#define KERNEL_SOURCE_1 "maximum",
#define KERNEL_SOURCE_2 "maximum_fp16",
@ -109,6 +113,7 @@ static const struct {
TENSOR_MAX_KERNELS(F16, F16, I8, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS(I8, I8, I8, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS(U8, U8, U8, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS(U8, U8, I16, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS(I16, I16, I16, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS(F16, F16, U8, KERNEL_SOURCE_2)
@ -120,12 +125,14 @@ static const struct {
TENSOR_MAX_KERNELS(I16, F16, I16, KERNEL_SOURCE_3)
TENSOR_MAX_KERNELS(I16, F16, F16, KERNEL_SOURCE_3)
TENSOR_MAX_KERNELS(F16, F16, I16, KERNEL_SOURCE_3)
TENSOR_MAX_KERNELS(I16, I16, U8, KERNEL_SOURCE_3)
TENSOR_MAX_KERNELS_2D_HALF(F16, F16, F16, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS_2D_HALF(BF16, BF16, BF16, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS_2D(U8, U8, I16, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_1)
TENSOR_MAX_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_2)
@ -137,6 +144,7 @@ static const struct {
TENSOR_MAX_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_3)
TENSOR_MAX_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_3)
TENSOR_MAX_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_3)
TENSOR_MAX_KERNELS_2D(I16, I16, U8, KERNEL_SOURCE_3)
};
static vx_param_description_t kernel_param_def[] =
@ -192,6 +200,14 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
in0_fl = (uint8_t)attr[0]->dfp.fl;
if (in0_fl > 0)
{
src0Scale = 1.0f / (float) ((int64_t)1 << in0_fl);
}
else
{
src0Scale = (float)((int64_t)1 << -in0_fl);
}
}
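/* DFP scale is 2^(-fl): a positive fractional length shrinks the step size
 * (fl = 3 -> scale = 1/8) while a negative one enlarges it (fl = -2 -> scale = 4).
 * The same expansion is applied to attr[1] below.
 */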
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM)
@ -203,6 +219,14 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
in1_fl = (uint8_t)attr[1]->dfp.fl;
if (in1_fl > 0)
{
src1Scale = 1.0f / (float) ((int64_t)1 << in1_fl);
}
else
{
src1Scale = (float)((int64_t)1 << -in1_fl);
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM)
@ -242,7 +266,8 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
attr[1]->dtype, attr[2]->dtype );
if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == BF16)
|| ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) )
|| ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16))
|| (attr[0]->dtype == I16 && attr[1]->dtype == I16 && attr[2]->dtype == U8) )
{
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
@ -384,6 +409,8 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
case _PACK_SELECT_KEY( U8, U8, U8 ):
case _PACK_SELECT_KEY( U8, F16, U8 ):
case _PACK_SELECT_KEY( F16, F16, U8 ):
case _PACK_SELECT_KEY( U8, U8, I16 ):
case _PACK_SELECT_KEY( I16, I16, U8 ):
{
uint16_t M0 = 0;
uint16_t M1 = 0;
@ -427,12 +454,15 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 );
CHECK_STATUS_FAIL_GOTO(status, final );
if (attr[0]->dtype == U8)
if (attr[0]->dtype == U8 || attr[0]->dtype == I16)
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
if (attr[0]->dtype != I16)
{
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
}
status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
@ -461,8 +491,11 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer)
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 );
status = vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
if (attr[0]->dtype != I16)
{
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
}
CHECK_STATUS_FAIL_GOTO(status, final );
}
}
@ -751,7 +784,6 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM,
tmp_inputs, 2, outputs, 1 );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM );
}
}
return node;
@ -760,4 +792,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( maximum, _setup )

View File

@ -64,6 +64,10 @@ __BEGIN_DECLS
#define KERNEL_NAME_MINIMUM_F16F16TOI8_2D CVIVANTE_NAMESPACE("evis.minimum_F16F16toI8_2D")
#define KERNEL_NAME_MINIMUM_F16F16TOI16 CVIVANTE_NAMESPACE("evis.minimum_F16F16toI16")
#define KERNEL_NAME_MINIMUM_F16F16TOI16_2D CVIVANTE_NAMESPACE("evis.minimum_F16F16toI16_2D")
#define KERNEL_NAME_MINIMUM_I16I16TOU8 CVIVANTE_NAMESPACE("evis.minimum_I16I16toU8")
#define KERNEL_NAME_MINIMUM_I16I16TOU8_2D CVIVANTE_NAMESPACE("evis.minimum_I16I16toU8_2D")
#define KERNEL_NAME_MINIMUM_U8U8TOI16 CVIVANTE_NAMESPACE("evis.minimum_U8U8toI16")
#define KERNEL_NAME_MINIMUM_U8U8TOI16_2D CVIVANTE_NAMESPACE("evis.minimum_U8U8toI16_2D")
#define KERNEL_SOURCE_1 "minimum",
#define KERNEL_SOURCE_2 "minimum_fp16",
@ -109,6 +113,7 @@ static const struct {
TENSOR_MIN_KERNELS(F16, F16, I8, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS(I8, I8, I8, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS(U8, U8, U8, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS(U8, U8, I16, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS(I16, I16, I16, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS(F16, F16, U8, KERNEL_SOURCE_2)
@ -120,12 +125,14 @@ static const struct {
TENSOR_MIN_KERNELS(I16, F16, I16, KERNEL_SOURCE_3)
TENSOR_MIN_KERNELS(I16, F16, F16, KERNEL_SOURCE_3)
TENSOR_MIN_KERNELS(F16, F16, I16, KERNEL_SOURCE_3)
TENSOR_MIN_KERNELS(I16, I16, U8, KERNEL_SOURCE_3)
TENSOR_MIN_KERNELS_2D_HALF(F16, F16, F16, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS_2D_HALF(BF16, BF16, BF16, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS_2D(F16, F16, I8, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS_2D(I8, I8, I8, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS_2D(U8, U8, U8, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS_2D(U8, U8, I16, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS_2D(I16, I16, I16, KERNEL_SOURCE_1)
TENSOR_MIN_KERNELS_2D(F16, F16, U8, KERNEL_SOURCE_2)
@ -137,6 +144,7 @@ static const struct {
TENSOR_MIN_KERNELS_2D(I16, F16, I16, KERNEL_SOURCE_3)
TENSOR_MIN_KERNELS_2D(I16, F16, F16, KERNEL_SOURCE_3)
TENSOR_MIN_KERNELS_2D(F16, F16, I16, KERNEL_SOURCE_3)
TENSOR_MIN_KERNELS_2D(I16, I16, U8, KERNEL_SOURCE_3)
};
static vx_param_description_t kernel_param_def[] =
@ -192,6 +200,14 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
in0_fl = (uint8_t)attr[0]->dfp.fl;
if (in0_fl > 0)
{
src0Scale = 1.0f / (float) ((int64_t)1 << in0_fl);
}
else
{
src0Scale = (float)((int64_t)1 << -in0_fl);
}
}
else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM)
@ -203,6 +219,14 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP )
{
in1_fl = (uint8_t)attr[1]->dfp.fl;
if (in1_fl > 0)
{
src1Scale = 1.0f / (float) ((int64_t)1 << in1_fl);
}
else
{
src1Scale = (float)((int64_t)1 << -in1_fl);
}
}
else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM
|| attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM)
@ -242,7 +266,8 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
attr[1]->dtype, attr[2]->dtype );
if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == BF16)
|| ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) )
|| ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16))
|| (attr[0]->dtype == I16 && attr[1]->dtype == I16 && attr[2]->dtype == U8) )
{
gpu_param.global_scale[0] = 8;
gpu_param.global_scale[1] = 1;
@ -384,6 +409,8 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
case _PACK_SELECT_KEY( U8, U8, U8 ):
case _PACK_SELECT_KEY( U8, F16, U8 ):
case _PACK_SELECT_KEY( F16, F16, U8 ):
case _PACK_SELECT_KEY( U8, U8, I16 ):
case _PACK_SELECT_KEY( I16, I16, U8 ):
{
uint16_t M0 = 0;
uint16_t M1 = 0;
@ -427,12 +454,15 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
status = vsi_nn_kernel_gpu_add_param( node, "multAndoutZP1", &multAndoutZP1 );
CHECK_STATUS_FAIL_GOTO(status, final );
if (attr[0]->dtype == U8)
if (attr[0]->dtype == U8 || attr[0]->dtype == I16)
{
status = vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift0_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
if (attr[0]->dtype != I16)
{
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift0_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
}
status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 );
CHECK_STATUS_FAIL_GOTO(status, final );
}
@ -461,8 +491,11 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer)
gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_Hi_2x8, postShift1 );
status = vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift1_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
if (attr[0]->dtype != I16)
{
status |= vsi_nn_kernel_gpu_add_param( node,
"uniU8MulAndPostShift1_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8 );
}
CHECK_STATUS_FAIL_GOTO(status, final );
}
}
@ -751,7 +784,6 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM,
tmp_inputs, 2, outputs, 1 );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM );
}
}
return node;
@ -760,4 +792,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( minimum, _setup )

View File

@ -101,14 +101,17 @@ static const struct {
TENSOR_MOMENTS_KERNELS(I8, F16, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS(I16, F16, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS(F16, F16, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS(BF16,BF16,0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS(U8, F16, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS(I8, F16, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS(I16, F16, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS(F16, F16, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS(BF16,BF16,1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS(U8, F16, 2, KERNEL_SOURCE_3)
TENSOR_MOMENTS_KERNELS(I8, F16, 2, KERNEL_SOURCE_3)
TENSOR_MOMENTS_KERNELS(I16, F16, 2, KERNEL_SOURCE_3)
TENSOR_MOMENTS_KERNELS(F16, F16, 2, KERNEL_SOURCE_3)
TENSOR_MOMENTS_KERNELS(BF16,BF16,2, KERNEL_SOURCE_3)
TENSOR_MOMENTS_KERNELS(U8, U8, 0, KERNEL_SOURCE_6)
TENSOR_MOMENTS_KERNELS(U8, U8, 1, KERNEL_SOURCE_6)
TENSOR_MOMENTS_KERNELS(U8, U8, 2, KERNEL_SOURCE_6)
@ -116,26 +119,31 @@ static const struct {
TENSOR_MOMENTS_TWO_AXIS_KERNELS(I8, F16, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS(I16, F16, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS(F16, F16, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS(BF16,BF16,0, 1, KERNEL_SOURCE_7)
TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, U8, 0, 1, KERNEL_SOURCE_6)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, F16, 0, 1, 2, KERNEL_SOURCE_5)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(I8, F16, 0, 1, 2, KERNEL_SOURCE_5)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(I16, F16, 0, 1, 2, KERNEL_SOURCE_5)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(F16, F16, 0, 1, 2, KERNEL_SOURCE_5)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(BF16,BF16,0, 1, 2, KERNEL_SOURCE_5)
TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, U8, 0, 1, 2, KERNEL_SOURCE_7)
TENSOR_MOMENTS_KERNELS_2D(U8, F16, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS_2D(I8, F16, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS_2D(I16, F16, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS_2D(F16, F16, 0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS_2D(BF16,BF16,0, KERNEL_SOURCE_1)
TENSOR_MOMENTS_KERNELS_2D(U8, F16, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS_2D(I8, F16, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS_2D(I16, F16, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS_2D(F16, F16, 1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS_2D(BF16,BF16,1, KERNEL_SOURCE_2)
TENSOR_MOMENTS_KERNELS_2D(U8, U8, 0, KERNEL_SOURCE_6)
TENSOR_MOMENTS_KERNELS_2D(U8, U8, 1, KERNEL_SOURCE_6)
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(U8, F16, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(I8, F16, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(I16, F16, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(F16, F16, 0, 1, KERNEL_SOURCE_4)
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(BF16,BF16,0, 1, KERNEL_SOURCE_7)
TENSOR_MOMENTS_TWO_AXIS_KERNELS_2D(U8, U8, 0, 1, KERNEL_SOURCE_6)
};
@ -461,6 +469,36 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
switch( pack_key )
{
@ -494,6 +532,18 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, 1, 0):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8",
&uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8",
&uniConvBF16toF32_Part1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8",
&uniExtractOddData_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U8, F16, 1, 1):
case _PACK_SELECT_KEY( I8, F16, 1, 1):
case _PACK_SELECT_KEY( I16, F16, 1, 1):
@ -518,6 +568,16 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, 1, 1):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8",
&uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8",
&uniExtractOddData_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U8, F16, 1, 2):
case _PACK_SELECT_KEY( I8, F16, 1, 2):
case _PACK_SELECT_KEY( I16, F16, 1, 2):
@ -542,6 +602,15 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, 1, 2):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8",
&uniExtractOddData_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U8, F16, 2, 0):
case _PACK_SELECT_KEY( I8, F16, 2, 0):
case _PACK_SELECT_KEY( I16, F16, 2, 0):
@ -597,6 +666,18 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, 2, 0):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8",
&uniConvBF16toF32_Part1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8",
&uniExtractOddData_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( F16, F16, 3, 0):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2);
@ -608,6 +689,19 @@ DEF_KERNEL_INITIALIZER(_moments_initializer)
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( BF16, BF16, 3, 0):
{
status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8",
&uniConvBF16toF32_Part1_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractOddData_2x8",
&uniExtractOddData_2x8);
status |= vsi_nn_kernel_gpu_add_param(node, "width", &width);
status |= vsi_nn_kernel_gpu_add_param(node, "height", &height);
status |= vsi_nn_kernel_gpu_add_param(node, "channel", &chn);
CHECK_STATUS_FAIL_GOTO(status, OnError );
}
break;
case _PACK_SELECT_KEY( U8, U8, 1, 0):
case _PACK_SELECT_KEY( U8, U8, 1, 1):
case _PACK_SELECT_KEY( U8, U8, 1, 2):

View File

@ -68,27 +68,29 @@ typedef struct
static const _kernel_map_type _one_hot_kernel_map[] =
{
// Register kernel here
PACK_ONE_HOT_KERNEL_3D( U8, U8 ),
PACK_ONE_HOT_KERNEL_3D( U8, F16 ),
PACK_ONE_HOT_KERNEL_3D( I8, I8 ),
PACK_ONE_HOT_KERNEL_3D( I8, F16 ),
PACK_ONE_HOT_KERNEL_3D( I16, I16 ),
PACK_ONE_HOT_KERNEL_3D( I16, F16 ),
PACK_ONE_HOT_KERNEL_3D( F16, F16 ),
PACK_ONE_HOT_KERNEL_3D( F16, I16 ),
PACK_ONE_HOT_KERNEL_3D( F16, U8 ),
PACK_ONE_HOT_KERNEL_3D( F16, I8 ),
PACK_ONE_HOT_KERNEL_3D( U8, U8 ),
PACK_ONE_HOT_KERNEL_3D( U8, F16 ),
PACK_ONE_HOT_KERNEL_3D( I8, I8 ),
PACK_ONE_HOT_KERNEL_3D( I8, F16 ),
PACK_ONE_HOT_KERNEL_3D( I16, I16 ),
PACK_ONE_HOT_KERNEL_3D( I16, F16 ),
PACK_ONE_HOT_KERNEL_3D( F16, F16 ),
PACK_ONE_HOT_KERNEL_3D( F16, I16 ),
PACK_ONE_HOT_KERNEL_3D( F16, U8 ),
PACK_ONE_HOT_KERNEL_3D( F16, I8 ),
PACK_ONE_HOT_KERNEL_3D( BF16, BF16 ),
PACK_ONE_HOT_KERNEL_2D( U8, U8 ),
PACK_ONE_HOT_KERNEL_2D( U8, F16 ),
PACK_ONE_HOT_KERNEL_2D( I8, I8 ),
PACK_ONE_HOT_KERNEL_2D( I8, F16 ),
PACK_ONE_HOT_KERNEL_2D( I16, I16 ),
PACK_ONE_HOT_KERNEL_2D( I16, F16 ),
PACK_ONE_HOT_KERNEL_2D( F16, F16 ),
PACK_ONE_HOT_KERNEL_2D( F16, I16 ),
PACK_ONE_HOT_KERNEL_2D( F16, U8 ),
PACK_ONE_HOT_KERNEL_2D( F16, I8 ),
PACK_ONE_HOT_KERNEL_2D( U8, U8 ),
PACK_ONE_HOT_KERNEL_2D( U8, F16 ),
PACK_ONE_HOT_KERNEL_2D( I8, I8 ),
PACK_ONE_HOT_KERNEL_2D( I8, F16 ),
PACK_ONE_HOT_KERNEL_2D( I16, I16 ),
PACK_ONE_HOT_KERNEL_2D( I16, F16 ),
PACK_ONE_HOT_KERNEL_2D( F16, F16 ),
PACK_ONE_HOT_KERNEL_2D( F16, I16 ),
PACK_ONE_HOT_KERNEL_2D( F16, U8 ),
PACK_ONE_HOT_KERNEL_2D( F16, I8 ),
PACK_ONE_HOT_KERNEL_2D( BF16, BF16 ),
};
@ -274,6 +276,51 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer)
"depth", &depth );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
case BF16:
{
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractInteger_2x8 = {{
0x33333333, // TCfg
0x11110000, // ASelt
0x03020100, 0x03020100, // ABin
0x00000000, // BSelt
0x00000000, 0x00000000, // BBin
0x00002400, // AccumType, ConstantType, and PostShift
0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant
}, GPU_DP_TYPE_16 };
status = vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtract8Data_2x8", &uniExtractInteger_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"depth", &depth );
CHECK_STATUS_FAIL_GOTO(status, final );
}
break;
default:
break;
}

View File

@ -98,7 +98,6 @@ static const struct {
PRELU_KERNELS_2D(I8, F16, F16, _2D, KERNEL_SOURCE0)
PRELU_KERNELS_2D(U8, U8, U8, _2D, KERNEL_SOURCE0)
PRELU_KERNELS_2D(U8, U8, F16, _2D, KERNEL_SOURCE0)
};
static vx_param_description_t kernel_param_def[] =
@ -199,6 +198,7 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer)
}
else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM )
{
out_fl = 1;
outputZP = (float)attr[2]->asymm.zero_point;
input_scale0 = input_scale0 / attr[2]->asymm.scale;
}
@ -628,7 +628,6 @@ static vsi_nn_kernel_node_t _setup
vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM,
reshape_tensors, 2, &reshape_tensors[2], 1 );
status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM );
}
}
@ -643,4 +642,3 @@ final:
__END_DECLS
REGISTER_BACKEND_EVIS( prelu, _setup )

View File

@ -51,11 +51,13 @@ typedef enum
UP_2X_HALF,
UP_3X_HALF,
UP_4X_HALF,
UP_8X_HALF,
} _internal_scale_e;
#define _RESIZE_BILINEAR_KERNEL_SOURCE(_input_type) "resize_bilinear_"#_input_type
#define _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(_input_type) "resize_bilinear_"#_input_type"_opt"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_1"
#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers_2"
#define STR(a) #a
// Add kernel hashtable here
@ -81,19 +83,25 @@ typedef enum
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_2x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) }
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_4x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) }
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_8X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_8X_HALF ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_8x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC2(IN_DTYPE) }
#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \
{ RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF ), \
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \
"_SAME_3x_upsample_half_pixel_centers"), \
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) }
_RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC1(IN_DTYPE) }
typedef struct
{
@ -120,6 +128,7 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] =
PACK_KERNEL_MAP_UP_2X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_3X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_4X_HALF(U8, U8),
PACK_KERNEL_MAP_UP_8X_HALF(U8, U8),
};
@ -224,6 +233,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
vsi_bool is_2x_up_kernel = FALSE;
vsi_bool is_3x_up_kernel = FALSE;
vsi_bool is_4x_up_kernel = FALSE;
vsi_bool is_8x_up_kernel = FALSE;
input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] );
CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final );
@ -280,6 +290,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height);
is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height);
is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height);
is_8x_up_kernel = (8 * in_width == out_width) && (8 * in_height == out_height);
}
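/* These fixed-ratio checks pick dedicated half-pixel-centers upsample shaders;
 * any other scale factor falls back to the generic bilinear path further down,
 * which uses the half_pixel_value uniform instead.
 */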
if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant )
@ -330,7 +341,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
outputZP = 0;
}
if (is_2x_up_kernel || is_4x_up_kernel)
if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel)
{
gpu_param.global_scale[0] = 16;
gpu_param.global_scale[1] = 1;
@ -479,6 +490,76 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (is_8x_up_kernel)
{
gpu_dp_inst_t uniResize8xUp_l00_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l01_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x513f3f31, 0x632d4d23, 0x751b5b15, 0x87096907,
0x09870769, 0x1b75155b, 0x2d63234d, 0x3f51313f // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l10_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l11_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x634d2d23, 0x79373719, 0x8f21410f, 0xa50b4b05,
0x0ba5054b, 0x218f0f41, 0x37791937, 0x4d63232d // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l20_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l21_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x755b1b15, 0x8f41210f, 0xa9272709, 0xc30d2d03,
0x0dc3032d, 0x27a90927, 0x418f0f21, 0x5b75151b // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l30_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x0208c020, 0x08c0208c, 0x44418c02, 0x41944419, 0x94441944, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16};
gpu_dp_inst_t uniResize8xUp_l31_4x8 = {{
0x55555555, 0x55555555, // TCfg
0x44194441, 0x19444194, 0xc8629444, 0x629c8629, 0x9c8629c8, // BinSelect
0x00000708, // AccumType, ConstantType, and PostShift
0x87690907, 0xa54b0b05, 0xc32d0d03, 0xe10f0f01,
0x0fe1010f, 0x2dc3030d, 0x4ba5050b, 0x69870709 // Constant
}, GPU_DP_TYPE_16};
status = vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l00_4x8", &uniResize8xUp_l00_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l01_4x8", &uniResize8xUp_l01_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l10_4x8", &uniResize8xUp_l10_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l11_4x8", &uniResize8xUp_l11_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l20_4x8", &uniResize8xUp_l20_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l21_4x8", &uniResize8xUp_l21_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l30_4x8", &uniResize8xUp_l30_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "uniResize8xUp_l31_4x8", &uniResize8xUp_l31_4x8);
status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height);
CHECK_STATUS_FAIL_GOTO(status, final );
}
else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant)
{
float dfpScale = input_scale * output_scale;
@ -965,25 +1046,25 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer)
goto final;
}
if (!is_2x_up_kernel && !is_3x_up_kernel && !is_4x_up_kernel)
if (!is_2x_up_kernel && !is_3x_up_kernel && !is_4x_up_kernel && !is_8x_up_kernel)
{
status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value);
CHECK_STATUS_FAIL_GOTO(status, final );
}
if (is_2x_up_kernel || is_4x_up_kernel)
if (is_2x_up_kernel || is_4x_up_kernel || is_8x_up_kernel)
{
gpu_param.global_size[0] = gpu_align_p2((out_width + \
gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = depth;
gpu_param.dim = 2;
gpu_param.global_size[0] = gpu_align_p2((out_width + \
gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = depth;
gpu_param.dim = 2;
}
else
{
gpu_param.global_size[0] = gpu_align_p2((out_width + \
gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1];
gpu_param.global_size[2] = depth / gpu_param.global_scale[2];
gpu_param.global_size[0] = gpu_align_p2((out_width + \
gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4);
gpu_param.global_size[1] = (out_height + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1];
gpu_param.global_size[2] = depth / gpu_param.global_scale[2];
}
status = vsi_nn_kernel_gpu_config( node, &gpu_param );
@ -1024,6 +1105,8 @@ static vsi_status _query_kernel
&& (3 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
vsi_bool is_4x_upsample =(4 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
&& (4 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
vsi_bool is_8x_upsample =(8 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \
&& (8 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]);
_internal_scale_e scale_flag = UP;
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
@ -1032,6 +1115,7 @@ static vsi_status _query_kernel
is_2x_upsample &= (in_dtype == U8);
is_3x_upsample &= (in_dtype == U8);
is_4x_upsample &= (in_dtype == U8);
is_8x_upsample &= (in_dtype == U8);
if (outputs[0]->attr.size[0] > inputs[0]->attr.size[0])
{
@ -1047,6 +1131,10 @@ static vsi_status _query_kernel
{
scale_flag = UP_4X_HALF;
}
else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_8x_upsample)
{
scale_flag = UP_8X_HALF;
}
else if (is_same_type && is_evis2)
{
scale_flag = UP_OPT;
@ -1123,7 +1211,6 @@ static vsi_status _query_kernel
}
return status;
} /* _query_kernel() */
static vsi_nn_tensor_t* _create_scale_tensor
@ -1307,4 +1394,3 @@ static vsi_nn_kernel_node_t _setup
__END_DECLS
REGISTER_BACKEND_EVIS( resize_bilinear, _setup )

View File

@ -74,6 +74,7 @@ static const struct {
TENSOR_SCATTER_ND_KERNELS(I32, U8, U8, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_KERNELS(I32, I16, I16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_KERNELS(I32, F16, F16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_KERNELS(I32, BF16,BF16, KERNEL_SOURCE_1)
TENSOR_SCATTER_ND_BIG_KERNELS(I32, I8, I8, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_BIG_KERNELS(I32, U8, U8, KERNEL_SOURCE_2)
TENSOR_SCATTER_ND_BIG_KERNELS(I32, I16, I16, KERNEL_SOURCE_2)
@ -250,8 +251,45 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_initializer)
0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x01050004, 0x03070206, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{
0x11111111, // TCfg
0x01010101, // ASelt
0x05050404, 0x07070606, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16 };
gpu_dp_inst_t uniExtractOddData_2x8 = {{
0x11111111, // TCfg
0x11110000, // ASelt
0x07050301, 0x07050301, // ABin
0x22222222, // BSelt
0x00000000, 0x00000000, // BBin
0x00000600, // AccumType, ConstantType, and PostShift
0x00000001, 0x00000001, 0x00000001, 0x00000001,
0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant
}, GPU_DP_TYPE_16};
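/*
 * Illustrative sketch (not part of the original change): bfloat16 keeps the top 16
 * bits of an IEEE-754 float32, so the two uniConvBF16toF32_Part*_2x8 tables above
 * only need to move each 16-bit lane into the high half of a 32-bit word; no
 * arithmetic is involved.  A scalar equivalent, for reference only:
 *
 *   static float bf16_to_f32(uint16_t bf16)
 *   {
 *       uint32_t bits = ((uint32_t)bf16) << 16;   // place BF16 bits in the high half
 *       float f;
 *       memcpy(&f, &bits, sizeof(f));             // reinterpret as float32
 *       return f;
 *   }
 *
 * uniExtractOddData_2x8 performs the reverse step on store, keeping only the high
 * 16-bit halves of each float32 lane.
 */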
status = vsi_nn_kernel_gpu_add_param( node,
"uniAccumulateSum_2x8", &uniAccumulateSum_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node,
"uniExtractOddData_2x8", &uniExtractOddData_2x8 );
status |= vsi_nn_kernel_gpu_add_param( node, "index_num", &index_num );
status |= vsi_nn_kernel_gpu_add_param( node, "zeropoint", &output_zp );
status |= vsi_nn_kernel_gpu_add_param( node, "offsetX", &offsetX );

View File

@ -67,6 +67,13 @@ static vsi_status _gpu_register
vsi_nn_kernel_t* kernel
);
static vsi_status _gpu_register_ext
(
vsi_nn_graph_t* graph,
vsi_nn_kernel_t* kernel,
const char** resources
);
static vx_program _create_program_from_executable
(
vsi_nn_graph_t* graph,
@ -79,6 +86,13 @@ static vx_program _create_program_from_code
vsi_nn_kernel_t* kernel
);
static vx_program _create_program_from_code_ext
(
vsi_nn_graph_t* graph,
vsi_nn_kernel_t* kernel,
const char** resources
);
static const uint8_t* _load_internal_executable
(
const char* source_name,
@ -104,6 +118,14 @@ static void _kernel_clear_source
static vsi_bool _check_shader_support(vsi_nn_graph_t* graph);
static vsi_bool vsi_nn_kernel_is_asymmtric_int8
(
vsi_nn_tensor_t** inputs,
size_t input_num,
vsi_nn_tensor_t** outputs,
size_t output_num
);
static vsi_status VX_CALLBACK _kernel_validator
(
vx_node node,
@ -290,7 +312,7 @@ static char* _load_source_code_from_file
size_t read_bytes;
source = NULL;
//TODO: Pack new name
fp = fopen( source_name, "rb" );
fp = vsi_nn_fopen( source_name, "rb" );
if( NULL == fp )
{
VSILOGE("Open program file %s fail.", source_name);
@ -414,6 +436,58 @@ static vx_program _create_program_from_code
return program;
} /* _create_program_from_code() */
static vx_program _create_program_from_code_ext
(
vsi_nn_graph_t* graph,
vsi_nn_kernel_t* kernel,
const char** resources
)
{
const vsi_nn_kernel_source_info_t* source_info;
kernel_program_info_t* program_info;
size_t i;
vx_program program = NULL;
source_info = &kernel->gpu.sources[VSI_NN_GPU_SOURCE_FMT_CODE];
if( source_info->num == 0 )
{
VSILOGE("Not executable source found in kernel.");
return NULL;
}
program_info = (kernel_program_info_t*)malloc(
source_info->num * sizeof(kernel_program_info_t) );
if( !program_info )
{
VSILOGE("Malloc program memory fail.");
return NULL;
}
memset( program_info, 0, source_info->num * sizeof(kernel_program_info_t) );
for( i = 0; i < source_info->num; i ++ )
{
program_info[i].data = (const void*)(resources[i]);
if( !program_info[i].data )
{
program_info[i].reserve_mem = (void*)_load_source_code_from_file(
source_info->data[i], &program_info[i].size );
program_info[i].data = (const void*)program_info[i].reserve_mem;
}
}
program = _create_program( graph->ctx->c, program_info, source_info->num );
if( program_info )
{
for( i = 0; i < source_info->num; i ++ )
{
if( program_info[i].reserve_mem )
{
free( program_info[i].reserve_mem );
}
}
free( program_info );
}
return program;
} /* _create_program_from_code_ext() */
static vx_program _create_program_from_executable
(
vsi_nn_graph_t* graph,
@ -547,6 +621,113 @@ static vsi_status _gpu_register
return status;
} /* _gpu_register() */
static vsi_status _gpu_register_ext
(
vsi_nn_graph_t* graph,
vsi_nn_kernel_t* kernel,
const char** resources
)
{
vsi_status status;
vx_kernel_description_t* info;
vx_kernel obj;
vsi_nn_context_t context;
vx_program program = NULL;
const vsi_nn_gpu_source_fmt_e active_fmt = kernel->gpu.active_source_fmt;
#define MAX_BUILDPROGRAM_LEN 1024
char cmd[MAX_BUILDPROGRAM_LEN] = { 0 };
size_t cost_bytes = 0;
memset( cmd, 0, sizeof(char) * MAX_BUILDPROGRAM_LEN );
context = graph->ctx;
status = VSI_FAILURE;
info = &(kernel->info);
switch( active_fmt )
{
case VSI_NN_GPU_SOURCE_FMT_CODE:
program = _create_program_from_code_ext( graph, kernel, resources );
break;
case VSI_NN_GPU_SOURCE_FMT_EXECUTABLE:
program = _create_program_from_executable( graph, kernel );
break;
default:
VSILOGE("Unknown source format %d", kernel->gpu.active_source_fmt);
break;
}
if( NULL == program )
{
return status;
}
if( context->config.evis.ver == VSI_NN_HW_EVIS_NONE )
{
// set default evis version to 2
if( VSI_NN_KERNEL_TYPE_EVIS == kernel->type )
{
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=%d",
context->config.use_40bits_va );
}
}
else
{
cost_bytes = snprintf( cmd, MAX_BUILDPROGRAM_LEN,
"-cl-viv-vx-extension -D VX_VERSION=%d -D USE_40BITS_VA=%d",
context->config.evis.ver, context->config.use_40bits_va );
}
// Pack build option
if( kernel->gpu.sources[active_fmt].build_option.data )
{
vsi_nn_kernel_build_option_t * option = &kernel->gpu.sources[active_fmt].build_option;
if( MAX_BUILDPROGRAM_LEN - cost_bytes > strlen( option->data ) + 1 )
{
snprintf( &cmd[cost_bytes], MAX_BUILDPROGRAM_LEN - cost_bytes,
" %s", option->data );
}
else
{
VSILOGE("Build option is too long!");
VSI_ASSERT( FALSE );
}
}
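/*
 * Illustrative note (not part of the original change): on an EVIS 2 device with a
 * 32-bit VA configuration and a kernel that carries the build option
 * "-D LOCAL_SIZE=16", the command assembled above and passed to vxBuildProgram()
 * below would read (values chosen only as an example):
 *
 *   "-cl-viv-vx-extension -D VX_VERSION=2 -D USE_40BITS_VA=0 -D LOCAL_SIZE=16"
 */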
status = vxBuildProgram( program, cmd );
if( VSI_SUCCESS != status )
{
VSILOGE("Build program fail.");
return status;
}
obj = vxAddKernelInProgram(
program,
info->name,
info->enumeration,
info->numParams,
info->validate,
info->initialize,
info->deinitialize
);
if( obj )
{
status = _kernel_init_obj( info, obj );
//vxReleaseKernel( &obj );
}
else
{
VSILOGE( "Add kernel %s fail.", info->name );
}
if( program )
{
vxReleaseProgram( &program );
}
return status;
} /* _gpu_register_ext() */
static vsi_status _kernel_init_obj
(
vx_kernel_description_t* info,
@ -620,6 +801,19 @@ vsi_status vsi_nn_kernel_register
return status;
} /* vsi_nn_kernel_register() */
vsi_status vsi_nn_kernel_register_ext
(
vsi_nn_graph_t * graph,
vsi_nn_kernel_t * kernel,
const char** resources
)
{
vsi_status status;
status = VSI_FAILURE;
status = _gpu_register_ext( graph, kernel, resources );
return status;
} /* vsi_nn_kernel_register_ext() */
vsi_nn_kernel_node_t vsi_nn_kernel_create_node
(
vsi_nn_graph_t* graph,
@ -667,7 +861,6 @@ vsi_nn_kernel_node_t vsi_nn_kernel_create_node
status = vxGetStatus( (vx_reference)obj );
if (VSI_SUCCESS != status)
{
fprintf(stderr, "\n"); // TODO: This is a hack for driver msg
/* Register kernel */
status = vsi_nn_kernel_register( graph, kernel );
if( VSI_SUCCESS != status )
@ -712,6 +905,92 @@ vsi_nn_kernel_node_t vsi_nn_kernel_create_node
return (vsi_nn_kernel_node_t)node;
} /* vsi_nn_kernel_create_node() */
vsi_nn_kernel_node_t vsi_nn_kernel_create_node_ext
(
vsi_nn_graph_t * graph,
vsi_nn_kernel_t * kernel,
const char** resources
){
vsi_status status;
vx_context ctx;
vx_kernel obj;
vx_node node;
vx_kernel_description_t* info;
info = &(kernel->info);
// Validate kernel
if( !info->initialize )
{
VSILOGE("Kernel %s initializer is NULL", info->name);
return NULL;
}
if( !info->validate )
{
VSILOGE("Kernel %s validator is NULL", info->name);
return NULL;
}
if( !info->deinitialize )
{
VSILOGE("Kernel %s deinitializer is NULL", info->name);
return NULL;
}
if( info->enumeration == KERNEL_ID_PLACEHOLDER )
{
//VSILOGD("Kernel id: %#x, %#x", kernel->unique_id, info->enumeration);
info->enumeration = (vx_enum)kernel->unique_id;
}
ctx = vxGetContext( (vx_reference)graph->g );
obj = vxGetKernelByName( ctx, info->name );
status = vxGetStatus( (vx_reference)obj );
if (VSI_SUCCESS != status)
{
fprintf(stderr, "\n"); // TODO: This is a hack for driver msg
/* Register kernel */
status = vsi_nn_kernel_register_ext( graph, kernel, resources );
if( VSI_SUCCESS != status )
{
VSILOGE( "Register client kernel %s fail with %d.",
info->name, status );
return NULL;
}
else
{
VSILOGD( "Register client kernel %s successfully.",
info->name );
}
/* Load kernel */
obj = vxGetKernelByName( ctx, info->name );
status = vxGetStatus( (vx_reference)obj );
}
if( VSI_SUCCESS != status )
{
VSILOGE( "Load client kernel %s fail with %d.",
info->name, status );
return NULL;
}
node = vxCreateGenericNode( graph->g, obj );
vxReleaseKernel( &obj );
status = vxGetStatus( (vx_reference)node );
if( VSI_SUCCESS != status )
{
VSILOGE( "Load client node from kernel %s fail with %d.",
info->name, status );
return NULL;
}
if( node )
{
// Set default border mode.
vx_border_t border;
border.mode = VX_BORDER_REPLICATE;
border.constant_value.U32 = 0;
status |= vxSetNodeAttribute( node, VX_NODE_BORDER, &border, sizeof(border) );
}
return (vsi_nn_kernel_node_t)node;
} /* vsi_nn_kernel_create_node_ext() */
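/*
 * Illustrative usage sketch (not part of the original change): the _ext entry
 * points let a caller hand in kernel sources that are already in memory, one
 * string per source registered for VSI_NN_GPU_SOURCE_FMT_CODE; NULL entries fall
 * back to loading from file as in _create_program_from_code_ext() above.  The
 * resource array and source string below are hypothetical; only the call shape is
 * taken from the code above.
 *
 *   static const char s_my_kernel_src[] = "__kernel void my_op(...) { ... }";
 *
 *   const char* resources[] = { s_my_kernel_src };
 *   vsi_nn_kernel_node_t node =
 *       vsi_nn_kernel_create_node_ext( graph, kernel, resources );
 *   if( !node )
 *   {
 *       VSILOGE( "Create node from in-memory source fail." );
 *   }
 */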
vsi_status vsi_nn_kernel_node_set_border
(vsi_nn_kernel_node_t node,
vx_border_t* border)
@ -987,7 +1266,8 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector
/* Skip evis and cl when disable shader */
if ( (type == VSI_NN_KERNEL_TYPE_EVIS || type == VSI_NN_KERNEL_TYPE_CL)
&& _check_shader_support(graph) == FALSE)
&& ( _check_shader_support(graph) == FALSE ||
vsi_nn_kernel_is_asymmtric_int8(inputs, input_num, outputs, output_num) ) )
{
continue;
}
@ -1292,3 +1572,38 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph)
return FALSE;
}
static vsi_bool vsi_nn_kernel_is_asymmtric_int8
(
vsi_nn_tensor_t** inputs,
size_t input_num,
vsi_nn_tensor_t** outputs,
size_t output_num
)
{
size_t i = 0;
for (i = 0; i < input_num; i++)
{
if ( inputs[i] &&
inputs[i]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
inputs[i]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC
)
{
return TRUE;
}
}
for (i = 0; i < output_num; i++)
{
if ( outputs[i] &&
outputs[i]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 &&
outputs[i]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC
)
{
return TRUE;
}
}
return FALSE;
}

View File

@ -361,7 +361,6 @@ vsi_bool vsi_nn_kernel_optimize_softmax_shape
return ret;
} /* vsi_nn_kernel_optimize_softmax_shape() */
typedef enum
{
TILE_STATE_AXIS_X = 0,
@ -611,4 +610,47 @@ vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape
*out_rank = vsi_nn_min(dim_num, 3);
return TRUE;
}
vsi_status vsi_nn_kernel_optimize_group_norm_shape
(
const vsi_size_t* shape, const uint32_t rank, int32_t groups,
int32_t is_sp_kernel, vsi_size_t* out_shape
)
{
vsi_status status = VSI_SUCCESS;
uint32_t i = 0;
vsi_size_t out_rank = 0;
vsi_size_t group_shape[VSI_NN_MAX_DIM_NUM] = {0};
group_shape[0] = shape[0];
group_shape[1] = shape[1];
group_shape[2] = shape[2] / groups;
vsi_nn_kernel_optimize_element_shape( group_shape, 3, out_shape, &out_rank );
if (!is_sp_kernel && out_shape[1] == 1 && out_rank < 3)
{
out_shape[1] = groups;
out_shape[2] = 1;
out_shape[3] = 1;
for (i = 3; i < rank; i++)
{
out_shape[3] = out_shape[3] * shape[i];
}
}
else if (out_rank == 2)
{
out_shape[2] = groups;
out_shape[3] = 1;
for (i = 3; i < rank; i++)
{
out_shape[3] = out_shape[3] * shape[i];
}
}
else
{
status = VSI_FAILURE;
}
return status;
}
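/*
 * Illustrative note (not part of the original change): for a rank-4 input of shape
 * {W, H, C, N} with C = groups * channels_per_group, the helper above first
 * flattens the per-group volume {W, H, C / groups} with
 * vsi_nn_kernel_optimize_element_shape() and then appends the group and batch
 * dimensions.  For example, shape {7, 5, 32, 2} with groups = 8 gives group_shape
 * {7, 5, 4}; assuming the element-shape helper merges those contiguous dims into
 * {140, 1}, the non-SP branch produces out_shape {140, 8, 1, 2}
 * (elements per group, groups, 1, batch).
 */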

View File

@ -0,0 +1,84 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include <float.h>
#include "utils/vsi_nn_dtype_util_prv.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_error.h"
#include "kernel/vsi_nn_kernel.h"
#include "kernel/vsi_nn_kernel_lut.h"
static vsi_nn_kernel_node_t _setup
(
vsi_nn_graph_t * graph,
vsi_nn_tensor_t ** inputs,
size_t input_num,
vsi_nn_tensor_t ** outputs,
size_t output_num,
const vsi_nn_kernel_param_t * params,
vsi_nn_kernel_t * kernel
)
{
vx_node node = NULL;
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
node = vxBatchNormalizationLayer(
graph->g,
eps,
inputs[1]->t,
inputs[2]->t,
inputs[3]->t,
inputs[4]->t,
inputs[0]->t,
outputs[0]->t
);
return (vsi_nn_kernel_node_t)node;
} /* _setup() */
#define REGISTER_BATCH_NORM_OPENVX_KERNEL(KERNEL_NAME) \
static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num, \
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
) \
{ \
return _setup(graph, inputs, input_num, outputs, output_num, \
params, kernel); \
} \
REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup )
REGISTER_BATCH_NORM_OPENVX_KERNEL( batch_norm )
#undef REGISTER_BATCH_NORM_OPENVX_KERNEL

View File

@ -181,6 +181,51 @@ static vsi_bool _build_vx_conv3d_param
} /* _build_vx_conv2d_param() */
#endif
#if VX_DECONV_3D_API_SUPPORT
static vsi_bool _build_vx_deconv3d_param
(
vx_nn_deconvolution_3d_params_t * param,
int32_t stride_d, int32_t stride_h, int32_t stride_w,
int32_t pad_d_front, int32_t pad_d_end,
int32_t pad_h_front, int32_t pad_h_end,
int32_t pad_w_front, int32_t pad_w_end,
int32_t outpadding_d, int32_t outpadding_h, int32_t outpadding_w,
int32_t group, vsi_enum overflow_policy,
vsi_enum rounding_policy, vsi_enum down_scale_size_rounding
)
{
VSI_ASSERT( stride_d > 0 );
VSI_ASSERT( stride_h > 0 );
VSI_ASSERT( stride_w > 0 );
VSI_ASSERT( outpadding_d >= 0 );
VSI_ASSERT( outpadding_h >= 0 );
VSI_ASSERT( outpadding_w >= 0 );
VSI_ASSERT( group >= 0 );
param->padding_d_front = (uint32_t)pad_d_front;
param->padding_d_rear = (uint32_t)pad_d_end;
param->padding_h_top = (uint32_t)pad_h_front;
param->padding_h_bottom = (uint32_t)pad_h_end;
param->padding_w_left = (uint32_t)pad_w_front;
param->padding_w_right = (uint32_t)pad_w_end;
param->a_w = outpadding_w;
param->a_h = outpadding_h;
param->a_d = outpadding_d;
param->overflow_policy = (vx_enum)overflow_policy;
param->rounding_policy = (vx_enum)rounding_policy;
param->down_scale_size_rounding = (vx_enum)down_scale_size_rounding;
param->channel_group = group;
param->stride_w = (uint32_t)stride_w;
param->stride_h = (uint32_t)stride_h;
param->stride_d = (uint32_t)stride_d;
return TRUE;
} /* _build_vx_deconv3d_param() */
#endif
static vx_tensor _expand_tensor_dim
( vx_tensor tensor, vsi_ssize_t * shape, size_t rank, vsi_ssize_t expand_dim )
{
@ -242,7 +287,7 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d )
vx_node node = NULL;
vx_nn_convolution_params_ext2_t vxparam;
vx_tensor temp_tensors[3] = { NULL };
int i;
uint32_t i = 0;
_build_vx_conv2d_param(
&vxparam,
@ -270,7 +315,6 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d )
{
uint8_t * data = NULL;
vsi_nn_tensor_attr_t attr;
uint32_t i;
data = vsi_nn_ConvertTensorToData( graph, inputs[1] );
CHECK_PTR_FAIL_GOTO( data, "Convert data fail.", final );
@ -317,7 +361,7 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d )
vx_node node = NULL;
vx_nn_convolution_params_ext2_t vxparam;
vx_tensor temp_tensors[3] = { NULL };
int32_t i;
uint32_t i = 0;
vsi_bool need_explicit_padding = FALSE;
_build_vx_conv2d_param(
@ -344,7 +388,7 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d )
new_w_shape[0] = inputs[1]->attr.size[0];
new_w_shape[1] = 1;
new_w_shape[2] = 1;
for (i = 1; i < (int32_t)(inputs[1]->attr.dim_num); i++)
for (i = 1; i < inputs[1]->attr.dim_num; i++)
{
new_w_shape[2] *= inputs[1]->attr.size[i];
}
@ -358,7 +402,6 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d )
{
uint8_t * data = NULL;
vsi_nn_tensor_attr_t attr;
uint32_t i;
data = vsi_nn_ConvertTensorToData( graph, inputs[1] );
CHECK_PTR_FAIL_GOTO( data, "Convert data fail.", final );
@ -576,4 +619,41 @@ REGISTER_CONV_OPENVX_KERNEL( conv3d )
return (vsi_nn_kernel_node_t)node;
} /* conv3d */
#undef REGISTER_CONV_OPENVX_KERNEL
REGISTER_CONV_OPENVX_KERNEL( deconv3d )
{
vx_node node = NULL;
#if VX_DECONV_3D_API_SUPPORT
vx_nn_deconvolution_3d_params_t vxparam;
memset(&vxparam, 0, sizeof(vxparam));
_build_vx_deconv3d_param(
&vxparam,
vsi_nn_kernel_param_get_int32(params, "stride_d"),
vsi_nn_kernel_param_get_int32(params, "stride_h"),
vsi_nn_kernel_param_get_int32(params, "stride_w"),
vsi_nn_kernel_param_get_int32(params, "pad_front"),
vsi_nn_kernel_param_get_int32(params, "pad_end"),
vsi_nn_kernel_param_get_int32(params, "pad_top"),
vsi_nn_kernel_param_get_int32(params, "pad_bottom"),
vsi_nn_kernel_param_get_int32(params, "pad_left"),
vsi_nn_kernel_param_get_int32(params, "pad_right"),
vsi_nn_kernel_param_get_int32(params, "outpadding_w"),
vsi_nn_kernel_param_get_int32(params, "outpadding_h"),
vsi_nn_kernel_param_get_int32(params, "outpadding_w"),
vsi_nn_kernel_param_get_int32(params, "group"),
vsi_nn_kernel_param_get_int32(params, "overflow_policy"),
vsi_nn_kernel_param_get_int32(params, "rounding_policy"),
vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding")
);
node = vxDeconv3dLayer( graph->g,
inputs[0]->t, inputs[1]->t, inputs[2] ? inputs[2]->t : NULL,
&vxparam,
sizeof( vxparam),
outputs[0]->t
);
#endif
return (vsi_nn_kernel_node_t)node;
} /* deconv3d */
#undef REGISTER_CONV_OPENVX_KERNEL

View File

@ -0,0 +1,113 @@
/****************************************************************************
*
* Copyright (c) 2021 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/
#include "vsi_nn_types.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_node.h"
#include "vsi_nn_log.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_tensor_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_dtype_util.h"
#define REGISTER_PAD2_OPENVX_KERNEL( kernel_name ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
); \
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
static vsi_nn_kernel_node_t _##kernel_name##setup \
( \
vsi_nn_graph_t * graph, \
vsi_nn_tensor_t ** inputs, \
size_t input_num, \
vsi_nn_tensor_t ** outputs, \
size_t output_num,\
const vsi_nn_kernel_param_t * params, \
vsi_nn_kernel_t * kernel \
)
REGISTER_PAD2_OPENVX_KERNEL( pad2 )
{
vx_node node = NULL;
vx_nn_pad_params_t param;
size_t dim_num = 0;
int32_t* front_size = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "front_size", &dim_num);
int32_t* back_size = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "back_size", &dim_num);
int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "pad_mode");
int32_t pad_front_array[VSI_NN_MAX_DIM_NUM] = {0};
int32_t pad_back_array[VSI_NN_MAX_DIM_NUM] = {0};
vsi_nn_tensor_t *convert_tensor = NULL;
float const_val = vsi_nn_kernel_param_get_float32(params, "const_val");
memset(&param, 0, sizeof(param));
memset(pad_front_array, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM);
memset(pad_back_array, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM);
memcpy(pad_front_array, front_size, sizeof(int32_t) * dim_num);
memcpy(pad_back_array, back_size, sizeof(int32_t) * dim_num);
param.pad_mode = pad_mode;
param.pad_const = vxCreateScalar( graph->ctx->c, VX_TYPE_FLOAT32, &const_val );
param.numViewDimensions = (uint8_t)vsi_nn_max(dim_num, 2);
param.pad_front_array = pad_front_array;
param.pad_back_array = pad_back_array;
if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE)
{
vsi_nn_tensor_attr_t attr;
memcpy( &attr, &outputs[0]->attr, sizeof( attr ) );
memcpy( &attr.size, &inputs[0]->attr.size, sizeof( attr.size ) );
attr.vtl = FALSE;
attr.is_const = FALSE;
convert_tensor = vsi_nn_CreateTensor(graph, &attr);
node = vxTensorCopyNode(
graph->g,
inputs[0]->t,
convert_tensor->t
);
}
else
{
convert_tensor = vsi_nn_reshape_tensor( graph,
inputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num );
}
node = vxTensorPadNode( graph->g, convert_tensor->t, outputs[0]->t, &param, sizeof(param) );
vxReleaseScalar( &param.pad_const );
vsi_safe_release_tensor(convert_tensor);
return (vsi_nn_kernel_node_t)node;
} /* pad2() */
#undef REGISTER_PAD2_OPENVX_KERNEL

View File

@ -0,0 +1,37 @@
#pragma OPENCL EXTENSION CL_VIV_asm : enable
__kernel void clip_BF16toBF16(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float minData,
float maxData)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
uint4 src0 = read_imageui(input, coord);
src0 = src0 << 16;
float4 src;
_viv_asm(COPY, src, src0, 16);
float4 dst0 = clamp(src, minData, maxData);
uint4 dst;
_viv_asm(COPY, dst, dst0, 16);
dst = dst >> 16;
write_imageui(output, coord, dst);
}
__kernel void clip_BF16toBF16_2D(
__read_only image2d_t input,
__write_only image2d_t output,
float minData,
float maxData)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
uint4 src0 = read_imageui(input, coord);
src0 = src0 << 16;
float4 src;
_viv_asm(COPY, src, src0, 16);
float4 dst0 = clamp(src, minData, maxData);
uint4 dst;
_viv_asm(COPY, dst, dst0, 16);
dst = dst >> 16;
write_imageui(output, coord, dst);
}

View File

@ -0,0 +1,17 @@
__kernel void depth2space_crd_F32toF32(
image2d_array_t input, image2d_array_t output, int block_size)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidz = get_global_id(2);
int4 coord_out = (int4)(gidx, gidy, gidz, 0);
int block_e2 = block_size * block_size;
ushort blk = (ushort)block_size;
int inx = (int)((ushort)gidx / blk);
int iny = (int)((ushort)gidy / blk);
int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2;
int4 coord_in = (int4)(inx, iny, inz, 0);
float4 data = read_imagef(input, coord_in);
write_imagef(output, coord_out, data);
}
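/*
 * Illustrative note (not part of the original change): in CRD (column-row-depth)
 * mode the input channel index interleaves the block offsets before the output
 * channel.  For block_size = 2, output pixel (gidx, gidy, gidz) = (5, 3, 1) reads
 * input (5 / 2, 3 / 2, (5 % 2) + (3 % 2) * 2 + 1 * 4) = (2, 1, 7).
 */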

View File

@ -3,6 +3,11 @@ float eltwise_unary_sin(float x, float alpha, float beta)
return native_sin(x);
}
float eltwise_unary_cos(float x, float alpha, float beta)
{
return native_cos(x);
}
#define logE (1.44269502f)
#define twoLogE (logE * 2.0f)
float eltwise_unary_exp(float x, float alpha, float beta)
@ -135,6 +140,7 @@ __kernel void func_name##_F32toF32 \
write_imagef(output, coord, dst.xxxx); \
}
ELTWISE_UNARY_F32(sin)
ELTWISE_UNARY_F32(cos)
ELTWISE_UNARY_F32(exp)
ELTWISE_UNARY_F32(log)
ELTWISE_UNARY_F32(elu)
@ -168,6 +174,7 @@ __kernel void func_name##_F32toF32_2D \
write_imagef(output, coord, dst.xxxx); \
}
ELTWISE_UNARY_F32_2D(sin)
ELTWISE_UNARY_F32_2D(cos)
ELTWISE_UNARY_F32_2D(exp)
ELTWISE_UNARY_F32_2D(log)
ELTWISE_UNARY_F32_2D(elu)
@ -202,6 +209,7 @@ __kernel void func_name##_U8toU8 \
write_imageui(output, coord, dst); \
}
ELTWISE_UNARY_U8(sin)
ELTWISE_UNARY_U8(cos)
ELTWISE_UNARY_U8(exp)
ELTWISE_UNARY_U8(log)
ELTWISE_UNARY_U8(elu)
@ -236,6 +244,7 @@ __kernel void func_name##_U8toU8_2D \
write_imageui(output, coord, dst); \
}
ELTWISE_UNARY_U8_2D(sin)
ELTWISE_UNARY_U8_2D(cos)
ELTWISE_UNARY_U8_2D(exp)
ELTWISE_UNARY_U8_2D(log)
ELTWISE_UNARY_U8_2D(elu)

View File

@ -1,7 +1,15 @@
__kernel void floordiv_F32F32toF32(
__kernel void floordiv_F32F32toF32
(
__read_only image2d_array_t input,
__read_only image2d_array_t input1,
__write_only image2d_array_t output)
__write_only image2d_array_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
float4 src0;
@ -12,10 +20,18 @@ __kernel void floordiv_F32F32toF32(
write_imagef(output, coord, dst);
}
__kernel void floordiv_F32F32toF32_2D(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output)
__kernel void floordiv_F32F32toF32_2D
(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
float4 src0 = read_imagef(input, coord);
@ -24,33 +40,8 @@ __kernel void floordiv_F32F32toF32_2D(
write_imagef(output, coord, dst);
}
__kernel void floordiv_I32I32toI32(
__read_only image2d_array_t input,
__read_only image2d_array_t input1,
__write_only image2d_array_t output)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 src0;
int4 src1;
READ_IMAGEI_2DARRAY(src0, input, coord);
READ_IMAGEI_2DARRAY(src1, input1, coord);
int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1)));
write_imagei(output, coord, dst);
}
__kernel void floordiv_I32I32toI32_2D(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 src0 = read_imagei(input, coord);
int4 src1 = read_imagei(input1, coord);
int4 dst = convert_int4(floor(convert_float4(src0) / convert_float4(src1)));
write_imagei(output, coord, dst);
}
__kernel void floordiv_I32I32toU8(
__kernel void floordiv_I32I32toI32
(
__read_only image2d_array_t input,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
@ -59,7 +50,56 @@ __kernel void floordiv_I32I32toU8(
float input1Scale,
float input1Tail,
float outputScale,
float outputTail )
float outputTail
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 src0;
int4 src1;
READ_IMAGEI_2DARRAY(src0, input, coord);
READ_IMAGEI_2DARRAY(src1, input1, coord);
float4 in0 = convert_float4(src0) * input0Scale + input0Tail;
float4 in1 = convert_float4(src1) * input1Scale + input1Tail;
float4 out = floor(in0 / in1) * outputScale + outputTail;
int4 dst = convert_int4(out);
write_imagei(output, coord, dst);
}
__kernel void floordiv_I32I32toI32_2D
(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 src0 = read_imagei(input, coord);
int4 src1 = read_imagei(input1, coord);
float4 in0 = convert_float4(src0) * input0Scale + input0Tail;
float4 in1 = convert_float4(src1) * input1Scale + input1Tail;
float4 out = floor(in0 / in1) * outputScale + outputTail;
int4 dst = convert_int4(out);
write_imagei(output, coord, dst);
}
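/*
 * Illustrative note (not part of the original change): the scale/tail pairs fold
 * the usual affine dequantization x = scale * (q - zero_point) into a single
 * multiply-add, with tail = -scale * zero_point precomputed on the host.  For
 * example, with input0Scale = 0.5 and input0Tail = -2.0 (zero point 4), a raw
 * value q = 9 dequantizes to 0.5 * 9 - 2.0 = 2.5 before the floor-divide, and the
 * result is requantized the same way through outputScale/outputTail.
 */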
__kernel void floordiv_I32I32toU8
(
__read_only image2d_array_t input,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 src0;
@ -73,16 +113,18 @@ __kernel void floordiv_I32I32toU8(
write_imageui(output, coord, dst);
}
__kernel void floordiv_I32I32toU8_2D(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail )
__kernel void floordiv_I32I32toU8_2D
(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 src0 = read_imagei(input, coord);
@ -94,7 +136,8 @@ __kernel void floordiv_I32I32toU8_2D(
write_imageui(output, coord, dst);
}
__kernel void floordiv_U8U8toU8(
__kernel void floordiv_U8U8toU8
(
__read_only image2d_array_t input,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
@ -103,7 +146,8 @@ __kernel void floordiv_U8U8toU8(
float input1Scale,
float input1Tail,
float outputScale,
float outputTail )
float outputTail
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
uint4 src0, src1;
@ -117,16 +161,18 @@ __kernel void floordiv_U8U8toU8(
write_imageui(output, coord, dst);
}
__kernel void floordiv_U8U8toU8_2D(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail )
__kernel void floordiv_U8U8toU8_2D
(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
uint4 src0 = read_imageui(input, coord);
@ -139,7 +185,8 @@ __kernel void floordiv_U8U8toU8_2D(
write_imageui(output, coord, dst);
}
__kernel void floordiv_U8I32toU8(
__kernel void floordiv_U8I32toU8
(
__read_only image2d_array_t input,
__read_only image2d_array_t input1,
__write_only image2d_array_t output,
@ -148,7 +195,8 @@ __kernel void floordiv_U8I32toU8(
float input1Scale,
float input1Tail,
float outputScale,
float outputTail )
float outputTail
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
uint4 src0;
@ -163,16 +211,18 @@ __kernel void floordiv_U8I32toU8(
write_imageui(output, coord, dst);
}
__kernel void floordiv_U8I32toU8_2D(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail )
__kernel void floordiv_U8I32toU8_2D
(
__read_only image2d_t input,
__read_only image2d_t input1,
__write_only image2d_t output,
float input0Scale,
float input0Tail,
float input1Scale,
float input1Tail,
float outputScale,
float outputTail
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
uint4 src0 = read_imageui(input, coord);

View File

@ -5,7 +5,8 @@ __kernel void gather_U8toU8(
int block_size,
int block_num,
int axis_num,
int indices_num
int indices_num,
int batch
)
{
int gidx = get_global_id(0); // block_size
@ -29,7 +30,8 @@ __kernel void gather_F16toF16(
int block_size,
int block_num,
int axis_num,
int indices_num
int indices_num,
int batch
)
{
int gidx = get_global_id(0); // block_size
@ -53,7 +55,8 @@ __kernel void gather_I32toI32(
int block_size,
int block_num,
int axis_num,
int indices_num
int indices_num,
int batch
)
{
int gidx = get_global_id(0); // block_size
@ -77,7 +80,8 @@ __kernel void gather_F32toF32(
int block_size,
int block_num,
int axis_num,
int indices_num
int indices_num,
int batch
)
{
int gidx = get_global_id(0); // block_size

View File

@ -0,0 +1,123 @@
__kernel void gather_batch_U8toU8(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num,
int indices_num,
int batch
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int gidz = get_global_id(2); // block_num
int2 coord_idx = (int2)(gidy, 0);
int4 coord_in = (int4)(gidx, 0, 0, 0);
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);
for(; coord_idx.y < batch;)
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
coord_in.y = gidz * axis_num + indice.x;
uint4 data = read_imageui(input0, coord_in);
coord_in.z++;
write_imageui(output, coord, data);
coord.z++;
}
}
__kernel void gather_batch_F16toF16(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num,
int indices_num,
int batch
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int gidz = get_global_id(2); // block_num
int2 coord_idx = (int2)(gidy, 0);
int4 coord_in = (int4)(gidx, 0, 0, 0);
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);
for(; coord_idx.y < batch;)
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
coord_in.y = gidz * axis_num + indice.x;
float4 data = read_imagef(input0, coord_in);
coord_in.z++;
write_imagef(output, coord, data);
coord.z++;
}
}
__kernel void gather_batch_I32toI32(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num,
int indices_num,
int batch
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int gidz = get_global_id(2); // block_num
int2 coord_idx = (int2)(gidy, 0);
int4 coord_in = (int4)(gidx, 0, 0, 0);
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);
for(; coord_idx.y < batch;)
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
coord_in.y = gidz * axis_num + indice.x;
int4 data = read_imagei(input0, coord_in);
coord_in.z++;
write_imagei(output, coord, data);
coord.z++;
}
}
__kernel void gather_batch_F32toF32(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num,
int indices_num,
int batch
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int gidz = get_global_id(2); // block_num
int2 coord_idx = (int2)(gidy, 0);
int4 coord_in = (int4)(gidx, 0, 0, 0);
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);
for(; coord_idx.y < batch;)
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
coord_in.y = gidz * axis_num + indice.x;
float4 data = read_imagef(input0, coord_in);
coord_in.z++;
write_imagef(output, coord, data);
coord.z++;
}
}
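/*
 * Illustrative note (not part of the original change): compared with the plain
 * gather kernels, the *_batch variants read one index per batch from the extra y
 * dimension of input1 and step coord_in.z / coord.z once per batch.  For
 * block_num = 3, axis_num = 10, indices_num = 4, output row gidy = 2 of block
 * gidz = 1 in batch b is taken from input row gidz * axis_num + indices[2, b] of
 * slice b and written to output row gidz * indices_num + gidy = 6 of slice b.
 */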

View File

@ -112,6 +112,48 @@ __kernel void moments_axis0_I32toF32(
vari.x = sqr * dimRatio * input_scale * input_scale;
vari.x = vari.x - mean.x * mean.x;
int2 coord_out = (int2)(gidy, gidz);
write_imagef(output_mean, coord_out, mean);
write_imagef(output_vari, coord_out, vari);
}
__kernel void moments_axis0_BF16toF32(
__read_only image2d_array_t input,
__write_only image2d_t output_mean,
__write_only image2d_t output_vari,
int axis,
int axis_num,
int input_zp,
float input_scale,
int width,
int height,
int chn,
float dimRatio
)
{
int gidy = get_global_id(0);
int gidz = get_global_id(1);
int4 coord0 = (int4)(0, gidy, gidz, 0);
float4 data;
float sum = 0, sqr = 0;
for(coord0.x = 0; coord0.x < width;)
{
uint4 src0 = read_imageui(input, coord0);
src0 = src0 << 16;
_viv_asm(COPY, data, src0, 16);
coord0.x++;
sum = sum + data.x;
sqr = sqr + data.x * data.x;
}
float4 mean, vari;
mean.x = sum * dimRatio;
vari.x = sqr * dimRatio;
vari.x = vari.x - mean.x * mean.x;
int2 coord_out = (int2)(gidy, gidz);
write_imagef(output_mean, coord_out, mean);
write_imagef(output_vari, coord_out, vari);
}

View File

@ -172,3 +172,63 @@ __kernel void moments_axis01_I32toF32(
write_imagef(output_vari, coord_out, vari);
}
}
__kernel void moments_axis01_BF16toF32(
image2d_array_t input, image2d_t output_mean, image2d_t output_vari,
int axis, int axis_num, int input_zp, float input_scale,
int width, int height, int chn, float dimRatio
)
{
int gidx = get_global_id(0);
int gidz = get_global_id(1);
int lidx = get_local_id(0);
int4 coord = (int4)(gidx, 0, gidz, 0);
float4 data;
float sum = 0, sqr = 0;
__local float lcl_sum[16];
__local float lcl_sqr[16];
for(coord.x = gidx; coord.x < width; coord.x += 16)
{
float tmpSum = 0, tmpSqr = 0;
for(coord.y = 0; coord.y < height;)
{
uint4 src0 = read_imageui(input, coord);
src0 = src0 << 16;
_viv_asm(COPY, data, src0, 16);
coord.y++;
tmpSum = tmpSum + data.x;
tmpSqr = tmpSqr + data.x * data.x;
}
sqr += tmpSqr;
sum += tmpSum;
}
lcl_sum[lidx] = sum;
lcl_sqr[lidx] = sqr;
barrier(CLK_LOCAL_MEM_FENCE);
int2 coord_out = (int2)(gidz, 0);
if(lidx == 0)
{
float4 one = (float4)(1, 1, 1, 1);
__local float4* tmp_sum = (__local float4*)lcl_sum;
__local float4* tmp_sqr = (__local float4*)lcl_sqr;
sum = 0; sqr = 0;
for(int i = 0; i < 4; i++)
{
sum += dot(tmp_sum[i], one);
sqr += dot(tmp_sqr[i], one);
}
float4 mean, vari;
mean.x = sum * dimRatio;
vari.x = sqr * dimRatio;
vari.x = vari.x - mean.x * mean.x;
write_imagef(output_mean, coord_out, mean);
write_imagef(output_vari, coord_out, vari);
}
}
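/*
 * Illustrative note (not part of the original change): all of the moments kernels
 * accumulate sum(x) and sum(x^2) and then apply E[x^2] - (E[x])^2, i.e.
 * vari = sqr * dimRatio - mean * mean, with dimRatio the reciprocal of the reduced
 * element count.  For the values {1, 2, 3, 4} this gives mean = 2.5 and
 * vari = 7.5 - 6.25 = 1.25.
 */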

View File

@ -177,3 +177,64 @@ __kernel void moments_axis012_I32toF32(
write_imagef(output_vari, coord_out, vari);
}
}
__kernel void moments_axis012_BF16toF32(
image2d_array_t input, image2d_t output_mean, image2d_t output_vari,
int axis, int axis_num, int input_zp, float input_scale,
int width, int height, int chn, float dimRatio
)
{
int gidx = get_global_id(0);
int lidx = get_local_id(0);
int4 coord = (int4)(gidx, 0, 0, 0);
float4 data;
float sum = 0, sqr = 0;
__local float lcl_sum[16];
__local float lcl_sqr[16];
for(coord.z = 0; coord.z < chn; coord.z++)
{
for(coord.x = gidx; coord.x < width; coord.x += 16)
{
float tmpSum = 0, tmpSqr = 0;
for(coord.y = 0; coord.y < height;)
{
uint4 src0 = read_imageui(input, coord);
src0 = src0 << 16;
_viv_asm(COPY, data, src0, 16);
coord.y++;
tmpSum = tmpSum + data.x;
tmpSqr = tmpSqr + data.x * data.x;
}
sqr += tmpSqr;
sum += tmpSum;
}
}
lcl_sum[lidx] = sum;
lcl_sqr[lidx] = sqr;
barrier(CLK_LOCAL_MEM_FENCE);
int2 coord_out = (int2)(0, 0);
if(lidx == 0)
{
float4 one = (float4)(1, 1, 1, 1);
__local float4* tmp_sum = (__local float4*)lcl_sum;
__local float4* tmp_sqr = (__local float4*)lcl_sqr;
sum = 0; sqr = 0;
for(int i = 0; i < 4; i++)
{
sum += dot(tmp_sum[i], one);
sqr += dot(tmp_sqr[i], one);
}
float4 mean, vari;
mean.x = sum * dimRatio;
vari.x = sqr * dimRatio;
vari.x = vari.x - mean.x * mean.x;
write_imagef(output_mean, coord_out, mean);
write_imagef(output_vari, coord_out, vari);
}
}

View File

@ -106,6 +106,47 @@ __kernel void moments_axis1_I32toF32(
vari.x = sqr * dimRatio * input_scale * input_scale;
vari.x = vari.x - mean.x * mean.x;
int2 coord_out = (int2)(gidx, gidz);
write_imagef(output_mean, coord_out, mean);
write_imagef(output_vari, coord_out, vari);
}
__kernel void moments_axis1_BF16toF32(
__read_only image2d_array_t input,
__write_only image2d_t output_mean,
__write_only image2d_t output_vari,
int axis,
int axis_num,
int input_zp,
float input_scale,
int width,
int height,
int chn,
float dimRatio
)
{
int gidx = get_global_id(0);
int gidz = get_global_id(1);
int4 coord0 = (int4)(gidx, 0, gidz, 0);
float4 data;
float sum = 0, sqr = 0;
for(coord0.y = 0; coord0.y < height;)
{
uint4 src0 = read_imageui(input, coord0);
src0 = src0 << 16;
_viv_asm(COPY, data, src0, 16);
coord0.y++;
sum = sum + data.x;
sqr = sqr + data.x * data.x;
}
float4 mean, vari;
mean.x = sum * dimRatio;
vari.x = sqr * dimRatio;
vari.x = vari.x - mean.x * mean.x;
int2 coord_out = (int2)(gidx, gidz);
write_imagef(output_mean, coord_out, mean);
write_imagef(output_vari, coord_out, vari);
}

View File

@ -123,4 +123,46 @@ __kernel void moments_axis2_I32toF32(
int2 coord_out = (int2)(gidx, gidy);
write_imagef(output_mean, coord_out, mean);
write_imagef(output_vari, coord_out, vari);
}
}
__kernel void moments_axis2_BF16toF32(
__read_only image2d_array_t input,
__write_only image2d_t output_mean,
__write_only image2d_t output_vari,
int axis,
int axis_num,
int input_zp,
float input_scale,
int width,
int height,
int chn,
float dimRatio
)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int4 coord0 = (int4)(gidx, gidy, 0, 0);
float4 data;
float sum = 0, sqr = 0;
for(coord0.z = 0; coord0.z < chn;)
{
uint4 src0 = read_imageui(input, coord0);
src0 = src0 << 16;
_viv_asm(COPY, data, src0, 16);
coord0.z++;
sum = sum + data.x;
sqr = sqr + data.x * data.x;
}
float4 mean, vari;
mean.x = sum * dimRatio;
vari.x = sqr * dimRatio;
vari.x = vari.x - mean.x * mean.x;
int2 coord_out = (int2)(gidx, gidy);
write_imagef(output_mean, coord_out, mean);
write_imagef(output_vari, coord_out, vari);
}

View File

@ -0,0 +1,251 @@
#define TOPK_F32(LOCAL_SIZE0, STAGES) \
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toF32_I32 \
( \
__read_only image2d_t input, \
__write_only image2d_t output, \
__write_only image2d_t indices, \
int num_stages, \
int width \
) \
{ \
uint local_id = get_local_id(0); \
uint work_group_size = get_local_size(0); \
uint offset = 0; \
\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
__local float local_data[128]; \
__local uint local_indices[128]; \
\
float left = read_imagef(input, coord.xy).x; \
coord.z += work_group_size; \
float data = read_imagef(input, coord.zy).x; \
float right = coord.z < width ? data : -2147483647.0f; \
\
local_data[local_id] = left; \
local_indices[local_id] = local_id; \
local_data[local_id + work_group_size] = right; \
local_indices[local_id + work_group_size] = local_id + work_group_size; \
\
barrier(CLK_LOCAL_MEM_FENCE); \
\
for (uint stage = 0; stage < num_stages + 1; ++stage) \
{ \
uint signo = (local_id >> stage) & 1; \
\
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
{ \
uint postShift = (stage - passOfStage); \
uint pairDistance = 1 << postShift; \
\
uint left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \
uint right_id = left_id + pairDistance; \
\
uint left_idx = local_indices[left_id]; \
uint right_idx = local_indices[right_id]; \
\
float left_elem = local_data[left_id]; \
float right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
\
local_indices[left_id] = right_idx; \
local_indices[right_id] = left_idx; \
} \
\
barrier(CLK_LOCAL_MEM_FENCE); \
} \
} \
\
float4 dst; \
dst.x = local_data[local_id]; \
dst.y = local_data[local_id + work_group_size]; \
\
write_imagef(output, coord.xy, dst.xxxx); \
write_imagef(output, coord.zy, dst.yyyy); \
\
int4 index; \
index.x = ((int*)local_indices)[local_id]; \
index.y = ((int*)local_indices)[local_id + work_group_size]; \
\
write_imagei(indices, coord.xy, index.xxxx); \
write_imagei(indices, coord.zy, index.yyyy); \
}
TOPK_F32(1 << 0, 0)
TOPK_F32(1 << 1, 1)
TOPK_F32(1 << 2, 2)
TOPK_F32(1 << 3, 3)
TOPK_F32(1 << 4, 4)
TOPK_F32(1 << 5, 5)
TOPK_F32(1 << 6, 6)
#define TOPK_U32(LOCAL_SIZE0, STAGES) \
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_U32toU32_I32 \
( \
__read_only image2d_t input, \
__write_only image2d_t output, \
__write_only image2d_t indices, \
int num_stages, \
int width \
) \
{ \
uint local_id = get_local_id(0); \
uint work_group_size = get_local_size(0); \
uint offset = 0; \
\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
__local uint local_data[128]; \
__local uint local_indices[128]; \
\
uint left = read_imageui(input, coord.xy).x; \
coord.z += work_group_size; \
uint data = read_imageui(input, coord.zy).x; \
uint right = coord.z < width ? data : 0; \
\
local_data[local_id] = left; \
local_indices[local_id] = local_id; \
local_data[local_id + work_group_size] = right; \
local_indices[local_id + work_group_size] = local_id + work_group_size; \
\
barrier(CLK_LOCAL_MEM_FENCE); \
\
for (uint stage = 0; stage < num_stages + 1; ++stage) \
{ \
uint signo = (local_id >> stage) & 1; \
\
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
{ \
uint postShift = (stage - passOfStage); \
uint pairDistance = 1 << postShift; \
\
uint left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \
uint right_id = left_id + pairDistance; \
\
uint left_idx = local_indices[left_id]; \
uint right_idx = local_indices[right_id]; \
\
uint left_elem = local_data[left_id]; \
uint right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
\
local_indices[left_id] = right_idx; \
local_indices[right_id] = left_idx; \
} \
\
barrier(CLK_LOCAL_MEM_FENCE); \
} \
} \
\
uint4 dst; \
dst.x = local_data[local_id]; \
dst.y = local_data[local_id + work_group_size]; \
\
write_imageui(output, coord.xy, dst.xxxx); \
write_imageui(output, coord.zy, dst.yyyy); \
\
int4 index; \
index.x = ((int*)local_indices)[local_id]; \
index.y = ((int*)local_indices)[local_id + work_group_size]; \
\
write_imagei(indices, coord.xy, index.xxxx); \
write_imagei(indices, coord.zy, index.yyyy); \
}
TOPK_U32(1 << 0, 0)
TOPK_U32(1 << 1, 1)
TOPK_U32(1 << 2, 2)
TOPK_U32(1 << 3, 3)
TOPK_U32(1 << 4, 4)
TOPK_U32(1 << 5, 5)
TOPK_U32(1 << 6, 6)
#define TOPK_I32(LOCAL_SIZE0, STAGES) \
__kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_I32toI32_I32 \
( \
__read_only image2d_t input, \
__write_only image2d_t output, \
__write_only image2d_t indices, \
int num_stages, \
int width \
) \
{ \
int local_id = get_local_id(0); \
int work_group_size = get_local_size(0); \
int offset = 0; \
\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \
\
__local int local_data[128]; \
__local int local_indices[128]; \
\
int left = read_imagei(input, coord.xy).x; \
coord.z += work_group_size; \
int data = read_imagei(input, coord.zy).x; \
int right = coord.z < width ? data : -2147483647; \
\
local_data[local_id] = left; \
local_indices[local_id] = local_id; \
local_data[local_id + work_group_size] = right; \
local_indices[local_id + work_group_size] = local_id + work_group_size; \
\
barrier(CLK_LOCAL_MEM_FENCE); \
\
for (int stage = 0; stage < num_stages + 1; ++stage) \
{ \
int signo = (local_id >> stage) & 1; \
\
for (int passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
{ \
int postShift = (stage - passOfStage); \
int pairDistance = 1 << postShift; \
\
int left_id = ( (local_id >> postShift) << (postShift + 1)) + (local_id & (pairDistance - 1)); \
int right_id = left_id + pairDistance; \
\
int left_idx = local_indices[left_id]; \
int right_idx = local_indices[right_id]; \
\
int left_elem = local_data[left_id]; \
int right_elem = local_data[right_id]; \
\
if ((left_elem < right_elem) ^ signo) \
{ \
local_data[left_id] = right_elem; \
local_data[right_id] = left_elem; \
\
local_indices[left_id] = right_idx; \
local_indices[right_id] = left_idx; \
} \
\
barrier(CLK_LOCAL_MEM_FENCE); \
} \
} \
\
int4 dst; \
dst.x = local_data[local_id]; \
dst.y = local_data[local_id + work_group_size]; \
\
write_imagei(output, coord.xy, dst.xxxx); \
write_imagei(output, coord.zy, dst.yyyy); \
\
int4 index; \
index.x = ((int*)local_indices)[local_id]; \
index.y = ((int*)local_indices)[local_id + work_group_size]; \
\
write_imagei(indices, coord.xy, index.xxxx); \
write_imagei(indices, coord.zy, index.yyyy); \
}
TOPK_I32(1 << 0, 0)
TOPK_I32(1 << 1, 1)
TOPK_I32(1 << 2, 2)
TOPK_I32(1 << 3, 3)
TOPK_I32(1 << 4, 4)
TOPK_I32(1 << 5, 5)
TOPK_I32(1 << 6, 6)
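/*
 * Illustrative note (not part of the original change): each topk_stage##N kernel
 * runs a bitonic sort over 2 * LOCAL_SIZE0 elements held in local memory (every
 * work item loads a left and a right element), so a work group of 2^N items
 * handles widths up to 2^(N+1), assuming the host sets num_stages to STAGES.
 * The (left_elem < right_elem) ^ signo compare flips direction per sub-block to
 * build the bitonic sequence, values and their original indices are swapped
 * together so they stay aligned, and out-of-range lanes are padded with a very
 * small sentinel so they sink to the end of the sorted result.
 */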

View File

@ -3,6 +3,8 @@
_viv_uniform int4 packedArgIdx;
_viv_uniform int argLenSub1;
_viv_uniform VXC_512Bits uniExtractData_2x8;
_viv_uniform VXC_512Bits uniExtract1stU8toI16_2x8;
_viv_uniform VXC_512Bits uniExtract2ndU8toI16_2x8;
#define TENSOR_ARGMAX_AXIS2_16BITS(src_type_name, dst_type_name,\
src_type, copy_type, axis_type, dst_type, inst_type) \
@ -67,6 +69,56 @@ TENSOR_ARGMAX_AXIS2_16BITS_2D(I16, U8, vxc_short8, vxc_short8, vxc_short8, vxc_
#define TENSOR_ARGMAX_AXIS2_8BITS(src_type_name, dst_type_name, src_type, dst_type) \
__kernel void argmax_axis2_##src_type_name##to##dst_type_name( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axisVal \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \
src_type src; \
src_type maxVal; \
VXC_ReadImage2DArray(maxVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
dst_type axis; \
dst_type packIdx; \
\
_viv_asm(COPY, axis, packedArgIdx, 16); \
_viv_asm(COPY, packIdx, packedArgIdx, 16); \
\
coord.z --; \
do \
{ \
VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
coord.z --; \
packIdx --; \
maxVal = max(maxVal, src); \
src_type condition; \
VXC_Clamp(condition, src, maxVal, maxVal, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \
axis = condition ? packIdx : axis; \
} while (coord.z >= 0); \
\
VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
}
TENSOR_ARGMAX_AXIS2_8BITS(I8, U8, vxc_char16, vxc_uchar16)
TENSOR_ARGMAX_AXIS2_8BITS(U8, U8, vxc_uchar16, vxc_uchar16)
#define TENSOR_ARGMAX_AXIS2_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \
__kernel void argmax_axis2_##src_type_name##to##dst_type_name##_2D( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axisVal \
) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_WriteImage(output, coord, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, I16, vxc_char8, vxc_short8)
TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, U8, vxc_char8, vxc_uchar8)
TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, I16, vxc_uchar8, vxc_short8)
TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, U8, vxc_uchar8, vxc_uchar8)
#define TENSOR_ARGMAX_AXIS2_MIX(src_type_name, dst_type_name, src_type, dst_type) \
__kernel void argmax_axis2_##src_type_name##to##dst_type_name( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axisVal \
) \
@ -95,23 +147,46 @@ __write_only image2d_array_t output, \
\
VXC_WriteImage(output, coord.xy, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
TENSOR_ARGMAX_AXIS2_8BITS(I8, I16, vxc_char8, vxc_short8)
TENSOR_ARGMAX_AXIS2_8BITS(I8, U8, vxc_char8, vxc_uchar8)
TENSOR_ARGMAX_AXIS2_8BITS(U8, I16, vxc_uchar8, vxc_short8)
TENSOR_ARGMAX_AXIS2_8BITS(U8, U8, vxc_uchar8, vxc_uchar8)
TENSOR_ARGMAX_AXIS2_MIX(I8, I16, vxc_char8, vxc_short8)
TENSOR_ARGMAX_AXIS2_MIX(U8, I16, vxc_uchar8, vxc_short8)
#define TENSOR_ARGMAX_AXIS2_8BITS_2D(src_type_name, dst_type_name, src_type, dst_type) \
__kernel void argmax_axis2_##src_type_name##to##dst_type_name##_2D( \
#define TENSOR_ARGMAX_AXIS2_MIX_OPT(src_type_name, dst_type_name, src_type, dst_type) \
__kernel void argmax_axis2_##src_type_name##to##dst_type_name##_opt( \
__read_only image2d_array_t input, \
__write_only image2d_array_t output, \
int axisVal \
) \
{ \
int2 coord = (int2)(get_global_id(0), get_global_id(1)); \
dst_type axis = (dst_type)(0, 0, 0, 0, 0, 0, 0, 0); \
VXC_WriteImage(output, coord, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
int4 coord = (int4)(get_global_id(0), get_global_id(1), argLenSub1, 0); \
src_type src; \
src_type maxVal; \
VXC_ReadImage2DArray(maxVal, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
vxc_uchar16 axis; \
vxc_uchar16 packIdx; \
\
_viv_asm(COPY, axis, packedArgIdx, 16); \
_viv_asm(COPY, packIdx, packedArgIdx, 16); \
\
coord.z --; \
do \
{ \
VXC_ReadImage2DArray(src, input, coord.xyzw, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
coord.z --; \
packIdx --; \
maxVal = max(maxVal, src); \
src_type condition; \
VXC_Clamp(condition, src, maxVal, maxVal, VXC_MODIFIER_CLAMP(0, 15, 0, 1)); \
axis = condition ? packIdx : axis; \
} while (coord.z >= 0); \
vxc_short8 dst0, dst1; \
VXC_DP2x8(dst0, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniExtract1stU8toI16_2x8); \
VXC_DP2x8(dst1, axis, axis, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \
uniExtract2ndU8toI16_2x8); \
\
VXC_WriteImage(output, coord.xy, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
coord.x += 8; \
VXC_WriteImage(output, coord.xy, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
}
TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, I16, vxc_char8, vxc_short8)
TENSOR_ARGMAX_AXIS2_8BITS_2D(I8, U8, vxc_char8, vxc_uchar8)
TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, I16, vxc_uchar8, vxc_short8)
TENSOR_ARGMAX_AXIS2_8BITS_2D(U8, U8, vxc_uchar8, vxc_uchar8)
TENSOR_ARGMAX_AXIS2_MIX_OPT(I8, I16, vxc_char16, vxc_short8)
TENSOR_ARGMAX_AXIS2_MIX_OPT(U8, I16, vxc_uchar16, vxc_short8)
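The axis-2 argmax kernels above walk the channel dimension from plane argLenSub1 down to plane 0, keeping a per-lane running maximum and a packed index that is decremented each step; whenever the current plane reaches the running maximum the index is refreshed, so the scan favors the smaller plane index on ties. The _MIX_OPT variants only add a final widening of the packed U8 indices to I16 via the two new uniExtract*U8toI16_2x8 uniforms. A scalar sketch of the per-element reduction (hypothetical reference, element type fixed to U8 for brevity):

/* Hypothetical scalar reference for the downward axis-2 argmax scan above.
 * src is laid out [depth][height][width]; depth == argLenSub1 + 1. */
static void argmax_axis2_ref(const unsigned char *src, unsigned char *arg,
                             int width, int height, int depth)
{
    for (int y = 0; y < height; ++y)
    {
        for (int x = 0; x < width; ++x)
        {
            int best_val = src[(depth - 1) * height * width + y * width + x];
            int best_idx = depth - 1;
            for (int z = depth - 2; z >= 0; --z)
            {
                int v = src[z * height * width + y * width + x];
                if (v >= best_val)  /* >= so the smaller index wins on ties */
                {
                    best_val = v;
                    best_idx = z;
                }
            }
            arg[y * width + x] = (unsigned char)best_idx;
        }
    }
}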

View File

@ -19,14 +19,13 @@ __kernel void Softmax2VXC
int axis
)
{
int4 coord_in = (int4)(0,0,0,0);
float fMax = 0.0;
for (int i = 0; i < sf_size; i++)
{
vxc_char8 val;
coord_in.x = i;
VXC_ReadImage2DArray(val, input, coord_in, VXC_5BITOFFSET_XY(0,0), VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
float fval;
VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
@ -40,7 +39,7 @@ __kernel void Softmax2VXC
vxc_char8 val;
coord_in.x = i;
VXC_ReadImage2DArray(val, input, coord_in, VXC_5BITOFFSET_XY(0,0), VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
float fval;
VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
@ -57,7 +56,7 @@ __kernel void Softmax2VXC
vxc_short8 val;
vxc_half8 val_h;
coord_in.x = i;
VXC_ReadImage2DArray(val, output, coord_in, VXC_5BITOFFSET_XY(0,0), VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));
float fval;
_viv_asm(COPY, val_h,val, 16);
VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);
@ -68,8 +67,4 @@ __kernel void Softmax2VXC
_viv_asm(COPY,dst,hVal, 4);
VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
}

View File

@ -0,0 +1,353 @@
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
#include "cl_viv_vx_ext.h"
_viv_uniform float4 matrix0;
_viv_uniform float2 matrix1;
_viv_uniform float4 matrix4;
__kernel void custom_warp_affine_nearest_neighbor_U8toU8_2D
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in = convert_int4(coord_f);
vxc_uchar16 dst;
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
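Each kernel in this file produces eight output pixels per invocation: the first pair of source coordinates is evaluated from the affine matrix directly, and the remaining pairs are obtained by adding the precomputed step matrix4, so most of the row needs only incremental adds. The per-pixel mapping itself is the usual 2x3 affine warp; a scalar sketch follows (hypothetical, assuming the host packs matrix0 = (_m0, _m1, _m2, _m3) and matrix1.xy = (_m4, _m5) in the OpenVX column-major convention):

/* Hypothetical scalar reference of the nearest-neighbor mapping done above. */
static unsigned char warp_affine_nn_ref(const unsigned char *src, int src_w, int src_h,
                                        int x, int y, const float m[6])
{
    float sx = m[0] * (float)x + m[2] * (float)y + m[4];
    float sy = m[1] * (float)x + m[3] * (float)y + m[5];
    int ix = (int)sx;  /* truncation toward zero, like convert_int4() in the kernel */
    int iy = (int)sy;
    if (ix < 0 || iy < 0 || ix >= src_w || iy >= src_h)
        return 0;      /* the kernel leaves out-of-range reads to the image border mode */
    return src[iy * src_w + ix];
}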
__kernel void custom_warp_affine_bilinear_U8toU8_2D
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in = convert_int4(coord_f);
vxc_uchar16 src0, src1, dst;
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#endif
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
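The bilinear variant reads a 2x2 neighborhood per output pixel and, on the non-VX_VERSION==1 path, builds the sample from two lerps: first between the two source rows, then between the two resulting columns; the EVIS intrinsics take the full source coordinate and consume only its fractional part. A scalar equivalent of that two-step interpolation (hypothetical reference; fx/fy are the fractional parts, and the final truncation mirrors VXC_RM_TowardZero):

/* Hypothetical scalar bilinear sample, same row-then-column lerp order as above.
 * p00 = (x, y), p10 = (x+1, y), p01 = (x, y+1), p11 = (x+1, y+1). */
static unsigned char bilinear_ref(unsigned char p00, unsigned char p10,
                                  unsigned char p01, unsigned char p11,
                                  float fx, float fy)
{
    float col_x  = (float)p00 + ((float)p01 - (float)p00) * fy;  /* column x, lerped by fy   */
    float col_x1 = (float)p10 + ((float)p11 - (float)p10) * fy;  /* column x+1, lerped by fy */
    float v = col_x + (col_x1 - col_x) * fx;                     /* then lerp across by fx   */
    return (unsigned char)v;                                     /* truncate toward zero     */
}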
__kernel void custom_warp_affine_nearest_neighbor_U8toU8
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in = convert_int4(coord_f);
int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_input.w, baseAddr);
vxc_uchar16 dst;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
__kernel void custom_warp_affine_bilinear_U8toU8
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f = convert_float4(coord_in);
coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;
coord_in = convert_int4(coord_f);
int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_input.w, baseAddr);
vxc_uchar16 src0, src1, dst;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif
coord_f = coord_f.zwzw + matrix4;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#endif
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}

View File

@ -0,0 +1,395 @@
#pragma OPENCL EXTENSION cl_viv_vx_extension : enable
#include "cl_viv_vx_ext.h"
_viv_uniform float4 matrix0;
_viv_uniform float4 matrix1;
_viv_uniform float4 matrix2;
_viv_uniform float4 matrix4;
__kernel void custom_warp_perspective_nearest_neighbor_U8toU8_2D
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5,
float _m6,
float _m7,
float _m8
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f0 = convert_float4(coord_in);
float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;
z0.zw = z0.zw + 2 * matrix1.z;
float4 z1 = z0 + 4 * matrix1.z;
z0 = 1.0f / z0;
z1 = 1.0f / z1;
coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;
float4 coord_f = coord_f0 * z0.xxyy;
coord_in = convert_int4(coord_f);
vxc_uchar16 dst;
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z0.zzww;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z1.xxyy;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z1.zzww;
coord_in = convert_int4(coord_f);
VXC_ReadImage(dst, input, coord_in.xy, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(dst, input, coord_in.zw, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
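The perspective kernels add the homogeneous divide on top of the affine structure: z0 and z1 hold reciprocals of the denominator for four pixel pairs at a time, and matrix4 again steps the numerators across the row. A scalar sketch of the underlying mapping (hypothetical, assuming _m0.._m8 follow the OpenVX column-major 3x3 convention that matrix0/matrix1/matrix2 repack for vector math):

/* Hypothetical scalar reference of the perspective mapping computed above. */
static void warp_perspective_map_ref(float x, float y, const float m[9],
                                     float *sx, float *sy)
{
    float z = m[2] * x + m[5] * y + m[8];   /* denominator; z0/z1 store 1/z in the kernel */
    *sx = (m[0] * x + m[3] * y + m[6]) / z;
    *sy = (m[1] * x + m[4] * y + m[7]) / z;
}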
__kernel void custom_warp_perspective_bilinear_U8toU8_2D
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5,
float _m6,
float _m7,
float _m8
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f0 = convert_float4(coord_in);
float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;
z0.zw = z0.zw + 2 * matrix1.z;
float4 z1 = z0 + 4 * matrix1.z;
z0 = 1.0f / z0;
z1 = 1.0f / z1;
coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;
float4 coord_f = coord_f0 * z0.xxyy;
coord_in = convert_int4(floor(coord_f));
vxc_uchar16 src0, src1, dst;
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z0.zzww;
coord_in = convert_int4(floor(coord_f));
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z1.xxyy;
coord_in = convert_int4(floor(coord_f));
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z1.zzww;
coord_in = convert_int4(floor(coord_f));
VXC_ReadImage(src0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#endif
VXC_ReadImage(src0, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input, coord_in.zw, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#endif
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
#define IMAGE_LOAD_3D(dst, xoffset, yoffset, start, end) \
VXC_OP4(img_load_3d, dst, input, coord_input.xywz, VXC_5BITOFFSET_XY(xoffset, yoffset), \
VXC_MODIFIER(start, end, 0, VXC_RM_TowardZero, 0));
__kernel void custom_warp_perspective_nearest_neighbor_U8toU8
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5,
float _m6,
float _m7,
float _m8
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f0 = convert_float4(coord_in);
float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;
z0.zw = z0.zw + 2 * matrix1.z;
float4 z1 = z0 + 4 * matrix1.z;
z0 = 1.0f / z0;
z1 = 1.0f / z1;
coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;
float4 coord_f = coord_f0 * z0.xxyy;
coord_in = convert_int4(coord_f);
int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_input.w, baseAddr);
vxc_uchar16 dst;
IMAGE_LOAD_3D(dst, 0, 0, 0, 0)
coord_input.xy = coord_in.zw;
IMAGE_LOAD_3D(dst, 0, 0, 1, 1)
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z0.zzww;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
IMAGE_LOAD_3D(dst, 0, 0, 2, 2)
coord_input.xy = coord_in.zw;
IMAGE_LOAD_3D(dst, 0, 0, 3, 3)
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z1.xxyy;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
IMAGE_LOAD_3D(dst, 0, 0, 4, 4)
coord_input.xy = coord_in.zw;
IMAGE_LOAD_3D(dst, 0, 0, 5, 5)
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z1.zzww;
coord_in = convert_int4(coord_f);
coord_input.xy = coord_in.xy;
IMAGE_LOAD_3D(dst, 0, 0, 6, 6)
coord_input.xy = coord_in.zw;
IMAGE_LOAD_3D(dst, 0, 0, 7, 7)
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
__kernel void custom_warp_perspective_bilinear_U8toU8
(
__read_only image2d_array_t input,
__write_only image2d_array_t output,
float _m0,
float _m1,
float _m2,
float _m3,
float _m4,
float _m5,
float _m6,
float _m7,
float _m8
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));
int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));
float4 coord_f0 = convert_float4(coord_in);
float4 z0 = coord_f0.xzxz * matrix1.zzzz + coord_f0.y * matrix1.wwww + matrix2.xxxx;
z0.zw = z0.zw + 2 * matrix1.z;
float4 z1 = z0 + 4 * matrix1.z;
z0 = 1.0f / z0;
z1 = 1.0f / z1;
coord_f0 = coord_f0.xxzz * matrix0.xyxy + coord_f0.yyww * matrix0.zwzw + matrix1.xyxy;
float4 coord_f = coord_f0 * z0.xxyy;
coord_in = convert_int4(floor(coord_f));
int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));
int8 input_desc;
_viv_asm(COPY, input_desc, input, sizeof(input_desc));
int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;
_viv_asm(MOV, coord_input.w, baseAddr);
vxc_uchar16 src0, src1, dst;
IMAGE_LOAD_3D(src0, 0, 0, 0, 1)
IMAGE_LOAD_3D(src1, 0, 1, 0, 1)
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
IMAGE_LOAD_3D(src0, 0, 0, 0, 1)
IMAGE_LOAD_3D(src1, 0, 1, 0, 1)
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
#endif
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z0.zzww;
coord_in = convert_int4(floor(coord_f));
coord_input.xy = coord_in.xy;
IMAGE_LOAD_3D(src0, 0, 0, 0, 1)
IMAGE_LOAD_3D(src1, 0, 1, 0, 1)
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
IMAGE_LOAD_3D(src0, 0, 0, 0, 1)
IMAGE_LOAD_3D(src1, 0, 1, 0, 1)
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
#endif
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z1.xxyy;
coord_in = convert_int4(floor(coord_f));
coord_input.xy = coord_in.xy;
IMAGE_LOAD_3D(src0, 0, 0, 0, 1)
IMAGE_LOAD_3D(src1, 0, 1, 0, 1)
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
IMAGE_LOAD_3D(src0, 0, 0, 0, 1)
IMAGE_LOAD_3D(src1, 0, 1, 0, 1)
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));
#endif
coord_f0 = coord_f0.zwzw + matrix4;
coord_f = coord_f0 * z1.zzww;
coord_in = convert_int4(floor(coord_f));
coord_input.xy = coord_in.xy;
IMAGE_LOAD_3D(src0, 0, 0, 0, 1)
IMAGE_LOAD_3D(src1, 0, 1, 0, 1)
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.x, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));
#endif
coord_input.xy = coord_in.zw;
IMAGE_LOAD_3D(src0, 0, 0, 0, 1)
IMAGE_LOAD_3D(src1, 0, 1, 0, 1)
#if (VX_VERSION==1)
VXC_BiLinear(dst, src0, src1, coord_f.zw, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#else
VXC_Lerp(src0, src0, src1, coord_f.w, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));
src1.s0 = src0.s1;
VXC_Lerp(dst, src0, src1, coord_f.z, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));
#endif
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
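The 3D (image-array) kernels cannot use VXC_ReadImage directly, so IMAGE_LOAD_3D wraps img_load_3d: the image descriptor copied into input_desc supplies the base address (.s0) and the slice pitch (.s4), a per-slice base is computed once as baseAddr and parked in coord_input.w, and every subsequent load only updates the xy coordinate. A sketch of the equivalent pointer arithmetic (hypothetical; pitch names are for illustration only):

/* Hypothetical scalar view of the addressing behind baseAddr / img_load_3d:
 * pick the slice once, then index within it per pixel. */
static const unsigned char *pixel_addr_ref(const unsigned char *image_base,
                                           long slice_pitch, long row_pitch,
                                           int x, int y, int z, int bytes_per_pixel)
{
    const unsigned char *slice = image_base + (long)z * slice_pitch;  /* == baseAddr above */
    return slice + (long)y * row_pitch + (long)x * bytes_per_pixel;
}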

View File

@ -304,4 +304,4 @@ __kernel void depth2space_crd_F16toI16_blk2(
VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_out.x += 8;
VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
}
}

View File

@ -8,6 +8,11 @@ float4 eltwise_unary_sin(float4 x)
return native_sin(x);
}
float4 eltwise_unary_cos(float4 x)
{
return native_cos(x);
}
#define logE (1.44269502f)
#define twoLogE (logE * 2.0f)
float4 eltwise_unary_exp(float4 x)
@ -189,6 +194,17 @@ ELTSISE_UNARY_2D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_u
ELTSISE_UNARY_2D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_2D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
ELTSISE_UNARY_2D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
//COS
ELTSISE_UNARY_2D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_2D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
ELTSISE_UNARY_2D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
ELTSISE_UNARY_2D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
ELTSISE_UNARY_2D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
ELTSISE_UNARY_2D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_2D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
ELTSISE_UNARY_2D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_2D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
ELTSISE_UNARY_2D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
//LOG
ELTSISE_UNARY_2D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_2D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
@ -315,6 +331,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;
ELTSISE_UNARY_BF16_2D(exp)
//SIN
ELTSISE_UNARY_BF16_2D(sin)
//COS
ELTSISE_UNARY_BF16_2D(cos)
//LOG
ELTSISE_UNARY_BF16_2D(log)
//ELU

View File

@ -8,6 +8,11 @@ float4 eltwise_unary_sin(float4 x)
return native_sin(x);
}
float4 eltwise_unary_cos(float4 x)
{
return native_cos(x);
}
#define logE (1.44269502f)
#define twoLogE (logE * 2.0f)
float4 eltwise_unary_exp(float4 x)
@ -189,6 +194,17 @@ ELTSISE_UNARY_3D(sin, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_u
ELTSISE_UNARY_3D(sin, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_3D(sin, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
ELTSISE_UNARY_3D(sin, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
//COS
ELTSISE_UNARY_3D(cos, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_3D(cos, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
ELTSISE_UNARY_3D(cos, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)
ELTSISE_UNARY_3D(cos, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)
ELTSISE_UNARY_3D(cos, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)
ELTSISE_UNARY_3D(cos, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_3D(cos, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)
ELTSISE_UNARY_3D(cos, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_3D(cos, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)
ELTSISE_UNARY_3D(cos, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)
//LOG
ELTSISE_UNARY_3D(log, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)
ELTSISE_UNARY_3D(log, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)
@ -314,6 +330,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;
ELTSISE_UNARY_BF16(exp)
//SIN
ELTSISE_UNARY_BF16(sin)
//COS
ELTSISE_UNARY_BF16(cos)
//LOG
ELTSISE_UNARY_BF16(log)
//ELU
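The only functional change in the two eltwise files is the new eltwise_unary_cos and its 2D/3D/BF16 instantiations; the existing ELTSISE_UNARY_* machinery converts each lane to float, applies the unary function, and converts back to the output type. A scalar view of one quantized cos lane (hypothetical; the scale and zero-point values stand in for the uniforms the macros actually use):

/* Hypothetical scalar equivalent of one U8 lane through the new cos op. */
#include <math.h>

static unsigned char eltwise_cos_u8_ref(unsigned char q,
                                        float in_scale, int in_zp,
                                        float out_scale, int out_zp)
{
    float x = ((int)q - in_zp) * in_scale;                             /* dequantize        */
    float y = cosf(x);                                                 /* eltwise_unary_cos */
    int   r = (int)(y / out_scale + (y >= 0 ? 0.5f : -0.5f)) + out_zp; /* requantize        */
    if (r < 0)   r = 0;
    if (r > 255) r = 255;
    return (unsigned char)r;
}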

View File

@ -91,8 +91,6 @@ __kernel void gather_F16toF16(
int gidz = get_global_id(2); // block_num
int4 coord_in = (int4)(gidy, 0, gidx, 0);
int4 indice = read_imagei(input1, coord_in.xy);
coord_in.w = gidz * axis_num + indice.x;

View File

@ -0,0 +1,237 @@
#include "cl_viv_vx_ext.h"
_viv_uniform int indices_num;
_viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8;
_viv_uniform int batch;
__kernel void gather_batch_I8toI8(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int gidz = get_global_id(2); // block_num
int2 coord_idx = (int2)(gidy, 0);
int4 coord_in = (int4)(gidx, 0, 0, 0);
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);
for(; coord_idx.y < batch;)
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
coord_in.y = gidz * axis_num + indice.x;
vxc_char16 src;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.z++;
VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord.z++;
}
}
__kernel void gather_batch_U8toU8(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int gidz = get_global_id(2); // block_num
int2 coord_idx = (int2)(gidy, 0);
int4 coord_in = (int4)(gidx, 0, 0, 0);
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);
for(; coord_idx.y < batch;)
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
coord_in.y = gidz * axis_num + indice.x;
vxc_uchar16 src;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord_in.z++;
VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
coord.z++;
}
}
__kernel void gather_batch_I16toI16(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int gidz = get_global_id(2); // block_num
int2 coord_idx = (int2)(gidy, 0);
int4 coord_in = (int4)(gidx, 0, 0, 0);
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);
for(; coord_idx.y < batch;)
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
coord_in.y = gidz * axis_num + indice.x;
vxc_short8 src;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.z++;
VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.z++;
}
}
__kernel void gather_batch_F16toF16(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num
)
{
int gidx = get_global_id(0); // block_size
int gidy = get_global_id(1); // indices_num
int gidz = get_global_id(2); // block_num
int2 coord_idx = (int2)(gidy, 0);
int4 coord_in = (int4)(gidx, 0, 0, 0);
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);
for(; coord_idx.y < batch;)
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
coord_in.y = gidz * axis_num + indice.x;
vxc_short8 src;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.z++;
VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.z++;
}
}
__kernel void gather_batch_I8toI8_axis0(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 indices = read_imagei(input1, coord.xz);
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);
vxc_char16 src, dst;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.y;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.z;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.w;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtraCopyDpKeepinEvis_2x8);
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_batch_U8toU8_axis0(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 indices = read_imagei(input1, coord.xz);
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);
vxc_uchar16 src, dst;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.y;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.z;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.w;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtraCopyDpKeepinEvis_2x8);
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_batch_I16toI16_axis0(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 indices = read_imagei(input1, coord.xz);
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);
vxc_short8 src, dst;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.y;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.z;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.w;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtraCopyDpKeepinEvis_2x8);
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
__kernel void gather_batch_F16toF16_axis0(
__read_only image2d_array_t input0,
__read_only image2d_t input1,
__write_only image2d_array_t output,
int block_size,
int block_num,
int axis_num
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 indices = read_imagei(input1, coord.xz);
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);
vxc_short8 src, dst;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.y;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.z;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.w;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniExtraCopyDpKeepinEvis_2x8);
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
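Each gather_batch_* kernel above loops over the batch dimension: for batch b it reads one index from input1 at (gidy, b), selects row gidz * axis_num + index of that batch in input0, and copies block_size elements to output row gidz * indices_num + gidy of the same batch; the *_axis0 variants instead consume four indices at once and gather along x. A scalar reference of the batched addressing (hypothetical; element type abstracted to int):

/* Hypothetical scalar reference of the batched gather addressing above.
 * in  : [batch][block_num * axis_num][block_size]
 * idx : [batch][indices_num]
 * out : [batch][block_num * indices_num][block_size] */
static void gather_batch_ref(const int *in, const int *idx, int *out,
                             int block_size, int block_num, int axis_num,
                             int indices_num, int batch)
{
    for (int b = 0; b < batch; ++b)
        for (int blk = 0; blk < block_num; ++blk)
            for (int g = 0; g < indices_num; ++g)
            {
                int src_row = blk * axis_num + idx[b * indices_num + g];
                int dst_row = blk * indices_num + g;
                for (int x = 0; x < block_size; ++x)
                    out[(b * block_num * indices_num + dst_row) * block_size + x] =
                        in[(b * block_num * axis_num + src_row) * block_size + x];
            }
}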

View File

@ -0,0 +1,236 @@
#include "cl_viv_vx_ext.h"
_viv_uniform int indices_num;
_viv_uniform int batch;
_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp
_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;
_viv_uniform VXC_512Bits uniConvertFp16toU8_2x8;
_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp
#define GATHER_BATCH_8BITS_TO_F16(src0_type_name, read_type) \
__kernel void gather_batch_##src0_type_name##toF16( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int block_num, \
int axis_num \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
\
int2 coord_idx = (int2)(gidy, 0); \
int4 coord_in = (int4)(gidx, 0, 0, 0); \
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
\
for(; coord_idx.y < batch;) \
{ \
int4 indice = read_imagei(input1, coord_idx); \
coord_idx.y++; \
coord_in.y = gidz * axis_num + indice.x; \
\
read_type src; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \
coord_in.z++; \
vxc_half8 src0, src1; \
vxc_short8 dst0, dst1; \
\
VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \
VXC_DP2x8(src1,src,ms0, VXC_MODIFIER(0,7,8, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \
_viv_asm(COPY, dst0, src0, 16); \
_viv_asm(COPY, dst1, src1, 16); \
VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
coord.x += 8; \
VXC_WriteImage2DArray(output, coord, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
coord.z++; \
coord.x = gidx; \
} \
}
GATHER_BATCH_8BITS_TO_F16(U8, vxc_uchar16)
GATHER_BATCH_8BITS_TO_F16(I8, vxc_char16)
#define GATHER_BATCH_F16_TO_QINT(src1_type_name, write_type) \
__kernel void gather_batch_F16to##src1_type_name( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int block_num, \
int axis_num \
) \
{ \
int gidx = get_global_id(0); \
int gidy = get_global_id(1); \
int gidz = get_global_id(2); \
\
int2 coord_idx = (int2)(gidy, 0); \
int4 coord_in = (int4)(gidx, 0, 0, 0); \
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0); \
vxc_ushort8 mp1; \
_viv_asm(COPY, mp1, multAndoutZP1, 16); \
for(; coord_idx.y < batch;) \
{ \
int4 indice = read_imagei(input1, coord_idx); \
coord_idx.y++; \
coord_in.y = gidz * axis_num + indice.x; \
\
vxc_short8 src; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
coord_in.z++; \
\
vxc_half8 data; \
write_type dst; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
coord.z++; \
} \
}
GATHER_BATCH_F16_TO_QINT(U8, vxc_uchar16)
GATHER_BATCH_F16_TO_QINT(I8, vxc_char16)
GATHER_BATCH_F16_TO_QINT(I16, vxc_short8)
__kernel void gather_batch_I16toF16(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
int axis_num
)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int gidz = get_global_id(2);
int2 coord_idx = (int2)(gidy, 0);
int4 coord_in = (int4)(gidx, 0, 0, 0);
int4 coord = (int4)(gidx, gidz * indices_num + gidy, 0, 0);
vxc_half8 src0;
vxc_short8 dst0;
vxc_ushort8 ms0;
_viv_asm(COPY, ms0, multAndoutZP0, 16);
for(; coord_idx.y < batch;)
{
int4 indice = read_imagei(input1, coord_idx);
coord_idx.y++;
coord_in.y = gidz * axis_num + indice.x;
vxc_short8 src;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord_in.z++;
VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniU8MulAndPostShift_0_Lo_2x8);
_viv_asm(COPY, dst0, src0, 16);
VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.z++;
}
}
#define GATHER_BATCH_8BITS_TO_F16_AXIS0(src0_type_name, read_type) \
__kernel void gather_batch_##src0_type_name##toF16_axis0( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int block_num, \
int axis_num \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
int4 indices = read_imagei(input1, coord.xz); \
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \
\
read_type src; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = indices.y; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = indices.z; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = indices.w; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
\
vxc_half8 src0; \
vxc_short8 dst0; \
vxc_ushort8 ms0; \
_viv_asm(COPY, ms0, multAndoutZP0, 16); \
VXC_DP2x8(src0,src,ms0, VXC_MODIFIER(0,7,0, VXC_RM_TowardZero,1), uniU8MulAndPostShift_0_Lo_2x8); \
_viv_asm(COPY, dst0, src0, 16); \
VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
GATHER_BATCH_8BITS_TO_F16_AXIS0(U8, vxc_uchar16)
GATHER_BATCH_8BITS_TO_F16_AXIS0(I8, vxc_char16)
#define GATHER_BATCH_F16_TO_QINT_AXIS0(src1_type_name, write_type) \
__kernel void gather_batch_F16to##src1_type_name##_axis0( \
__read_only image2d_t input0, \
__read_only image2d_t input1, \
__write_only image2d_t output, \
int block_size, \
int block_num, \
int axis_num \
) \
{ \
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \
int4 indices = read_imagei(input1, coord.xz); \
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0); \
\
vxc_short8 src; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = indices.y; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = indices.z; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \
coord_in.x = indices.w; \
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \
\
vxc_ushort8 mp1; \
_viv_asm(COPY, mp1, multAndoutZP1, 16); \
vxc_half8 data; \
write_type dst; \
_viv_asm(COPY, data, src, 16); \
VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \
}
GATHER_BATCH_F16_TO_QINT_AXIS0(U8, vxc_uchar16)
GATHER_BATCH_F16_TO_QINT_AXIS0(I8, vxc_char16)
GATHER_BATCH_F16_TO_QINT_AXIS0(I16, vxc_short8)
__kernel void gather_batch_I16toF16_axis0(
__read_only image2d_t input0,
__read_only image2d_t input1,
__write_only image2d_t output,
int block_size,
int block_num,
int axis_num
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 indices = read_imagei(input1, coord.xz);
int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);
vxc_short8 src, dst;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.y;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.z;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));
coord_in.x = indices.w;
VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));
vxc_half8 src0;
vxc_short8 dst0;
vxc_ushort8 ms0;
_viv_asm(COPY, ms0, multAndoutZP0, 16);
VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\
uniU8MulAndPostShift_0_Lo_2x8);
_viv_asm(COPY, dst0, src0, 16);
VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}

View File

@ -1,5 +1,7 @@
#include "cl_viv_vx_ext.h"
_viv_uniform VXC_512Bits uniConvertInt16toInt8_2x8;
#define TENSORLOGICAL_PROCESS(input_type, copy_type, output_type, out_copy_type,\
lgc_op, lgc_op2, read_fun, write_fun) \
input_type vA;\
@ -59,7 +61,7 @@ out_copy_type, lgc_op, lgc_op2, read_fun, write_fun) \
VXC_DP2x8(tmpOut,dst,dst,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero, 0),uniMulShortMinus1toFp16_2x8); \
out_copy_type data; \
_viv_asm(COPY, data, tmpOut, 16); \
write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \
write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
#define TENSORLOGICAL_FP(name0, src_type_name, dst_type_name, input_type,\
@ -86,6 +88,47 @@ copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \
VXC_ReadImage, VXC_WriteImage) \
}
#define TENSORLOGICAL_BFP_PROCESS(input_type, copy_type, output_type,\
out_copy_type, lgc_op, lgc_op2, read_fun, write_fun) \
input_type vA;\
copy_type src0;\
input_type vB;\
copy_type src1;\
read_fun(vA,in0,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\
_viv_asm(COPY, src0, vA, 16); \
read_fun(vB,in1,coord,VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,0));\
_viv_asm(COPY, src1, vB, 16); \
output_type dst; \
dst = (lgc_op2(src0))lgc_op(lgc_op2(src1)); \
vxc_char8 data; \
VXC_DP2x8(data,dst,dst,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero, 1),uniConvertInt16toInt8_2x8); \
data &= 1; \
write_fun(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
#define TENSORLOGICAL_BFP16(name0, src_type_name, dst_type_name, input_type,\
copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \
__kernel void logical_##name0##_##src_type_name##to##dst_type_name( \
__read_only image2d_array_t in0, \
__read_only image2d_array_t in1, \
__write_only image2d_array_t output) \
{\
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\
TENSORLOGICAL_BFP_PROCESS(input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2,\
VXC_ReadImage2DArray, VXC_WriteImage2DArray) \
}
#define TENSORLOGICAL_BFP16_2D(name0, src_type_name, dst_type_name, input_type,\
copy_type, output_type, out_copy_type, lgc_op, lgc_op2) \
__kernel void logical_##name0##_##src_type_name##to##dst_type_name##_2D( \
__read_only image2d_array_t in0, \
__read_only image2d_array_t in1, \
__write_only image2d_array_t output) \
{\
int2 coord = (int2)(get_global_id(0), get_global_id(1));\
TENSORLOGICAL_BFP_PROCESS(input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2,\
VXC_ReadImage, VXC_WriteImage) \
}
// name0, src_name, dst_name, input_type, copy_type, output_type, out_copy_type, lgc_op, lgc_op2
TENSORLOGICAL(or, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ||, )
//TENSORLOGICAL(or, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ||, )
@ -100,6 +143,10 @@ TENSORLOGICAL(xor, I8, I8, vxc_char8, vxc_char8, vxc_char8, vx
//TENSORLOGICAL(xor, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)
//TENSORLOGICAL_FP(xor, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)
TENSORLOGICAL_BFP16(or, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, )
TENSORLOGICAL_BFP16(and, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, )
TENSORLOGICAL_BFP16(xor, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)
TENSORLOGICAL_2D(or, I8, I8, vxc_char8, vxc_char8, vxc_char8, vxc_char8, ||, )
//TENSORLOGICAL_2D(or, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ||, )
//TENSORLOGICAL_2D(or, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, )
@@ -112,3 +159,7 @@ TENSORLOGICAL_2D(xor, I8, I8, vxc_char8, vxc_char8, vxc_char8,
//TENSORLOGICAL_2D(xor, U8, U8, vxc_uchar8, vxc_uchar8, vxc_char8, vxc_uchar8, ^, !!)
//TENSORLOGICAL_2D(xor, I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)
//TENSORLOGICAL_FP_2D(xor, F16, F16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)
TENSORLOGICAL_BFP16_2D(or, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ||, )
TENSORLOGICAL_BFP16_2D(and, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, &&, )
TENSORLOGICAL_BFP16_2D(xor, BF16, I8, vxc_short8, vxc_short8, vxc_short8, vxc_short8, ^, !!)

@@ -0,0 +1,272 @@
#include "cl_viv_vx_ext.h"
_viv_uniform int ac2zero;
_viv_uniform int bc2zero;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
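// BF16 GEMM: each work-item accumulates a 4x4 tile of the output in float32.
// uniConvBF16toF32_Part0/Part1_2x8 widen the packed bfloat16 shorts to float,
// and uniExtractOddData_2x8 packs the float32 results back to BF16.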
__kernel void gemm_BF16BF16toBF16(image2d_array_t inputA,
image2d_array_t inputB, image2d_array_t output,
int transposeA, int transposeB,
int adjointA, int adjointB, uint M, uint K, uint N)
{
uint gidy = get_global_id(1);
int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0);
int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0);
vxc_ushort8 valC0, valC1, src0, src1;
vxc_ushort8 srcA0, srcB0, srcA1, srcB1, outC;
vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);
vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);
int8 inputA_desc, inputB_desc, output_desc;
_viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));
int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;
_viv_asm(MOV, coord_a.w, baseAddr_a);
_viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));
int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;
_viv_asm(MOV, coord_b.w, baseAddr_b);
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
for(coord_a.x = 0, coord_b.y = 0; coord_a.x < K;)
{
vxc_float4 tempA0, tempA1, tempA2, tempA3;
vxc_float4 tempB0, tempB1, tempB2, tempB3;
VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0));
coord_a.x += 4;
coord_b.y += 4;
VXC_DP2x8(src0, srcA0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempA0, src0, 16);
VXC_DP2x8(src1, srcA0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, tempA1, src1, 16);
VXC_DP2x8(src0, srcA1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempA2, src0, 16);
VXC_DP2x8(src1, srcA1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, tempA3, src1, 16);
VXC_DP2x8(src0, srcB0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempB0, src0, 16);
VXC_DP2x8(src1, srcB0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, tempB1, src1, 16);
VXC_DP2x8(src0, srcB1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempB2, src0, 16);
VXC_DP2x8(src1, srcB1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, tempB3, src1, 16);
sum0 += (tempA0.x * tempB0 + tempA0.y * tempB1 + tempA0.z * tempB2 + tempA0.w * tempB3);
sum1 += (tempA1.x * tempB0 + tempA1.y * tempB1 + tempA1.z * tempB2 + tempA1.w * tempB3);
sum2 += (tempA2.x * tempB0 + tempA2.y * tempB1 + tempA2.z * tempB2 + tempA2.w * tempB3);
sum3 += (tempA3.x * tempB0 + tempA3.y * tempB1 + tempA3.z * tempB2 + tempA3.w * tempB3);
}
coord_b.y = gidy;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_b.w, baseAddr);
_viv_asm(COPY, valC0, sum0, 16);
_viv_asm(COPY, valC1, sum1, 16);
VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_b.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_b.y++;
_viv_asm(COPY, valC0, sum2, 16);
_viv_asm(COPY, valC1, sum3, 16);
VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_b.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
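// Variant with A transposed: one row of A and one row of B are loaded per
// iteration, and the 4x4 output tile is built from rank-1 updates.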
__kernel void gemm_transa_BF16BF16toBF16(
image2d_array_t inputA,
image2d_array_t inputB,
image2d_array_t output,
int transposeA,
int transposeB,
int adjointA,
int adjointB,
uint M, uint K, uint N)
{
uint gidy = get_global_id(1);
vxc_ushort8 valC0, valC1;
vxc_ushort8 srcA, srcB, outC, src0, src1;
int4 coord_a = (int4)(gidy, 0, (ac2zero ? 0 : get_global_id(2)), 0);
int4 coord_b = (int4)(get_global_id(0), 0, (bc2zero ? 0 : get_global_id(2)), 0);
vxc_float4 sum0 = (vxc_float4)(0);
vxc_float4 sum1 = (vxc_float4)(0);
vxc_float4 sum2 = (vxc_float4)(0);
vxc_float4 sum3 = (vxc_float4)(0);
int8 inputA_desc, inputB_desc, output_desc;
_viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));
int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;
_viv_asm(MOV, coord_a.w, baseAddr_a);
_viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));
int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;
_viv_asm(MOV, coord_b.w, baseAddr_b);
vxc_float4 tempA0;
vxc_float4 tempB0;
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
for(coord_a.y = 0, coord_b.y = 0; coord_a.y < K;)
{
VXC_OP4(img_load_3d, srcA, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcB, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
coord_a.y++;
coord_b.y++;
VXC_DP2x8(src0, srcA, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempA0, src0, 16);
VXC_DP2x8(src1, srcB, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempB0, src1, 16);
sum0 = (sum0 + tempA0.x * tempB0);
sum1 = (sum1 + tempA0.y * tempB0);
sum2 = (sum2 + tempA0.z * tempB0);
sum3 = (sum3 + tempA0.w * tempB0);
}
coord_b.y = gidy;
_viv_asm(COPY, output_desc, output, sizeof(output_desc));
int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;
_viv_asm(MOV, coord_b.w, baseAddr);
_viv_asm(COPY, valC0, sum0, 16);
_viv_asm(COPY, valC1, sum1, 16);
VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_b.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_b.y++;
_viv_asm(COPY, valC0, sum2, 16);
_viv_asm(COPY, valC1, sum3, 16);
VXC_DP2x8(outC, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s0123, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
coord_b.y++;
VXC_OP4_NoDest(img_store_3d, output, coord_b.xywz, outC.s4567, \
VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
}
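// Variant with B transposed: rows of B are read directly, so each output
// element is the dot product of an A row with a B row.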
__kernel void gemm_transb_BF16BF16toBF16(image2d_array_t inputA,
image2d_array_t inputB,
image2d_array_t output,
int transposeA,
int transposeB,
int adjointA,
int adjointB,
uint M, uint K, uint N)
{
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);
int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0);
vxc_float4 sum0 = (vxc_float4)(0);
vxc_float4 sum1 = (vxc_float4)(0);
vxc_float4 sum2 = (vxc_float4)(0);
vxc_float4 sum3 = (vxc_float4)(0);
int8 inputA_desc, inputB_desc, output_desc;
_viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));
int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;
_viv_asm(MOV, coord_a.w, baseAddr_a);
_viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));
int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;
_viv_asm(MOV, coord_b.w, baseAddr_b);
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_ushort8 src0, src1;
for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;)
{
vxc_ushort8 srcA0,srcA1,srcA2,srcA3;
vxc_ushort8 srcB0,srcB1,srcB2,srcB3;
vxc_float4 tempA0, tempA1, tempA2, tempA3;
vxc_float4 tempB0, tempB1, tempB2, tempB3;
VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
coord_a.x += 4;
coord_b.x += 4;
VXC_DP2x8(src0, srcA0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempA0, src0, 16);
VXC_DP2x8(src1, srcA1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempA1, src1, 16);
VXC_DP2x8(src0, srcA2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempA2, src0, 16);
VXC_DP2x8(src1, srcA3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempA3, src1, 16);
VXC_DP2x8(src0, srcB0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempB0, src0, 16);
VXC_DP2x8(src1, srcB1, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempB1, src1, 16);
VXC_DP2x8(src0, srcB2, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempB2, src0, 16);
VXC_DP2x8(src1, srcB3, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, tempB3, src1, 16);
sum0 += (float4)(dot(tempA0, tempB0), dot(tempA0, tempB1), dot(tempA0, tempB2), dot(tempA0, tempB3));
sum1 += (float4)(dot(tempA1, tempB0), dot(tempA1, tempB1), dot(tempA1, tempB2), dot(tempA1, tempB3));
sum2 += (float4)(dot(tempA2, tempB0), dot(tempA2, tempB1), dot(tempA2, tempB2), dot(tempA2, tempB3));
sum3 += (float4)(dot(tempA3, tempB0), dot(tempA3, tempB1), dot(tempA3, tempB2), dot(tempA3, tempB3));
}
vxc_ushort8 valC0, valC1, valDst;
_viv_asm(COPY, valC0, sum0, 16);
_viv_asm(COPY, valC1, sum1, 16);
VXC_DP2x8(valDst, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
_viv_asm(COPY, valC0, sum2, 16);
_viv_asm(COPY, valC1, sum3, 16);
VXC_DP2x8(valDst, valC0, valC1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
coord_out.y++;
VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}

@@ -11,6 +11,9 @@ _viv_uniform int ac2zero;
_viv_uniform int bc2zero;
_viv_uniform VXC_512Bits uniGemmU8F16toF32Lo_4x4b;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
#if (VX_VERSION==2)
__kernel void gemm_F16F16toF16(image2d_array_t inputA,
@@ -192,14 +195,9 @@ __kernel void gemm_F16F16toF16(image2d_array_t inputA,
}
#endif
__kernel void gemm_F32F32toF32(image2d_array_t inputA,
image2d_array_t inputB,
image2d_array_t output,
int transposeA,
int transposeB,
int adjointA,
int adjointB,
uint M, uint K, uint N)
__kernel void gemm_F32F32toF32(
image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output,
int transposeA, int transposeB, int adjointA, int adjointB, uint M, uint K, uint N)
{
uint gidx = get_global_id(0);
uint gidy = get_global_id(1);
@@ -207,10 +205,8 @@ __kernel void gemm_F32F32toF32(image2d_array_t inputA,
int4 coord_a = (int4)(0, gidy, (ac2zero ? 0 : get_global_id(2)), 0);
int4 coord_b = (int4)(gidx, 0, (bc2zero ? 0 : get_global_id(2)), 0);
vxc_float4 sum0 = (vxc_float4)(0);
vxc_float4 sum1 = (vxc_float4)(0);
vxc_float4 sum2 = (vxc_float4)(0);
vxc_float4 sum3 = (vxc_float4)(0);
vxc_float4 sum0 = (vxc_float4)(0), sum1 = (vxc_float4)(0);
vxc_float4 sum2 = (vxc_float4)(0), sum3 = (vxc_float4)(0);
vxc_int4 tmpOut0, tmpOut1;
vxc_uchar16 outC;
@@ -224,7 +220,6 @@ __kernel void gemm_F32F32toF32(image2d_array_t inputA,
coord_a.x = i;
coord_a.y = gidy;
coord_b.x = gidx;
coord_b.y = i;
@@ -257,4 +252,4 @@ __kernel void gemm_F32F32toF32(image2d_array_t inputA,
write_imagef(output, coord_b, sum2);
coord_b.y++;
write_imagef(output, coord_b, sum3);
}
}

@@ -222,6 +222,62 @@ __kernel void maximum_U8U8toU8_2D
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
}
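// U8 + U8 -> I16: both inputs are rescaled to the output quantization with
// uniU8MulAndPostShift{0,1}_Lo_2x8 before the element-wise max is taken.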
__kernel void maximum_U8U8toI16
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_uchar16 src0, src1;
VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_short8 dst0, dst1, dst;
vxc_ushort8 mp0, mp1;
_viv_asm(COPY, mp0, multAndoutZP0, 16);
_viv_asm(COPY, mp1, multAndoutZP1, 16);
VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift0_Lo_2x8);
VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift1_Lo_2x8);
dst = max(dst0, dst1);
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
__kernel void maximum_U8U8toI16_2D
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
vxc_uchar16 src0, src1;
VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_short8 dst0, dst1, dst;
vxc_ushort8 mp0, mp1;
_viv_asm(COPY, mp0, multAndoutZP0, 16);
_viv_asm(COPY, mp1, multAndoutZP1, 16);
VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift0_Lo_2x8);
VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift1_Lo_2x8);
dst = max(dst0, dst1);
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8;
_viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8;
__kernel void maximum_I16I16toI16

@@ -170,4 +170,64 @@ __kernel void maximum_F16F16toI16_2D
tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;
_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;
_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [32:63] output zp
_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [32:63] output zp
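// I16 + I16 -> U8: each input is requantized with its own multiplier and
// output zero point, then the element-wise max is written as uint8.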
__kernel void maximum_I16I16toU8
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_short8 src0, src1;
VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 dst0, dst1, dst;
vxc_ushort8 mp0, mp1;
_viv_asm(COPY, mp0, multAndoutZP0, 16);
_viv_asm(COPY, mp1, multAndoutZP1, 16);
VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift0_Lo_2x8);
VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift1_Lo_2x8);
dst = max(dst0, dst1);
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
__kernel void maximum_I16I16toU8_2D
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
vxc_short8 src0, src1;
VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 dst0, dst1, dst;
vxc_ushort8 mp0, mp1;
_viv_asm(COPY, mp0, multAndoutZP0, 16);
_viv_asm(COPY, mp1, multAndoutZP1, 16);
VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift0_Lo_2x8);
VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift1_Lo_2x8);
dst = max(dst0, dst1);
VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}

@@ -224,6 +224,62 @@ __kernel void minimum_U8U8toU8_2D
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));
}
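// U8 + U8 -> I16: both inputs are rescaled to the output quantization with
// uniU8MulAndPostShift{0,1}_Lo_2x8 before the element-wise min is taken.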
__kernel void minimum_U8U8toI16
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_uchar16 src0, src1;
VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_short8 dst0, dst1, dst;
vxc_ushort8 mp0, mp1;
_viv_asm(COPY, mp0, multAndoutZP0, 16);
_viv_asm(COPY, mp1, multAndoutZP1, 16);
VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift0_Lo_2x8);
VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift1_Lo_2x8);
dst = min(dst0, dst1);
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
__kernel void minimum_U8U8toI16_2D
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output
)
{
int2 coord = (int2)(get_global_id(0), get_global_id(1));
vxc_uchar16 src0, src1;
VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));
vxc_short8 dst0, dst1, dst;
vxc_ushort8 mp0, mp1;
_viv_asm(COPY, mp0, multAndoutZP0, 16);
_viv_asm(COPY, mp1, multAndoutZP1, 16);
VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift0_Lo_2x8);
VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift1_Lo_2x8);
dst = min(dst0, dst1);
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform VXC_512Bits uniConvertI16toI16_0_2x8;
_viv_uniform VXC_512Bits uniConvertI16toI16_1_2x8;
__kernel void minimum_I16I16toI16

@@ -173,5 +173,65 @@ __kernel void minimum_F16F16toI16_2D
tmpDst1 = convert_int4_rte(tmpData1 * outputScale + output_zp);
VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);
VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
_viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;
_viv_uniform VXC_512Bits uniU8MulAndPostShift1_Lo_2x8;
_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [32:63] output zp
_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [32:63] output zp
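// I16 + I16 -> U8: each input is requantized with its own multiplier and
// output zero point, then the element-wise min is written as uint8.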
__kernel void minimum_I16I16toU8
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);
vxc_short8 src0, src1;
VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage2DArray(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 dst0, dst1, dst;
vxc_ushort8 mp0, mp1;
_viv_asm(COPY, mp0, multAndoutZP0, 16);
_viv_asm(COPY, mp1, multAndoutZP1, 16);
VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift0_Lo_2x8);
VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift1_Lo_2x8);
dst = min(dst0, dst1);
VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}
__kernel void minimum_I16I16toU8_2D
(
__read_only image2d_array_t input0,
__read_only image2d_array_t input1,
__write_only image2d_array_t output
)
{
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));
vxc_short8 src0, src1;
VXC_ReadImage(src0, input0, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
VXC_ReadImage(src1, input1, coord.xy, VXC_5BITOFFSET_XY(0, 0),\
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
vxc_uchar16 dst0, dst1, dst;
vxc_ushort8 mp0, mp1;
_viv_asm(COPY, mp0, multAndoutZP0, 16);
_viv_asm(COPY, mp1, multAndoutZP1, 16);
VXC_DP2x8(dst0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift0_Lo_2x8);
VXC_DP2x8(dst1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1),\
uniU8MulAndPostShift1_Lo_2x8);
dst = min(dst0, dst1);
VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));
}

@@ -17,6 +17,9 @@ _viv_uniform float e2InScale;
_viv_uniform float rowSumScale;
_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;
_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
#define MOMENTS_AXIS0_QINT(src0_type_name, read0_type) \
__kernel void moments_axis0_##src0_type_name##toF16( \
@@ -262,6 +265,88 @@ __kernel void moments_axis0_I16toF16_2D(
VXC_DP2x8(tmpVal, tmpData, tmpData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);
_viv_asm(COPY, dst, tmpVal, 16);
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
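// BF16 moments along axis 0: sum and sum-of-squares are accumulated in
// float32; mean = sum * dimRatio, variance = E[x^2] - mean^2, and the pair is
// packed back to BF16 with uniExtractOddData_2x8.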
__kernel void moments_axis0_BF16toBF16(
image2d_array_t input,
image2d_t output_mean,
image2d_t output_vari,
int axis, int axis_num)
{
int gidy = get_global_id(0);
int gidz = get_global_id(1);
int4 coord = (int4)(0, gidy, gidz, 0);
vxc_ushort8 src0, src1;
vxc_ushort8 val;
vxc_float4 mean_vari0 = (vxc_float4)(0);
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f);
for(coord.x = 0; coord.x < width; coord.x += 8)
{
VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
float4 vec0, vec1;
VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, vec0, src0, 16);
VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, vec1, src1, 16);
mean_vari0.x += dot(vec0, one) + dot(vec1, one);
mean_vari0.y += dot(vec0, vec0) + dot(vec1, vec1);
}
mean_vari0 *= dimRatio;
mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0;
int2 coord_out = (int2)(gidy, gidz);
vxc_short8 dst;
_viv_asm(COPY, src0, mean_vari0, 16);
VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
__kernel void moments_axis0_BF16toBF16_2D(
image2d_t input,
image2d_t output_mean,
image2d_t output_vari,
int axis, int axis_num)
{
int gidy = get_global_id(0);
int2 coord = (int2)(0, gidy);
vxc_ushort8 src0, src1;
vxc_ushort8 val;
vxc_float4 mean_vari0 = (vxc_float4)(0);
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f);
for(coord.x = 0; coord.x < width; coord.x += 8)
{
VXC_ReadImage(val, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
float4 vec0, vec1;
VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, vec0, src0, 16);
VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, vec1, src1, 16);
mean_vari0.x += dot(vec0, one) + dot(vec1, one);
mean_vari0.y += dot(vec0, vec0) + dot(vec1, vec1);
}
mean_vari0 *= dimRatio;
mean_vari0.s1 = mean_vari0.s1 - mean_vari0.s0 * mean_vari0.s0;
int2 coord_out = (int2)(gidy, 0);
vxc_short8 dst;
_viv_asm(COPY, src0, mean_vari0, 16);
VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}

@@ -18,6 +18,9 @@ _viv_uniform float e2InScale;
_viv_uniform float rowSumScale;
_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;
_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
#define MOMENTS_AXIS012_QINT(src0_type_name, read0_type) \
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_##src0_type_name##toF16( \
@@ -236,4 +239,79 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_I1
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
}
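// BF16 moments over axes 0/1/2: each work-item accumulates partial sums, the
// 16-item work-group reduces them through local memory, and local id 0 writes
// the final mean/variance pair.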
__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void moments_axis012_BF16toBF16(
image2d_array_t input,
image2d_t output_mean,
image2d_t output_vari,
int axis,
int axis_num)
{
int gidx = get_global_id(0) << 3;
int lidx = get_local_id(0);
int4 coord = (int4)(gidx, 0, 0, 0);
vxc_float4 sumsqr;
__local float lcl_sum[16];
__local float lcl_sqr[16];
float tmpSum = 0;
float tmpSqr = 0;
vxc_ushort8 src0, src1;
vxc_ushort8 val;
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
vxc_float4 one = (vxc_float4)(1.0f, 1.0f, 1.0f, 1.0f);
for(coord.z = 0; coord.z < channel; coord.z++)
{
for(coord.x = gidx; coord.x < width; coord.x += 128)
{
for(coord.y = 0; coord.y < height;)
{
VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
coord.y++;
float4 vec0, vec1;
VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, vec0, src0, 16);
VXC_DP2x8(src1, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part1_2x8);
_viv_asm(COPY, vec1, src1, 16);
tmpSum += dot(vec0, one) + dot(vec1, one);
tmpSqr += dot(vec0, vec0) + dot(vec1, vec1);
}
}
}
lcl_sum[lidx] = tmpSum;
lcl_sqr[lidx] = tmpSqr;
barrier(CLK_LOCAL_MEM_FENCE);
int2 coord_out = (int2)(0, 0);
if(lidx == 0)
{
__local float4* tmp_sum = (__local float4*)lcl_sum;
__local float4* tmp_sqr = (__local float4*)lcl_sqr;
float sum = (float)(0);
float sqr = (float)(0);
for(int i = 0; i < 4; i++)
{
sum += dot(tmp_sum[i], one);
sqr += dot(tmp_sqr[i], one);
}
float4 mean_vari;
mean_vari.x = sum * dimRatio;
mean_vari.y = sqr * dimRatio;
mean_vari.y = mean_vari.y - mean_vari.x * mean_vari.x;
vxc_short8 dst;
_viv_asm(COPY, src0, mean_vari, 16);
VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s1023, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));
}
}

@@ -10,6 +10,8 @@ _viv_uniform float e2InScale;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
#define MOMENTS_AXIS1_QINT(src0_type_name, read0_type) \
__kernel void moments_axis1_##src0_type_name##toF16( \
@@ -197,3 +199,85 @@ __kernel void moments_axis1_F16toF16_2D(
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
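// BF16 moments along axis 1: four columns are accumulated over the rows
// (height), then mean and variance are packed back to BF16.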
__kernel void moments_axis1_BF16toBF16(
image2d_array_t input,
image2d_t output_mean,
image2d_t output_vari,
int axis, int axis_num)
{
int gidx = get_global_id(0);
int gidz = get_global_id(1);
int4 coord = (int4)(gidx, 0, gidz, 0);
vxc_ushort8 src0;
vxc_ushort8 val;
vxc_float4 sum = (vxc_float4)(0);
vxc_float4 sqr = (vxc_float4)(0);
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
float4 vec0;
VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, vec0, src0, 16);
sum += vec0;
sqr += (vec0 * vec0);
}
vxc_float4 mean = sum * dimRatio;
vxc_float4 vari = sqr * dimRatio;
vari = vari - mean * mean;
int2 coord_out = (int2)(gidx, gidz);
vxc_short8 tmpdst0, tmpdst1, dst;
_viv_asm(COPY, tmpdst0, mean, 16);
_viv_asm(COPY, tmpdst1, vari, 16);
VXC_DP2x8(dst, tmpdst0, tmpdst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
__kernel void moments_axis1_BF16toBF16_2D(
image2d_t input,
image2d_t output_mean,
image2d_t output_vari,
int axis, int axis_num)
{
int gidx = get_global_id(0);
int2 coord = (int2)(gidx, 0);
vxc_ushort8 src0;
vxc_ushort8 val;
vxc_float4 sum = (vxc_float4)(0);
vxc_float4 sqr = (vxc_float4)(0);
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
for(coord.y = 0; coord.y < height; coord.y++)
{
VXC_ReadImage(val, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
float4 vec0;
VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, vec0, src0, 16);
sum += vec0;
sqr += (vec0 * vec0);
}
vxc_float4 mean = sum * dimRatio;
vxc_float4 vari = sqr * dimRatio;
vari = vari - mean * mean;
int2 coord_out = (int2)(gidx, 0);
vxc_short8 tmpdst0, tmpdst1, dst;
_viv_asm(COPY, tmpdst0, mean, 16);
_viv_asm(COPY, tmpdst1, vari, 16);
VXC_DP2x8(dst, tmpdst0, tmpdst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}

@@ -9,6 +9,8 @@ _viv_uniform float e2InScale;
_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;
_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;
_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;
_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;
_viv_uniform VXC_512Bits uniExtractOddData_2x8;
#define MOMENTS_AXIS2_QINT(src0_type_name, read0_type) \
__kernel void moments_axis2_##src0_type_name##toF16( \
@@ -95,6 +97,50 @@ __kernel void moments_axis2_F16toF16(
_viv_asm(CONV, tmpVari, vari);
VXC_DP2x8(tmpVal, tmpMean, tmpVari, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);
_viv_asm(COPY, dst, tmpVal, 16);
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
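// BF16 moments along axis 2: four x positions are accumulated over the
// channel dimension, then mean and variance are packed back to BF16.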
__kernel void moments_axis2_BF16toBF16(
image2d_array_t input,
image2d_t output_mean,
image2d_t output_vari,
int axis,
int axis_num)
{
int gidx = get_global_id(0);
int gidy = get_global_id(1);
int4 coord = (int4)(gidx, gidy, 0, 0);
vxc_ushort8 src0;
vxc_ushort8 val;
vxc_float4 sum = (vxc_float4)(0);
vxc_float4 sqr = (vxc_float4)(0);
vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);
for(coord.z = 0; coord.z < channel; coord.z++)
{
VXC_ReadImage2DArray(val, input, coord, VXC_5BITOFFSET_XY(0, 0),
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
float4 vec0;
VXC_DP2x8(src0, val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),
uniConvBF16toF32_Part0_2x8);
_viv_asm(COPY, vec0, src0, 16);
sum += vec0;
sqr += (vec0 * vec0);
}
vxc_float4 mean = sum * dimRatio;
vxc_float4 vari = sqr * dimRatio;
vari = vari - mean * mean;
int2 coord_out = (int2)(gidx, gidy);
vxc_short8 tmpdst0, tmpdst1, dst;
_viv_asm(COPY, tmpdst0, mean, 16);
_viv_asm(COPY, tmpdst1, vari, 16);
VXC_DP2x8(dst, tmpdst0, tmpdst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);
VXC_WriteImage(output_mean, coord_out, dst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));
}
